From d0b3af80c723a0407e6dad8e6b2070829aef8f07 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 18 May 2016 18:03:23 +0100 Subject: [PATCH 001/314] Limit html5lib version before sanitizer API changes get released. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ddd95719..1c716ff4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ ordereddict six -html5lib>=0.999 +html5lib>=0.999,<0.99999999 # Requirements to run the test suite: nose From 6ef04917d14b47005ee0ebdae68fec3c144d4577 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 May 2016 09:26:48 -0400 Subject: [PATCH 002/314] Update CHANGES re: html5lib limitation --- CHANGES | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGES b/CHANGES index 4588b80a..8784e83e 100644 --- a/CHANGES +++ b/CHANGES @@ -17,6 +17,15 @@ Version 1.5? (in progress) allowed protocols. Thank you, Andreas Malecki! #149 +Version 1.4.3 (May 23rd, 2016) +------------------------------ + +**Changes** + +- Limit to html5lib >=0.999<0.99999999 because of impending change to + sanitizer api. 
#195 + + Version 1.4.2 (September 11, 2015) ---------------------------------- From 532463ef194be11f2b73af78a65b0016b68000c1 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 May 2016 09:27:52 -0400 Subject: [PATCH 003/314] Add html5lib restriction to setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6562aa97..8c37dcf1 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ install_requires = [ 'six', - 'html5lib>=0.999', + 'html5lib>=0.999,<0.99999999', ] try: From b652ef4f72f888fab5ee2416293c0f8c18cbfacd Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 May 2016 09:28:56 -0400 Subject: [PATCH 004/314] Update for 1.4.3 release --- CONTRIBUTORS | 1 + bleach/__init__.py | 2 +- docs/conf.py | 2 +- setup.py | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 3eb6c7f8..d93749c5 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -24,6 +24,7 @@ Contributors: - Chris Beaven - Erik Rose - Gaurav Dadhania +- Geoffrey Sneddon - Jaime Irurzun - Jeff Balogh - Lee, Cheon-il diff --git a/bleach/__init__.py b/bleach/__init__.py index aec2d340..0a574a3e 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -13,7 +13,7 @@ from .sanitizer import BleachSanitizer -VERSION = (1, 4, 2) +VERSION = (1, 4, 3) __version__ = '.'.join([str(n) for n in VERSION]) __all__ = ['clean', 'linkify'] diff --git a/docs/conf.py b/docs/conf.py index c1e953fd..3ea7bd10 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,7 @@ # The short X.Y version. version = '1.4' # The full version, including alpha/beta/rc tags. -release = '1.4.1' +release = '1.4.3' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/setup.py b/setup.py index 8c37dcf1..9283a803 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def get_long_desc(): setup( name='bleach', - version='1.4.2', + version='1.4.3', description='An easy whitelist-based HTML-sanitizing tool.', long_description=get_long_desc(), maintainer='Jannis Leidel, Will Kahn-Greene', From e24095c8f3c2f00fea7dda9c43d5dd6ee200bf2f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 May 2016 09:40:21 -0400 Subject: [PATCH 005/314] Add testing for python 3.5 --- .travis.yml | 1 + MANIFEST.in | 1 + requirements.txt | 3 +++ setup.py | 1 + tox.ini | 2 +- 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 193f70a0..a0e93b16 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ python: - "3.2" - "3.3" - "3.4" + - "3.5" - "pypy" install: - "pip install -r requirements.txt" diff --git a/MANIFEST.in b/MANIFEST.in index 9d5d250d..870f669c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ +include CHANGES include LICENSE include README.rst diff --git a/requirements.txt b/requirements.txt index 1c716ff4..b6d538e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,6 @@ tox # Requirements for building docs Sphinx + +# Requirements for updating package +twine diff --git a/setup.py b/setup.py index 9283a803..da408f0a 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ def get_long_desc(): 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', 'Topic :: Software Development :: Libraries :: Python Modules', ] ) diff --git a/tox.ini b/tox.ini index 5d4fe518..704c3b51 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py32, py33, py34, pypy +envlist = py26, py27, py32, py33, py34, py35, pypy [testenv] commands = nosetests {posargs:-v} From 55d5ffbeb72cab230af17ea51e2577285ae59037 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 May 2016 09:40:46 -0400 Subject: [PATCH 006/314] Add dev docs This walks through the release process which will make it easier to remember how to do a release in the future. --- docs/dev.rst | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 67 insertions(+) create mode 100644 docs/dev.rst diff --git a/docs/dev.rst b/docs/dev.rst new file mode 100644 index 00000000..7338953a --- /dev/null +++ b/docs/dev.rst @@ -0,0 +1,66 @@ +================== +Bleach development +================== + +Docs +==== + +Docs are in ``docs/``. We use Sphinx. Docs are pushed to readthedocs +via a GitHub webhook. + + +Testing +======= + +Run:: + + $ tox + +That'll run bleach tests in all the supported Python environments. Note +that you need the necessary Python binaries for them all to be tested. + +Tests are run in Travis CI via a GitHub webhook. + + +Release process +=============== + +1. Checkout master tip. + +2. Check to make sure ``setup.py`` and ``requirements.txt``. + +3. Update version numbers in: + + * ``setup.py`` + * ``bleach/__init__.py`` + * ``docs/confg.py`` + + Set the version to something like ``0.4``. + +4. Update ``CONTRIBUTORS``, ``CHANGES`` and ``MANIFEST.in``. + +5. Verify correctness. + + 1. Run tests with tox + 2. Build the docs + 3. Verify everything works + +6. Push everything to GitHub. This will cause Travis to run the tests. + +7. After Travis is happy, tag the release:: + + $ git tag -a v0.4 + + Copy the details from ``CHANGES`` into the tag comment. + +8. Push the new tag:: + + $ git push --tags official master + +9. Update PyPI:: + + $ rm -rf dist + $ python setup.py sdist bdist_wheel + $ twine upload sdist/* + +10. 
Blog posts, twitter, update topic in ``#bleach``, etc. diff --git a/docs/index.rst b/docs/index.rst index 1d8c94b9..217dc159 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,6 +10,7 @@ Contents clean linkify goals + dev changes From 26aac508617a2ad298479f2cd4cffe1b788d92f0 Mon Sep 17 00:00:00 2001 From: Tim Dumol Date: Fri, 27 May 2016 20:45:54 +0800 Subject: [PATCH 007/314] Unify version information to rely on a single source. --- bleach/__init__.py | 5 +---- bleach/version.py | 6 ++++++ docs/conf.py | 6 ++++-- setup.py | 21 ++++++++++++++++++++- 4 files changed, 31 insertions(+), 7 deletions(-) create mode 100644 bleach/version.py diff --git a/bleach/__init__.py b/bleach/__init__.py index 0a574a3e..3092cb7f 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -11,10 +11,7 @@ from . import callbacks as linkify_callbacks from .encoding import force_unicode from .sanitizer import BleachSanitizer - - -VERSION = (1, 4, 3) -__version__ = '.'.join([str(n) for n in VERSION]) +from .version import __version__, VERSION # flake8: noqa __all__ = ['clean', 'linkify'] diff --git a/bleach/version.py b/bleach/version.py new file mode 100644 index 00000000..134e4857 --- /dev/null +++ b/bleach/version.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +VERSION = (1, 4, 3) +__version__ = '.'.join([str(n) for n in VERSION]) diff --git a/docs/conf.py b/docs/conf.py index 3ea7bd10..88fe431c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,6 +13,8 @@ import sys, os +from bleach import __version__, VERSION + # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. @@ -49,9 +51,9 @@ # built documents. # # The short X.Y version. -version = '1.4' +version = '.'.join([str(n) for n in VERSION[:2]]) # The full version, including alpha/beta/rc tags. 
-release = '1.4.3' +release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index da408f0a..d84e407f 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,7 @@ +import re + from setuptools import setup, find_packages +from distutils.util import convert_path install_requires = [ 'six', @@ -20,10 +23,26 @@ def get_long_desc(): desc += open('CHANGES').read() return desc +# foolproof way of single-sourcing version as per +# http://stackoverflow.com/a/24517154/112943 +# Here we use re.search instead of exec to avoid any +# possibility of side effects in version.py +version_path = convert_path('bleach/version.py') +with open(version_path) as version_file: + for line in version_file: + if line.startswith('VERSION = '): + match = re.search(r'[(](\d+), (\d+), (\d+)[)]$', line) + __version__ = '{0!s}.{1!s}.{2!s}'.format( + match.group(1), + match.group(2), + match.group(3) + ) + break + setup( name='bleach', - version='1.4.3', + version=__version__, description='An easy whitelist-based HTML-sanitizing tool.', long_description=get_long_desc(), maintainer='Jannis Leidel, Will Kahn-Greene', From b6da6ca9719dbb6aba0180686a593f5cc6a4c5af Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 14 Jun 2016 15:26:44 +0100 Subject: [PATCH 008/314] Move version acquiring to a function --- setup.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index d84e407f..39d9696e 100644 --- a/setup.py +++ b/setup.py @@ -23,26 +23,23 @@ def get_long_desc(): desc += open('CHANGES').read() return desc -# foolproof way of single-sourcing version as per -# http://stackoverflow.com/a/24517154/112943 -# Here we use re.search instead of exec to avoid any -# possibility of side effects in version.py -version_path = convert_path('bleach/version.py') -with open(version_path) as version_file: - for line in version_file: - if 
line.startswith('VERSION = '): - match = re.search(r'[(](\d+), (\d+), (\d+)[)]$', line) - __version__ = '{0!s}.{1!s}.{2!s}'.format( - match.group(1), - match.group(2), - match.group(3) - ) - break + +def get_version(): + version_path = convert_path('bleach/version.py') + with open(version_path) as version_file: + for line in version_file: + if line.startswith('VERSION = '): + match = re.search(r'[(](\d+), (\d+), (\d+)[)]$', line) + return '{0!s}.{1!s}.{2!s}'.format( + match.group(1), + match.group(2), + match.group(3) + ) setup( name='bleach', - version=__version__, + version=get_version(), description='An easy whitelist-based HTML-sanitizing tool.', long_description=get_long_desc(), maintainer='Jannis Leidel, Will Kahn-Greene', From 2235b8fcadc8abef3a2845bb0ce67206982f3489 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 14 Jun 2016 15:26:54 +0100 Subject: [PATCH 009/314] Add Tim to CONTRIBUTORS list --- CONTRIBUTORS | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index d93749c5..a587e807 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -39,6 +39,7 @@ Contributors: - Ricky Rosario - Ryan Niemeyer - Sébastien Fievet +- Tim Dumol - Timothy Fitz - Vitaly Volkov - mdxs From 592972bd12962bed9ca1f5af551199828a678be1 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 14 Jun 2016 16:10:51 +0100 Subject: [PATCH 010/314] Fix linkify edge case with ending ) It's common to put urls in parentheses. Sometimes the url_re will pick up end parentheses that are part of the parenthetical, but shouldn't be part of the url. This fix alleviates that somewhat by checking to see if the url has a ( and if not, stripping any ) from the end. This assumes that urls won't end in ) without also having a ( in them. This is based on a fix from Istvan Albert. 
Fixes #190 --- bleach/__init__.py | 8 ++++++++ bleach/tests/test_links.py | 15 ++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 3092cb7f..c55bc6b7 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -344,6 +344,14 @@ def link_repl(match): if url.startswith('('): _wrapping = strip_wrapping_parentheses(url) url, open_brackets, close_brackets = _wrapping + if url.endswith(')') and '(' not in url: + # This is a clumsy handling for the case where we have something + # like (foo http://example.com) and the ) gets picked up by the + # url_re but we don't want it part of the link. + new_url = url.rstrip(')') + close_brackets += len(url) - len(new_url) + url = new_url + end = '' m = re.search(punct_re, url) if m: diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py index 62da8d19..97226c1c 100644 --- a/bleach/tests/test_links.py +++ b/bleach/tests/test_links.py @@ -360,11 +360,16 @@ def test_wrapping_parentheses(): tests = ( ('(example.com)', ('(', 'example.com', 'example.com', ')')), ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')), - ('(example.com/foo)', ('(', 'example.com/foo', - 'example.com/foo', ')')), - ('(((example.com/))))', ('(((', 'example.com/)', - 'example.com/)', ')))')), - ('example.com/))', ('', 'example.com/))', 'example.com/))', '')), + ('(example.com/foo)', + ('(', 'example.com/foo', 'example.com/foo', ')')), + ('(((example.com/))))', + ('(((', 'example.com/', 'example.com/', '))))')), + ('example.com/))', + ('', 'example.com/', 'example.com/', '))')), + ('(foo http://example.com/)', + ('(foo ', 'example.com/', 'http://example.com/', ')')), + ('(foo http://example.com)', + ('(foo ', 'example.com', 'http://example.com', ')')), ('http://en.wikipedia.org/wiki/Test_(assessment)', ('', 'en.wikipedia.org/wiki/Test_(assessment)', 'http://en.wikipedia.org/wiki/Test_(assessment)', '')), From f635db1718db681afa712da98e0ca76e3a70d298 Mon Sep 17 00:00:00 
2001 From: Dan Gayle Date: Thu, 3 Dec 2015 15:55:32 -0800 Subject: [PATCH 011/314] Fix logging when there's no handler Added NullHandler to logging, to prevent "No handlers could be found for logger "bleach"" warnings in applications that haven't set up logging properly. Fixes #182 --- bleach/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 3092cb7f..5c53d3a4 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -2,6 +2,12 @@ from __future__ import unicode_literals import logging +try: # Python 2.7+ + from logging import NullHandler +except ImportError: + class NullHandler(logging.Handler): + def emit(self, record): + pass import re import html5lib @@ -15,7 +21,7 @@ __all__ = ['clean', 'linkify'] -log = logging.getLogger('bleach') +log = logging.getLogger(__name__).addHandler(NullHandler()) ALLOWED_TAGS = [ 'a', From 86af2305f2f4e04ca9f62a00cae7cfba782bc20e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 14 Jun 2016 16:34:25 +0100 Subject: [PATCH 012/314] Fix log so it's not None Fixes #182 --- bleach/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 5c53d3a4..53217bac 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -21,7 +21,8 @@ def emit(self, record): __all__ = ['clean', 'linkify'] -log = logging.getLogger(__name__).addHandler(NullHandler()) +log = logging.getLogger(__name__) +log.addHandler(NullHandler()) ALLOWED_TAGS = [ 'a', From 9bd8f721f2eabc5a014f33e7053e03bbba98c736 Mon Sep 17 00:00:00 2001 From: Istvan Albert Date: Tue, 14 Jun 2016 17:10:39 +0100 Subject: [PATCH 013/314] Children of
 tags should not be linkified when
 skip_pre=True

Fixes #150
---
 bleach/__init__.py         | 2 +-
 bleach/tests/test_links.py | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/bleach/__init__.py b/bleach/__init__.py
index 3092cb7f..ac163d12 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -315,7 +315,7 @@ def linkify_nodes(tree, parse_text=True):
                 if node.tag == ETREE_TAG('pre') and skip_pre:
                     linkify_nodes(node, False)
                 elif not (node in _seen):
-                    linkify_nodes(node, True)
+                    linkify_nodes(node, parse_text)
 
             current_child += 1
 
diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py
index 62da8d19..2958f5e6 100644
--- a/bleach/tests/test_links.py
+++ b/bleach/tests/test_links.py
@@ -314,6 +314,13 @@ def test_skip_pre():
     eq_(nofollowed, linkify(already_linked))
     eq_(nofollowed, linkify(already_linked, skip_pre=True))
 
+    eq_(
+        linkify('
http://example.com
http://example.com', + skip_pre=True), + ('
http://example.com
' + 'http://example.com') + ) + def test_libgl(): """libgl.so.1 should not be linkified.""" From c28b9e37ed659a588e49bf7bf1881ec4e6d7bc25 Mon Sep 17 00:00:00 2001 From: Jannis Leidel Date: Tue, 21 Jun 2016 16:48:21 +0200 Subject: [PATCH 014/314] Auto-release to PyPI on tag. Fix #209. This also makes wheel files be universal (Python 2 & 3). --- .travis.yml | 35 ++++++++++++++++++++++++----------- setup.cfg | 3 +++ 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index a0e93b16..9a1f0b43 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,28 @@ sudo: false language: python +cache: + directories: + - "~/.cache/pip" python: - - "2.6" - - "2.7" - - "3.2" - - "3.3" - - "3.4" - - "3.5" - - "pypy" -install: - - "pip install -r requirements.txt" +- "2.6" +- "2.7" +- "3.2" +- "3.3" +- "3.4" +- "3.5" +- "pypy" +install: +- pip install -r requirements.txt script: - - nosetests - - flake8 bleach/ +- nosetests +- flake8 bleach/ +deploy: + provider: pypi + user: jezdez + distributions: sdist bdist_wheel + password: + secure: TTLpnNBAmRBPe4qITwtM6MRXw3CvGpflnkG6V97oKYL1RJhDXmxIxxImkGyVoT2IR4Oy/jqEikWUCCC3aDoqDnIkkDVriTPmo5PGnS2WgvEmYdcaTIp+RXdKwKhpCVX8ITEuye0iCXYu28vDaySGjnxjlYAP4S0PGPUzh/tn4DY= + on: + tags: true + repo: mozilla/bleach + python: "2.7" diff --git a/setup.cfg b/setup.cfg index 81cd366c..38f6166d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,5 @@ [flake8] ignore = E731,W503 + +[wheel] +universal=1 From 7aebc95eb95224f9303762adfbbce70689ad4b81 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 16 Jun 2016 12:08:55 +0100 Subject: [PATCH 015/314] Change requirements to use py.test --- requirements.txt | 22 ++++++++++++---------- setup.py | 4 ---- tox.ini | 2 +- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/requirements.txt b/requirements.txt index b6d538e3..a45f7810 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,16 @@ -ordereddict -six -html5lib>=0.999,<0.99999999 +-e . 
+ordereddict==1.1 +six==1.10.0 +html5lib>=0.999,<0.99999999 + # Requirements to run the test suite: -nose -flake8 -tox - +pytest==2.9.2 +flake8==2.6.0 +tox==2.3.1 + # Requirements for building docs -Sphinx - +Sphinx==1.4.4 + # Requirements for updating package -twine +twine==1.6.5 diff --git a/setup.py b/setup.py index 39d9696e..26686efd 100644 --- a/setup.py +++ b/setup.py @@ -50,10 +50,6 @@ def get_version(): package_data={'': ['README.rst']}, zip_safe=False, install_requires=install_requires, - tests_require=[ - 'nose>=1.3', - ], - test_suite='nose.collector', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Web Environment', diff --git a/tox.ini b/tox.ini index 704c3b51..da138989 100644 --- a/tox.ini +++ b/tox.ini @@ -7,7 +7,7 @@ envlist = py26, py27, py32, py33, py34, py35, pypy [testenv] -commands = nosetests {posargs:-v} +commands = py.test {posargs:-v} deps = six html5lib==0.999 From 01f8cf62d9e70948d412d9dc48b0216d2edec216 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 16 Jun 2016 12:10:25 +0100 Subject: [PATCH 016/314] Move tests out of the code root --- {bleach/tests => tests}/__init__.py | 0 {bleach/tests => tests}/test_basics.py | 0 {bleach/tests => tests}/test_css.py | 0 {bleach/tests => tests}/test_links.py | 0 {bleach/tests => tests}/test_security.py | 0 {bleach/tests => tests}/test_unicode.py | 0 {bleach/tests => tests}/tools.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename {bleach/tests => tests}/__init__.py (100%) rename {bleach/tests => tests}/test_basics.py (100%) rename {bleach/tests => tests}/test_css.py (100%) rename {bleach/tests => tests}/test_links.py (100%) rename {bleach/tests => tests}/test_security.py (100%) rename {bleach/tests => tests}/test_unicode.py (100%) rename {bleach/tests => tests}/tools.py (100%) diff --git a/bleach/tests/__init__.py b/tests/__init__.py similarity index 100% rename from bleach/tests/__init__.py rename to tests/__init__.py diff --git 
a/bleach/tests/test_basics.py b/tests/test_basics.py similarity index 100% rename from bleach/tests/test_basics.py rename to tests/test_basics.py diff --git a/bleach/tests/test_css.py b/tests/test_css.py similarity index 100% rename from bleach/tests/test_css.py rename to tests/test_css.py diff --git a/bleach/tests/test_links.py b/tests/test_links.py similarity index 100% rename from bleach/tests/test_links.py rename to tests/test_links.py diff --git a/bleach/tests/test_security.py b/tests/test_security.py similarity index 100% rename from bleach/tests/test_security.py rename to tests/test_security.py diff --git a/bleach/tests/test_unicode.py b/tests/test_unicode.py similarity index 100% rename from bleach/tests/test_unicode.py rename to tests/test_unicode.py diff --git a/bleach/tests/tools.py b/tests/tools.py similarity index 100% rename from bleach/tests/tools.py rename to tests/tools.py From 04f49ff1f93ec6344479a226df5b02cf5bf5a715 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 11:03:32 -0400 Subject: [PATCH 017/314] Update dev requirements to latest --- requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index a45f7810..79aa6e61 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,12 +5,12 @@ six==1.10.0 html5lib>=0.999,<0.99999999 # Requirements to run the test suite: -pytest==2.9.2 -flake8==2.6.0 -tox==2.3.1 +pytest==3.0.3 +flake8==3.0.4 +tox==2.4.1 # Requirements for building docs -Sphinx==1.4.4 +Sphinx==1.4.8 # Requirements for updating package -twine==1.6.5 +twine==1.8.1 From 6c070a8a132f9de3b57d9293fa0aebdfa5ce3af7 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 11:10:34 -0400 Subject: [PATCH 018/314] Rewrite test_basics.py to work with py.test --- tests/__init__.py | 0 tests/test_basics.py | 165 +++++++++++++++++++++++++++---------------- 2 files changed, 103 insertions(+), 62 deletions(-) delete mode 100644 tests/__init__.py 
diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_basics.py b/tests/test_basics.py index 18fc2a6c..6fa6c22e 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -1,13 +1,11 @@ import six import html5lib -from nose.tools import eq_ import bleach -from bleach.tests.tools import in_ def test_empty(): - eq_('', bleach.clean('')) + assert bleach.clean('') == '' def test_nbsp(): @@ -16,48 +14,58 @@ def test_nbsp(): else: expected = six.u('\\xa0test string\\xa0') - eq_(expected, bleach.clean(' test string ')) + assert bleach.clean(' test string ') == expected def test_comments_only(): comment = '' open_comment = ''.format(open_comment), bleach.clean(open_comment, - strip_comments=False)) + assert bleach.clean(comment) == '' + assert bleach.clean(open_comment) == '' + assert bleach.clean(comment, strip_comments=False) == comment + assert ( + bleach.clean(open_comment, strip_comments=False) == + '{0!s}-->'.format(open_comment) + ) def test_with_comments(): html = 'Just text' - eq_('Just text', bleach.clean(html)) - eq_(html, bleach.clean(html, strip_comments=False)) + assert 'Just text', bleach.clean(html) == 'Just text' + assert bleach.clean(html, strip_comments=False) == html def test_no_html(): - eq_('no html string', bleach.clean('no html string')) + assert bleach.clean('no html string') == 'no html string' def test_allowed_html(): - eq_('an allowed tag', - bleach.clean('an allowed tag')) - eq_('another good tag', - bleach.clean('another good tag')) + assert ( + bleach.clean('an allowed tag') == + 'an allowed tag' + ) + assert ( + bleach.clean('another good tag') == + 'another good tag' + ) def test_bad_html(): - eq_('a fixed tag', - bleach.clean('a fixed tag')) + assert ( + bleach.clean('a fixed tag') == + 'a fixed tag' + ) def test_function_arguments(): TAGS = ['span', 'br'] ATTRS = {'span': ['style']} - eq_('a
test', + assert ( bleach.clean('a
test', - tags=TAGS, attributes=ATTRS)) + tags=TAGS, attributes=ATTRS) == + 'a
test' + ) def test_named_arguments(): @@ -65,73 +73,104 @@ def test_named_arguments(): s = ('xx.com', 'xx.com') - eq_('xx.com', bleach.clean(s[0])) - in_(s, bleach.clean(s[0], attributes=ATTRS)) + assert bleach.clean(s[0]) == 'xx.com' + # FIXME: This might not be needed if attribute order is stable now. + assert bleach.clean(s[0], attributes=ATTRS) in s def test_disallowed_html(): - eq_('a <script>safe()</script> test', - bleach.clean('a test')) - eq_('a <style>body{}</style> test', - bleach.clean('a test')) + assert ( + bleach.clean('a test') == + 'a <script>safe()</script> test' + ) + assert ( + bleach.clean('a test') == + 'a <style>body{}</style> test' + ) def test_bad_href(): - eq_('no link', - bleach.clean('no link')) + assert ( + bleach.clean('no link') == + 'no link' + ) def test_bare_entities(): - eq_('an & entity', bleach.clean('an & entity')) - eq_('an < entity', bleach.clean('an < entity')) - eq_('tag < and entity', - bleach.clean('tag < and entity')) - eq_('&', bleach.clean('&')) + assert ( + bleach.clean('an & entity') == + 'an & entity' + ) + assert ( + bleach.clean('an < entity') == + 'an < entity' + ) + + assert ( + bleach.clean('tag < and entity') == + 'tag < and entity' + ) + + assert ( + bleach.clean('&') == + '&' + ) def test_escaped_entities(): s = '<em>strong</em>' - eq_(s, bleach.clean(s)) + assert bleach.clean(s) == s def test_serializer(): s = '
' - eq_(s, bleach.clean(s, tags=['table'])) - eq_('test
', bleach.linkify('test
')) - eq_('

test

', bleach.clean('

test

', tags=['p'])) + assert bleach.clean(s, tags=['table']) == s + assert bleach.linkify('test
') == 'test
' + assert bleach.clean('

test

', tags=['p']) == '

test

' def test_no_href_links(): s = 'x' - eq_(s, bleach.linkify(s)) + assert bleach.linkify(s) == s def test_weird_strings(): s = 'with
html tags', - bleach.clean('a test with html tags', strip=True)) - eq_('a test with html tags', - bleach.clean('a test with ' - 'html tags', strip=True)) + assert ( + bleach.clean('a test with html tags', strip=True) == + 'a test with html tags' + ) + assert ( + bleach.clean('a test with html tags', strip=True) == + 'a test with html tags' + ) s = '

link text

' - eq_('

link text

', bleach.clean(s, tags=['p'], strip=True)) + assert ( + bleach.clean(s, tags=['p'], strip=True) == + '

link text

' + ) s = '

multiply nested text

' - eq_('

multiply nested text

', bleach.clean(s, tags=['p'], strip=True)) + assert ( + bleach.clean(s, tags=['p'], strip=True) == + '

multiply nested text

' + ) s = ('

' '

') - eq_('

', - bleach.clean(s, tags=['p', 'a'], strip=True)) + assert ( + bleach.clean(s, tags=['p', 'a'], strip=True) == + '

' + ) def test_allowed_styles(): @@ -139,10 +178,12 @@ def test_allowed_styles(): STYLE = ['color'] blank = '' s = '' - eq_(blank, bleach.clean('', attributes=ATTR)) - eq_(s, bleach.clean(s, attributes=ATTR, styles=STYLE)) - eq_(s, bleach.clean('', - attributes=ATTR, styles=STYLE)) + assert bleach.clean('', attributes=ATTR) == blank + assert bleach.clean(s, attributes=ATTR, styles=STYLE) == s + assert ( + bleach.clean('', attributes=ATTR, styles=STYLE) == + s + ) def test_idempotent(): @@ -150,10 +191,10 @@ def test_idempotent(): dirty = 'invalid & < extra http://link.com' clean = bleach.clean(dirty) - eq_(clean, bleach.clean(clean)) + assert bleach.clean(clean) == clean linked = bleach.linkify(dirty) - eq_(linked, bleach.linkify(linked)) + assert bleach.linkify(linked) == linked def test_rel_already_there(): @@ -165,15 +206,15 @@ def test_rel_already_there(): ('Click ' 'here.')) - in_(link_good, bleach.linkify(linked)) - in_(link_good, bleach.linkify(link_good[0])) + assert bleach.linkify(linked) in link_good + assert bleach.linkify(link_good[0]) in link_good def test_lowercase_html(): """We should output lowercase HTML.""" dirty = 'BAR' clean = 'BAR' - eq_(clean, bleach.clean(dirty, attributes=['class'])) + assert bleach.clean(dirty, attributes=['class']) == clean def test_wildcard_attributes(): @@ -186,22 +227,22 @@ def test_wildcard_attributes(): '') clean = ('both can have ', 'both can have ') - in_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR)) + assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean def test_sarcasm(): """Jokes should crash.""" dirty = 'Yeah right ' clean = 'Yeah right <sarcasm/>' - eq_(clean, bleach.clean(dirty)) + assert bleach.clean(dirty) == clean def test_user_defined_protocols_valid(): valid_href = 'allowed href' - eq_(valid_href, bleach.clean(valid_href, protocols=['my_protocol'])) + assert bleach.clean(valid_href, protocols=['my_protocol']) == valid_href def test_user_defined_protocols_invalid(): invalid_href = 
'invalid href' cleaned_href = 'invalid href' - eq_(cleaned_href, bleach.clean(invalid_href, protocols=['my_protocol'])) + assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href From 30637ed15a7912af215c1b4ac141b1e41f5ef2f7 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 11:18:04 -0400 Subject: [PATCH 019/314] Rewrite test_css.py to work with py.test --- tests/test_css.py | 107 +++++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 35 deletions(-) diff --git a/tests/test_css.py b/tests/test_css.py index b40596ff..0b92f40b 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -1,6 +1,6 @@ from functools import partial -from nose.tools import eq_ +import pytest from bleach import clean @@ -8,47 +8,85 @@ clean = partial(clean, tags=['p'], attributes=['style']) -def test_allowed_css(): - tests = ( - ('font-family: Arial; color: red; float: left; ' - 'background-color: red;', 'color: red;', ['color']), - ('border: 1px solid blue; color: red; float: left;', 'color: red;', - ['color']), - ('border: 1px solid blue; color: red; float: left;', - 'color: red; float: left;', ['color', 'float']), - ('color: red; float: left; padding: 1em;', 'color: red; float: left;', - ['color', 'float']), - ('color: red; float: left; padding: 1em;', 'color: red;', ['color']), - ('cursor: -moz-grab;', 'cursor: -moz-grab;', ['cursor']), - ('color: hsl(30,100%,50%);', 'color: hsl(30,100%,50%);', ['color']), - ('color: rgba(255,0,0,0.4);', 'color: rgba(255,0,0,0.4);', ['color']), - ("text-overflow: ',' ellipsis;", "text-overflow: ',' ellipsis;", - ['text-overflow']), - ('text-overflow: "," ellipsis;', 'text-overflow: "," ellipsis;', - ['text-overflow']), - ('font-family: "Arial";', 'font-family: "Arial";', ['font-family']), - ) +@pytest.mark.parametrize('data,styles,expected', [ + ( + 'font-family: Arial; color: red; float: left; background-color: red;', + ['color'], + 'color: red;' + ), + ( + 'border: 1px solid blue; 
color: red; float: left;', + ['color'], + 'color: red;' + ), + ( + 'border: 1px solid blue; color: red; float: left;', + ['color', 'float'], + 'color: red; float: left;' + ), + ( + 'color: red; float: left; padding: 1em;', + ['color', 'float'], + 'color: red; float: left;' + ), + ( + 'color: red; float: left; padding: 1em;', + ['color'], + 'color: red;' + ), + ( + 'cursor: -moz-grab;', + ['cursor'], + 'cursor: -moz-grab;' + ), + ( + 'color: hsl(30,100%,50%);', + ['color'], + 'color: hsl(30,100%,50%);' + ), + ( + 'color: rgba(255,0,0,0.4);', + ['color'], + 'color: rgba(255,0,0,0.4);' + ), + ( + "text-overflow: ',' ellipsis;", + ['text-overflow'], + "text-overflow: ',' ellipsis;" + ), + ( + 'text-overflow: "," ellipsis;', + ['text-overflow'], + 'text-overflow: "," ellipsis;' + ), + ( + 'font-family: "Arial";', + ['font-family'], + 'font-family: "Arial";' + ), +]) +def test_allowed_css(data, styles, expected): p_single = '

bar

' p_double = "

bar

" - def check(i, o, s): - if '"' in i: - eq_(p_double.format(o), clean(p_double.format(i), styles=s)) - else: - eq_(p_single.format(o), clean(p_single.format(i), styles=s)) - - for i, o, s in tests: - yield check, i, o, s + if '"' in data: + assert clean(p_double.format(data), styles=styles) == p_double.format(expected) + else: + assert clean(p_single.format(data), styles=styles) == p_single.format(expected) def test_valid_css(): """The sanitizer should fix missing CSS values.""" styles = ['color', 'float'] - eq_('

foo

', - clean('

foo

', styles=styles)) - eq_('

foo

', - clean('

foo

', styles=styles)) + assert ( + clean('

foo

', styles=styles) == + '

foo

' + ) + assert ( + clean('

foo

', styles=styles) == + '

foo

' + ) def test_style_hang(): @@ -90,5 +128,4 @@ def test_style_hang(): """100%/normal 'Courier New', 'Andale Mono', monospace;">""" """Hello world

""") - result = clean(html, styles=styles) - eq_(expected, result) + assert clean(html, styles=styles) == expected From dbfb5401922c6b252f0ccb8cb3279d11ea51809b Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 13:19:13 -0400 Subject: [PATCH 020/314] Rewrite test_links.py to work with py.test --- tests/test_links.py | 634 ++++++++++++++++++++++++-------------------- 1 file changed, 353 insertions(+), 281 deletions(-) diff --git a/tests/test_links.py b/tests/test_links.py index 20d50ac8..15e40018 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -4,43 +4,49 @@ from urllib import quote_plus from html5lib.tokenizer import HTMLTokenizer -from nose.tools import eq_ +import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC def test_url_re(): - def no_match(s): - match = url_re.search(s) - if match: - assert not match, 'matched {0!s}'.format(s[slice(*match.span())]) - yield no_match, 'just what i am looking for...it' + text = 'just what i am looking for...it' + match = url_re.search(text) + assert not match, 'matched {0!s}'.format(text[slice(*match.span())]) def test_empty(): - eq_('', linkify('')) + assert linkify('') == '' def test_simple_link(): - eq_('a http://example.com' - ' link', - linkify('a http://example.com link')) - eq_('a https://example.com' - ' link', - linkify('a https://example.com link')) - eq_('a example.com link', - linkify('a example.com link')) + assert ( + linkify('a http://example.com link') == + 'a http://example.com link' + ) + assert ( + linkify('a https://example.com link') == + 'a https://example.com link' + ) + assert ( + linkify('a example.com link') == + 'a example.com link' + ) def test_trailing_slash(): - eq_('http://examp.com/', - linkify('http://examp.com/')) - eq_('' - 'http://example.com/foo/', - linkify('http://example.com/foo/')) - eq_('' - 'http://example.com/foo/bar/', - linkify('http://example.com/foo/bar/')) + assert ( + linkify('http://examp.com/') == + 'http://examp.com/' + ) 
+ assert ( + linkify('http://example.com/foo/') == + 'http://example.com/foo/' + ) + assert ( + linkify('http://example.com/foo/bar/') == + 'http://example.com/foo/bar/' + ) def test_mangle_link(): @@ -50,9 +56,10 @@ def filter_url(attrs, new=False): attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted) return attrs - eq_('' - 'http://example.com', - linkify('http://example.com', DC + [filter_url])) + assert ( + linkify('http://example.com', DC + [filter_url]) == + 'http://example.com' + ) def test_mangle_text(): @@ -62,92 +69,114 @@ def ft(attrs, new=False): attrs['_text'] = 'bar' return attrs - eq_('bar bar', - linkify('http://ex.mp foo', [ft])) - - -def test_email_link(): - tests = ( - ('a james@example.com mailto', False, 'a james@example.com mailto'), - ('a james@example.com.au mailto', False, - 'a james@example.com.au mailto'), - ('a james@example.com mailto', - True, 'a james@example.com mailto'), - ('aussie ' - 'james@example.com.au mailto', True, - 'aussie james@example.com.au mailto'), - # This is kind of a pathological case. I guess we do our best here. - ('email to ' - 'james@example.com', - True, - 'email to james@example.com'), - ('
' - 'jinkyun@example.com', - True, - '
jinkyun@example.com'), + assert ( + linkify('http://ex.mp foo', [ft]) == + 'bar bar' ) - def _check(o, p, i): - eq_(o, linkify(i, parse_email=p)) - for (o, p, i) in tests: - yield _check, o, p, i - - -def test_email_link_escaping(): - tests = ( - ('''''' - '''"james"@example.com''', - '"james"@example.com'), - ('''''' - '''"j'ames"@example.com''', - '"j\'ames"@example.com'), - ('''''' - '''"ja>mes"@example.com''', - '"ja>mes"@example.com'), +@pytest.mark.parametrize('data,parse_email,expected', [ + ( + 'a james@example.com mailto', + False, + 'a james@example.com mailto' + ), + ( + 'a james@example.com.au mailto', + False, + 'a james@example.com.au mailto' + ), + ( + 'a james@example.com mailto', + True, + 'a james@example.com mailto' + ), + ( + 'aussie james@example.com.au mailto', + True, + 'aussie james@example.com.au mailto' + ), + # This is kind of a pathological case. I guess we do our best here. + ( + 'email to james@example.com', + True, + 'email to james@example.com' + ), + ( + '
jinkyun@example.com', + True, + '
jinkyun@example.com' ) - - def _check(o, i): - eq_(o, linkify(i, parse_email=True)) - - for (o, i) in tests: - yield _check, o, i - - -def test_prevent_links(): +]) +def test_email_link(data, parse_email, expected): + assert linkify(data, parse_email=parse_email) == expected + + +@pytest.mark.parametrize('data,expected', [ + ( + '"james"@example.com', + '''"james"@example.com''' + ), + ( + '"j\'ames"@example.com', + '''"j'ames"@example.com''' + ), + ( + '"ja>mes"@example.com', + '''"ja>mes"@example.com''' + ), +]) +def test_email_link_escaping(data, expected): + assert linkify(data, parse_email=True) == expected + + +def no_new_links(attrs, new=False): + if new: + return None + return attrs + + +def no_old_links(attrs, new=False): + if not new: + return None + return attrs + + +def noop(attrs, new=False): + return attrs + + +@pytest.mark.parametrize('callback,expected', [ + ( + [noop], + 'a ex.mp example' + ), + ( + [no_new_links, noop], + 'a ex.mp example' + ), + ( + [noop, no_new_links], + 'a ex.mp example' + ), + ( + [no_old_links, noop], + 'a ex.mp example' + ), + ( + [noop, no_old_links], + 'a ex.mp example' + ), + ( + [no_old_links, no_new_links], + 'a ex.mp example' + ) +]) +def test_prevent_links(callback, expected): """Returning None from any callback should remove links or prevent them from being created.""" - - def no_new_links(attrs, new=False): - if new: - return None - return attrs - - def no_old_links(attrs, new=False): - if not new: - return None - return attrs - - def noop(attrs, new=False): - return attrs - - in_text = 'a ex.mp example' - out_text = 'a ex.mp example' - tests = ( - ([noop], ('a ex.mp ' - 'example'), 'noop'), - ([no_new_links, noop], in_text, 'no new, noop'), - ([noop, no_new_links], in_text, 'noop, no new'), - ([no_old_links, noop], out_text, 'no old, noop'), - ([noop, no_old_links], out_text, 'noop, no old'), - ([no_old_links, no_new_links], 'a ex.mp example', 'no links'), - ) - - def _check(cb, o, msg): - eq_(o, linkify(in_text, 
cb), msg) - - for (cb, o, msg) in tests: - yield _check, cb, o, msg + text = 'a ex.mp example' + assert linkify(text, callback) == expected def test_set_attrs(): @@ -157,8 +186,10 @@ def set_attr(attrs, new=False): attrs['rev'] = 'canonical' return attrs - eq_('ex.mp', - linkify('ex.mp', [set_attr])) + assert ( + linkify('ex.mp', [set_attr]) == + 'ex.mp' + ) def test_only_proto_links(): @@ -169,9 +200,10 @@ def only_proto(attrs, new=False): return attrs in_text = 'a ex.mp http://ex.mp bar' - out_text = ('a ex.mp http://ex.mp ' - 'bar') - eq_(out_text, linkify(in_text, [only_proto])) + assert ( + linkify(in_text, [only_proto]) == + 'a ex.mp http://ex.mp bar' + ) def test_stop_email(): @@ -181,121 +213,138 @@ def no_email(attrs, new=False): return None return attrs text = 'do not link james@example.com' - eq_(text, linkify(text, parse_email=True, callbacks=[no_email])) - - -def test_tlds(): - eq_('example.com', - linkify('example.com')) - eq_('example.co', - linkify('example.co')) - eq_('example.co.uk', - linkify('example.co.uk')) - eq_('example.edu', - linkify('example.edu')) - eq_('example.xxx', - linkify('example.xxx')) - eq_('example.yyy', linkify('example.yyy')) - eq_(' brie', linkify(' brie')) - eq_('bit.ly/fun', - linkify('bit.ly/fun')) + + assert linkify(text, parse_email=True, callbacks=[no_email]) == text + + +@pytest.mark.parametrize('data,expected', [ + # tlds + ('example.com', 'example.com'), + ('example.co', 'example.co'), + ('example.co.uk', 'example.co.uk'), + ('example.edu', 'example.edu'), + ('example.xxx', 'example.xxx'), + ('bit.ly/fun', 'bit.ly/fun'), + + # non-tlds + ('example.yyy', 'example.yyy'), + ('brie', 'brie'), +]) +def test_tlds(data, expected): + assert linkify(data) == expected def test_escaping(): - eq_('< unrelated', linkify('< unrelated')) + assert linkify('< unrelated') == '< unrelated' def test_nofollow_off(): - eq_('example.com', - linkify('example.com', [])) + assert linkify('example.com', []) == 'example.com' def 
test_link_in_html(): - eq_('http://yy.com', - linkify('http://yy.com')) - - eq_('http://xx.com' - '', - linkify('http://xx.com')) + assert ( + linkify('http://yy.com') == + 'http://yy.com' + ) + assert ( + linkify('http://xx.com') == + 'http://xx.com' + ) def test_links_https(): - eq_('https://yy.com', - linkify('https://yy.com')) + assert ( + linkify('https://yy.com') == + 'https://yy.com' + ) def test_add_rel_nofollow(): """Verify that rel="nofollow" is added to an existing link""" - eq_('http://yy.com', - linkify('http://yy.com')) + assert ( + linkify('http://yy.com') == + 'http://yy.com' + ) def test_url_with_path(): - eq_('' - 'http://example.com/path/to/file', - linkify('http://example.com/path/to/file')) + assert ( + linkify('http://example.com/path/to/file') == + 'http://example.com/path/to/file' + ) def test_link_ftp(): - eq_('' - 'ftp://ftp.mozilla.org/some/file', - linkify('ftp://ftp.mozilla.org/some/file')) + assert ( + linkify('ftp://ftp.mozilla.org/some/file') == + 'ftp://ftp.mozilla.org/some/file' + ) def test_link_query(): - eq_('' - 'http://xx.com/?test=win', - linkify('http://xx.com/?test=win')) - eq_('' - 'xx.com/?test=win', - linkify('xx.com/?test=win')) - eq_('' - 'xx.com?test=win', - linkify('xx.com?test=win')) + assert ( + linkify('http://xx.com/?test=win') == + 'http://xx.com/?test=win' + ) + assert ( + linkify('xx.com/?test=win') == + 'xx.com/?test=win' + ) + assert ( + linkify('xx.com?test=win') == + 'xx.com?test=win' + ) def test_link_fragment(): - eq_('' - 'http://xx.com/path#frag', - linkify('http://xx.com/path#frag')) + assert ( + linkify('http://xx.com/path#frag') == + 'http://xx.com/path#frag' + ) def test_link_entities(): - eq_('' - 'http://xx.com/?a=1&b=2', - linkify('http://xx.com/?a=1&b=2')) + assert ( + linkify('http://xx.com/?a=1&b=2') == + 'http://xx.com/?a=1&b=2' + ) def test_escaped_html(): """If I pass in escaped HTML, it should probably come out escaped.""" s = '<em>strong</em>' - eq_(s, linkify(s)) + assert linkify(s) == 
s def test_link_http_complete(): - eq_('' - 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f', - linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f')) + assert ( + linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f') == + ( + '' + 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f' + ) + ) def test_non_url(): """document.vulnerable should absolutely not be linkified.""" s = 'document.vulnerable' - eq_(s, linkify(s)) + assert linkify(s) == s def test_javascript_url(): """javascript: urls should never be linkified.""" s = 'javascript:document.vulnerable' - eq_(s, linkify(s)) + assert linkify(s) == s def test_unsafe_url(): """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning.""" - eq_('All your{"xx.yy.com/grover.png"}base are', - linkify('All your{"xx.yy.com/grover.png"}base are')) + assert ( + linkify('All your{"xx.yy.com/grover.png"}base are') == + 'All your{"xx.yy.com/grover.png"}base are' + ) def test_skip_pre(): @@ -306,171 +355,194 @@ def test_skip_pre(): all_linked = ('http://xx.com ' '
http://xx.com'
                   '
') - eq_(linked, linkify(simple, skip_pre=True)) - eq_(all_linked, linkify(simple)) + assert linkify(simple, skip_pre=True) == linked + assert linkify(simple) == all_linked already_linked = '
xx
' nofollowed = '
xx
' - eq_(nofollowed, linkify(already_linked)) - eq_(nofollowed, linkify(already_linked, skip_pre=True)) - - eq_( - linkify('
http://example.com
http://example.com', - skip_pre=True), - ('
http://example.com
' - 'http://example.com') + assert linkify(already_linked) == nofollowed + assert linkify(already_linked, skip_pre=True) == nofollowed + + assert ( + linkify('
http://example.com
http://example.com', skip_pre=True) == + ( + '
http://example.com
' + 'http://example.com' + ) ) def test_libgl(): """libgl.so.1 should not be linkified.""" - eq_('libgl.so.1', linkify('libgl.so.1')) + s = 'libgl.so.1' + assert linkify(s) == s -def test_end_of_sentence(): +@pytest.mark.parametrize('url,periods', [ + ('example.com', '.'), + ('example.com', '...'), + ('ex.com/foo', '.'), + ('ex.com/foo', '....'), +]) +def test_end_of_sentence(url, periods): """example.com. should match.""" out = '{0!s}{1!s}' intxt = '{0!s}{1!s}' - def check(u, p): - eq_(out.format(u, p), - linkify(intxt.format(u, p))) - - tests = ( - ('example.com', '.'), - ('example.com', '...'), - ('ex.com/foo', '.'), - ('ex.com/foo', '....'), - ) - - for u, p in tests: - yield check, u, p + assert linkify(intxt.format(url, periods)) == out.format(url, periods) def test_end_of_clause(): """example.com/foo, shouldn't include the ,""" - eq_('ex.com/foo, bar', - linkify('ex.com/foo, bar')) + assert ( + linkify('ex.com/foo, bar') == + 'ex.com/foo, bar' + ) def test_sarcasm(): """Jokes should crash.""" - dirty = 'Yeah right ' - clean = 'Yeah right <sarcasm/>' - eq_(clean, linkify(dirty)) - - -def test_wrapping_parentheses(): + assert linkify('Yeah right ') == 'Yeah right <sarcasm/>' + + +@pytest.mark.parametrize('data,expected_data', [ + ( + '(example.com)', + ('(', 'example.com', 'example.com', ')') + ), + ( + '(example.com/)', + ('(', 'example.com/', 'example.com/', ')') + ), + ( + '(example.com/foo)', + ('(', 'example.com/foo', 'example.com/foo', ')') + ), + ( + '(((example.com/))))', + ('(((', 'example.com/', 'example.com/', '))))') + ), + ( + 'example.com/))', + ('', 'example.com/', 'example.com/', '))') + ), + ( + '(foo http://example.com/)', + ('(foo ', 'example.com/', 'http://example.com/', ')') + ), + ( + '(foo http://example.com)', + ('(foo ', 'example.com', 'http://example.com', ')') + ), + ( + 'http://en.wikipedia.org/wiki/Test_(assessment)', + ('', 'en.wikipedia.org/wiki/Test_(assessment)', + 'http://en.wikipedia.org/wiki/Test_(assessment)', '') + ), + ( 
+ '(http://en.wikipedia.org/wiki/Test_(assessment))', + ('(', 'en.wikipedia.org/wiki/Test_(assessment)', + 'http://en.wikipedia.org/wiki/Test_(assessment)', ')') + ), + ( + '((http://en.wikipedia.org/wiki/Test_(assessment))', + ('((', 'en.wikipedia.org/wiki/Test_(assessment', + 'http://en.wikipedia.org/wiki/Test_(assessment', '))') + ), + ( + '(http://en.wikipedia.org/wiki/Test_(assessment)))', + ('(', 'en.wikipedia.org/wiki/Test_(assessment))', + 'http://en.wikipedia.org/wiki/Test_(assessment))', ')') + ), + ( + '(http://en.wikipedia.org/wiki/)Test_(assessment', + ('(', 'en.wikipedia.org/wiki/)Test_(assessment', + 'http://en.wikipedia.org/wiki/)Test_(assessment', '') + ) +]) +def test_wrapping_parentheses(data, expected_data): """URLs wrapped in parantheses should not include them.""" out = '{0!s}{2!s}{3!s}' - tests = ( - ('(example.com)', ('(', 'example.com', 'example.com', ')')), - ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')), - ('(example.com/foo)', - ('(', 'example.com/foo', 'example.com/foo', ')')), - ('(((example.com/))))', - ('(((', 'example.com/', 'example.com/', '))))')), - ('example.com/))', - ('', 'example.com/', 'example.com/', '))')), - ('(foo http://example.com/)', - ('(foo ', 'example.com/', 'http://example.com/', ')')), - ('(foo http://example.com)', - ('(foo ', 'example.com', 'http://example.com', ')')), - ('http://en.wikipedia.org/wiki/Test_(assessment)', - ('', 'en.wikipedia.org/wiki/Test_(assessment)', - 'http://en.wikipedia.org/wiki/Test_(assessment)', '')), - ('(http://en.wikipedia.org/wiki/Test_(assessment))', - ('(', 'en.wikipedia.org/wiki/Test_(assessment)', - 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')), - ('((http://en.wikipedia.org/wiki/Test_(assessment))', - ('((', 'en.wikipedia.org/wiki/Test_(assessment', - 'http://en.wikipedia.org/wiki/Test_(assessment', '))')), - ('(http://en.wikipedia.org/wiki/Test_(assessment)))', - ('(', 'en.wikipedia.org/wiki/Test_(assessment))', - 
'http://en.wikipedia.org/wiki/Test_(assessment))', ')')), - ('(http://en.wikipedia.org/wiki/)Test_(assessment', - ('(', 'en.wikipedia.org/wiki/)Test_(assessment', - 'http://en.wikipedia.org/wiki/)Test_(assessment', '')), - ) - - def check(test, expected_output): - eq_(out.format(*expected_output), linkify(test)) - - for test, expected_output in tests: - yield check, test, expected_output + assert linkify(data) == out.format(*expected_data) def test_parentheses_with_removing(): - expect = '(test.py)' - eq_(expect, linkify(expect, callbacks=[lambda *a: None])) - - -def test_ports(): + expected = '(test.py)' + assert linkify(expected, callbacks=[lambda *a: None]) == expected + + +@pytest.mark.parametrize('data,expected_data', [ + ('http://foo.com:8000', ('http://foo.com:8000', '')), + ('http://foo.com:8000/', ('http://foo.com:8000/', '')), + ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')), + ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')), + ('http://foo.com:', ('http://foo.com', ':')), +]) +def test_ports(data, expected_data): """URLs can contain port numbers.""" - tests = ( - ('http://foo.com:8000', ('http://foo.com:8000', '')), - ('http://foo.com:8000/', ('http://foo.com:8000/', '')), - ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')), - ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')), - ('http://foo.com:', ('http://foo.com', ':')), - ) - - def check(test, output): - out = '{0}{1}' - eq_(out.format(*output), - linkify(test)) - - for test, output in tests: - yield check, test, output + out = '{0}{1}' + assert linkify(data) == out.format(*expected_data) def test_tokenizer(): """Linkify doesn't always have to sanitize.""" raw = 'test' - eq_('test<x></x>', linkify(raw)) - eq_(raw, linkify(raw, tokenizer=HTMLTokenizer)) + assert linkify(raw) == 'test<x></x>' + assert linkify(raw, tokenizer=HTMLTokenizer) == raw def test_ignore_bad_protocols(): - eq_('foohttp://bar', - linkify('foohttp://bar')) - eq_('fohttp://exampl.com', - 
linkify('fohttp://exampl.com')) + assert ( + linkify('foohttp://bar') == + 'foohttp://bar' + ) + assert ( + linkify('fohttp://exampl.com') == + 'fohttp://exampl.com' + ) def test_max_recursion_depth(): """If we hit the max recursion depth, just return the string.""" test = '' * 2000 + 'foo' + '' * 2000 - eq_(test, linkify(test)) + assert linkify(test) == test def test_link_emails_and_urls(): """parse_email=True shouldn't prevent URLs from getting linkified.""" - output = ('' - 'http://example.com ' - 'person@example.com') - eq_(output, linkify('http://example.com person@example.com', - parse_email=True)) + assert ( + linkify('http://example.com person@example.com', parse_email=True) == + ( + '' + 'http://example.com ' + 'person@example.com' + ) + ) def test_links_case_insensitive(): """Protocols and domain names are case insensitive.""" expect = ('' 'HTTP://EXAMPLE.COM') - eq_(expect, linkify('HTTP://EXAMPLE.COM')) + assert linkify('HTTP://EXAMPLE.COM') == expect def test_elements_inside_links(): - eq_('hello
', - linkify('hello
')) + assert ( + linkify('hello
') == + 'hello
' + ) - eq_('bold hello
', - linkify('bold hello
')) + assert ( + linkify('bold hello
') == + 'bold hello
' + ) def test_remove_first_childlink(): - expect = '

something

' callbacks = [lambda *a: None] - eq_(expect, - linkify('

something

', callbacks=callbacks)) + assert ( + linkify('

something

', callbacks=callbacks) == + '

something

' + ) From 530fcd283c9eab23a72739ae60e37acf16f23eec Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 13:47:16 -0400 Subject: [PATCH 021/314] Rewrite test_security.py to work with py.test --- tests/test_security.py | 121 +++++++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 47 deletions(-) diff --git a/tests/test_security.py b/tests/test_security.py index 6adab59b..7ebb25cd 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -1,90 +1,118 @@ """More advanced security tests""" -from nose.tools import eq_ - from bleach import clean def test_nested_script_tag(): - eq_('<<script>script>evil()<</script>/script>', - clean('</script>')) - eq_('<<x>script>evil()<</x>/script>', - clean('<script>evil()</script>')) + assert ( + clean('</script>') == + '<<script>script>evil()<</script>/script>' + ) + assert ( + clean('<script>evil()</script>') == + '<<x>script>evil()<</x>/script>' + ) def test_nested_script_tag_r(): - eq_('<script<script>>evil()</script<>>', - clean('>evil()>')) + assert ( + clean('>evil()>') == + '<script<script>>evil()</script<>>' + ) def test_invalid_attr(): IMG = ['img', ] IMG_ATTR = ['src'] - eq_('test', - clean('test')) - eq_('', - clean('', - tags=IMG, attributes=IMG_ATTR)) - eq_('', - clean('', - tags=IMG, attributes=IMG_ATTR)) + assert ( + clean('test') == + 'test' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) def test_unquoted_attr(): - eq_('myabbr', - clean('myabbr')) + assert ( + clean('myabbr') == + 'myabbr' + ) def test_unquoted_event_handler(): - eq_('xx.com', - clean('xx.com')) + assert ( + clean('xx.com') == + 'xx.com' + ) def test_invalid_attr_value(): - eq_('<img src="javascript:alert(\'XSS\');">', - clean('')) + assert ( + clean('') == + '<img src="javascript:alert(\'XSS\');">' + ) def test_invalid_href_attr(): - eq_('xss', - clean('xss')) + assert ( + clean('xss') == + 'xss' + ) def 
test_invalid_filter_attr(): IMG = ['img', ] IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"} - eq_('', - clean('', - tags=IMG, attributes=IMG_ATTR)) - - eq_('', clean('', - tags=IMG, attributes=IMG_ATTR)) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) def test_invalid_tag_char(): - eq_('<script xss="" src="http://xx.com/xss.js"></script>', - clean('')) - eq_('<script src="http://xx.com/xss.js"></script>', - clean('')) + assert ( + clean('') == + '<script xss="" src="http://xx.com/xss.js"></script>' + ) + assert ( + clean('') == + '<script src="http://xx.com/xss.js"></script>' + ) def test_unclosed_tag(): - eq_('<script src="http://xx.com/xss.js&lt;b">', - clean('ipt>' - eq_('pt>alert(1)ipt>', clean(s, strip=True)) + assert clean(s, strip=True) == 'pt>alert(1)ipt>' s = 'pt>pt>alert(1)' - eq_('pt>pt>alert(1)', clean(s, strip=True)) + assert clean(s, strip=True) == 'pt>pt>alert(1)' def test_nasty(): @@ -94,7 +122,7 @@ def test_nasty(): expect = ('<scr<script></script>ipt type="text/javascript"' '>alert("foo");</script>script<del></del>' '>') - eq_(expect, clean(test)) + assert clean(test) == expect def test_poster_attribute(): @@ -102,11 +130,10 @@ def test_poster_attribute(): tags = ['video'] attrs = {'video': ['poster']} test = '' - expect = '' - eq_(expect, clean(test, tags=tags, attributes=attrs)) + assert clean(test, tags=tags, attributes=attrs) == '' ok = '' - eq_(ok, clean(ok, tags=tags, attributes=attrs)) + assert clean(ok, tags=tags, attributes=attrs) == ok def test_feed_protocol(): - eq_('foo', clean('foo')) + assert clean('foo') == 'foo' From 76f54caf0c4b3b53a2708b3a1f45f83c2be346eb Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 13:53:43 -0400 Subject: [PATCH 022/314] Rewrite test_unicode.py to work with py.test --- tests/test_unicode.py | 57 +++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 32 
deletions(-) diff --git a/tests/test_unicode.py b/tests/test_unicode.py index 723df5f2..b8b670e8 100644 --- a/tests/test_unicode.py +++ b/tests/test_unicode.py @@ -1,59 +1,52 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -from nose.tools import eq_ + +import pytest from bleach import clean, linkify -from bleach.tests.tools import in_ def test_japanese_safe_simple(): - eq_('ヘルプとチュートリアル', clean('ヘルプとチュートリアル')) - eq_('ヘルプとチュートリアル', linkify('ヘルプとチュートリアル')) + assert clean('ヘルプとチュートリアル') == 'ヘルプとチュートリアル' + assert linkify('ヘルプとチュートリアル') == 'ヘルプとチュートリアル' def test_japanese_strip(): - eq_('ヘルプとチュートリアル', - clean('ヘルプとチュートリアル')) - eq_('<span>ヘルプとチュートリアル</span>', - clean('ヘルプとチュートリアル')) + assert clean('ヘルプとチュートリアル') == 'ヘルプとチュートリアル' + assert clean('ヘルプとチュートリアル') == '<span>ヘルプとチュートリアル</span>' def test_russian_simple(): - eq_('Домашняя', clean('Домашняя')) - eq_('Домашняя', linkify('Домашняя')) + assert clean('Домашняя') == 'Домашняя' + assert linkify('Домашняя') == 'Домашняя' def test_mixed(): - eq_('Домашняяヘルプとチュートリアル', - clean('Домашняяヘルプとチュートリアル')) + assert clean('Домашняяヘルプとチュートリアル') == 'Домашняяヘルプとチュートリアル' def test_mixed_linkify(): - in_(('Домашняя ' - 'http://example.com ヘルプとチュートリアル', - 'Домашняя ' - 'http://example.com ヘルプとチュートリアル'), - linkify('Домашняя http://example.com ヘルプとチュートリアル')) + assert ( + linkify('Домашняя http://example.com ヘルプとチュートリアル') in + ( + 'Домашняя http://example.com ヘルプとチュートリアル', + 'Домашняя http://example.com ヘルプとチュートリアル' + ) + ) -def test_url_utf8(): +@pytest.mark.parametrize('test,expected', [ + ('http://éxámplé.com/', 'http://éxámplé.com/'), + ('http://éxámplé.com/íàñá/', 'http://éxámplé.com/íàñá/'), + ('http://éxámplé.com/íàñá/?foo=bar', 'http://éxámplé.com/íàñá/?foo=bar'), + ('http://éxámplé.com/íàñá/?fóo=bár', 'http://éxámplé.com/íàñá/?fóo=bár'), +]) +def test_url_utf8(test, expected): """Allow UTF8 characters in URLs themselves.""" outs = ('{0!s}', '{0!s}') out = lambda url: [x.format(url) for x in outs] - tests = ( - 
('http://éxámplé.com/', out('http://éxámplé.com/')), - ('http://éxámplé.com/íàñá/', out('http://éxámplé.com/íàñá/')), - ('http://éxámplé.com/íàñá/?foo=bar', - out('http://éxámplé.com/íàñá/?foo=bar')), - ('http://éxámplé.com/íàñá/?fóo=bár', - out('http://éxámplé.com/íàñá/?fóo=bár')), - ) - - def check(test, expected_output): - in_(expected_output, linkify(test)) - - for test, expected_output in tests: - yield check, test, expected_output + expected = out(expected) + assert linkify(test) in expected From ba7a7825b25a628e2810279b75d9254f6caee199 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 14:06:20 -0400 Subject: [PATCH 023/314] Update travis and tox files to use py.test --- .travis.yml | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9a1f0b43..8b45c177 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ python: install: - pip install -r requirements.txt script: -- nosetests +- py.test - flake8 bleach/ deploy: provider: pypi diff --git a/tox.ini b/tox.ini index da138989..8d409cf4 100644 --- a/tox.ini +++ b/tox.ini @@ -11,4 +11,4 @@ commands = py.test {posargs:-v} deps = six html5lib==0.999 - nose + pytest From 4d749c9e36d95b852d470537a0b03bdb06e7f5fc Mon Sep 17 00:00:00 2001 From: Alexandre Macabies Date: Sun, 19 Jun 2016 00:59:43 +0200 Subject: [PATCH 024/314] Use ASCII digits in port number parsing --- bleach/__init__.py | 2 +- tests/test_links.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index bf67bf33..3f08bfdf 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -71,7 +71,7 @@ def emit(self, record): url_re = re.compile( r"""\(* # Match any opening parentheses. \b(?"]*)? 
# /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) diff --git a/tests/test_links.py b/tests/test_links.py index 15e40018..67fc2d1a 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -473,11 +473,18 @@ def test_parentheses_with_removing(): @pytest.mark.parametrize('data,expected_data', [ + # Test valid ports ('http://foo.com:8000', ('http://foo.com:8000', '')), ('http://foo.com:8000/', ('http://foo.com:8000/', '')), + + # Test non ports ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')), ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')), ('http://foo.com:', ('http://foo.com', ':')), + + # Test non-ascii ports + ('http://foo.com:\u0663\u0669/', ('http://foo.com', ':\u0663\u0669/')), + ('http://foo.com:\U0001d7e0\U0001d7d8/', ('http://foo.com', ':\U0001d7e0\U0001d7d8/')), ]) def test_ports(data, expected_data): """URLs can contain port numbers.""" From 7a6ab9d3fb7f61d71d4215f5ee308c266cbe8454 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 15:23:56 -0400 Subject: [PATCH 025/314] Allow travis to fail with Python 2.6 and 3.2 It's failing now with Python 2.6 because flake8 doesn't work with Python 2.6. It's failing with Python 3.2 because py.test doesn't work with Python 3.2. I'm not quite ready to drop support for both of them, though. So for now, let's allow the failures. 
--- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index 8b45c177..93941ff5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,10 @@ python: - "3.4" - "3.5" - "pypy" +matrix: + allow_failures: + - python: "2.6" + - python: "3.2" install: - pip install -r requirements.txt script: From 3cd9d32371b0e03d33071eb82735dc48e3f91a6e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 15:44:48 -0400 Subject: [PATCH 026/314] Fix test_idempotent to not sometimes fail linkify can return text with html attributes in a different order depending on how the attributes come out of the attrs dict. Because of that, there are several possible outcomes. This fixes the linkify test to accept both of them. Fixes #161 --- tests/test_basics.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_basics.py b/tests/test_basics.py index 6fa6c22e..459b29a0 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -193,8 +193,12 @@ def test_idempotent(): clean = bleach.clean(dirty) assert bleach.clean(clean) == clean + possible_outs = ( + 'invalid & < extra http://link.com', + 'invalid & < extra http://link.com' + ) linked = bleach.linkify(dirty) - assert bleach.linkify(linked) == linked + assert bleach.linkify(linked) == possible_outs def test_rel_already_there(): From 4df81b9b6fdf1e4d38715e6e77b22b7e22a30152 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 15:52:09 -0400 Subject: [PATCH 027/314] Fix the test I just broke --- tests/test_basics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_basics.py b/tests/test_basics.py index 459b29a0..07d4d918 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -198,7 +198,7 @@ def test_idempotent(): 'invalid & < extra http://link.com' ) linked = bleach.linkify(dirty) - assert bleach.linkify(linked) == possible_outs + assert bleach.linkify(linked) in possible_outs def 
test_rel_already_there(): From 0c070f6733501b5e2bb3c36e7946f7229b94b79b Mon Sep 17 00:00:00 2001 From: Lorenz Schori Date: Fri, 9 Sep 2016 10:23:11 +0200 Subject: [PATCH 028/314] Do not add trailing period when email address is at the end of a sentence --- bleach/__init__.py | 2 +- tests/test_links.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 3f08bfdf..3a53870d 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -88,7 +88,7 @@ def emit(self, record): (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)* # dot-atom |^"([\001-\010\013\014\016-\037!#-\[\]-\177] |\\[\001-011\013\014\016-\177])*" # quoted-string - )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.? # domain + )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}) # domain """, re.IGNORECASE | re.MULTILINE | re.VERBOSE) diff --git a/tests/test_links.py b/tests/test_links.py index 67fc2d1a..40260785 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -106,7 +106,13 @@ def ft(attrs, new=False): '
jinkyun@example.com', True, '
jinkyun@example.com' - ) + ), + # Mailto links at the end of a sentence. + ( + 'mailto james@example.com.au.', + True, + 'mailto james@example.com.au.' + ), ]) def test_email_link(data, parse_email, expected): assert linkify(data, parse_email=parse_email) == expected From 751619b43668d8e509b00afc4835fbb31955d64f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 16:57:35 -0400 Subject: [PATCH 029/314] Update CHANGES; add .cache to .gitignore --- .gitignore | 1 + CHANGES | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index c24310fb..78421070 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ dist build .tox docs/_build/ +.cache diff --git a/CHANGES b/CHANGES index 8784e83e..645e7d03 100644 --- a/CHANGES +++ b/CHANGES @@ -15,6 +15,12 @@ Version 1.5? (in progress) - clean: Added ``protocols`` to arguments list to let you override the list of allowed protocols. Thank you, Andreas Malecki! #149 +- linkify: Fix a bug involving periods at the end of an email address. Thank you, + Lorenz Schori! #219 +- linkify: Fix linkification of non-ascii ports. Thank you Alexandre, Macabies! + #207 +- Fixed a test that failed periodically. #161 +- Switched from nose to py.test. Version 1.4.3 (May 23rd, 2016) From e8649dfa7d1a758eba2558240123f0c1cedc735e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 21:05:38 -0400 Subject: [PATCH 030/314] Nix duplicate requirements; add english description of html5lib versions The requirements.txt file specified requirements already in setup.py, so there's no point in having both versions. Therefore I nixed the ones in requirements.txt. The html5lib versions are hard to read, so I added english descriptions of them. --- requirements.txt | 4 ---- setup.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 79aa6e61..a026c46e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,5 @@ -e . 
-ordereddict==1.1 -six==1.10.0 -html5lib>=0.999,<0.99999999 - # Requirements to run the test suite: pytest==3.0.3 flake8==3.0.4 diff --git a/setup.py b/setup.py index 26686efd..40b62a72 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ install_requires = [ 'six', - 'html5lib>=0.999,<0.99999999', + 'html5lib>=0.999,<0.99999999', # 3 9s to 8 9s ] try: From feaad361ce840f080c81b4c88f5d8b60190807c5 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 21:07:21 -0400 Subject: [PATCH 031/314] Fix .travis.yml and tox.ini to test multiple html5lib versions This fixes .travis.yml and tox.ini to test multiple html5lib versions that we allegedly support. --- .travis.yml | 7 +++++++ tox.ini | 21 ++++++++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 93941ff5..a6498a1f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,12 +11,19 @@ python: - "3.4" - "3.5" - "pypy" +env: +- HTML5LIB=0.999 # 3 +- HTML5LIB=0.9999 # 4 +- HTML5LIB=0.99999 # 5 +- HTML5LIB=0.999999 # 6 +- HTML5LIB=0.9999999 # 7 matrix: allow_failures: - python: "2.6" - python: "3.2" install: - pip install -r requirements.txt +- pip install html5lib==$HTML5LIB script: - py.test - flake8 bleach/ diff --git a/tox.ini b/tox.ini index 8d409cf4..3c0ba1f8 100644 --- a/tox.ini +++ b/tox.ini @@ -4,11 +4,22 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py32, py33, py34, py35, pypy +envlist = py{26,27,32,33,34,35}-html5lib{999,9999,99999,999999,9999999},pypy-html5lib9999999 [testenv] -commands = py.test {posargs:-v} +basepython = + py26: python2.6 + py27: python2.7 + py32: python3.2 + py33: python3.3 + py34: python3.4 + py35: python3.5 deps = - six - html5lib==0.999 - pytest + -rrequirements.txt + html5lib999: html5lib==0.999 + html5lib9999: html5lib==0.9999 + html5lib99999: html5lib==0.99999 + html5lib999999: html5lib==0.999999 + html5lib9999999: html5lib==0.9999999 +commands = + py.test {posargs:-v} From 8f879871a60cb6cd5e3b610fbc01c884633c1ecf Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 1 Nov 2016 09:37:27 -0400 Subject: [PATCH 032/314] Remove support for html5lib 0.9999 and 0.99999 html5lib 0.9999 and 0.99999 have a bug about relative urls and the bleach tests fail with those two versions. Given that, this removes support for both of those. Additionally, I tweaked the CHANGES file a bit. --- .travis.yml | 2 -- CHANGES | 13 ++++++++----- setup.py | 4 +++- tox.ini | 4 +--- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index a6498a1f..59912666 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,8 +13,6 @@ python: - "pypy" env: - HTML5LIB=0.999 # 3 -- HTML5LIB=0.9999 # 4 -- HTML5LIB=0.99999 # 5 - HTML5LIB=0.999999 # 6 - HTML5LIB=0.9999999 # 7 matrix: diff --git a/CHANGES b/CHANGES index 645e7d03..000054d6 100644 --- a/CHANGES +++ b/CHANGES @@ -20,7 +20,10 @@ Version 1.5? (in progress) - linkify: Fix linkification of non-ascii ports. Thank you Alexandre, Macabies! #207 - Fixed a test that failed periodically. #161 -- Switched from nose to py.test. +- Switched from nose to py.test. #204 +- Add test matrix for all supported Python and html5lib versions. #230 +- Limit to html5lib ``>=0.999,!=0.9999,!=0.99999,<0.99999999`` because 0.9999 + and 0.99999 are busted. 
Version 1.4.3 (May 23rd, 2016) @@ -28,7 +31,7 @@ Version 1.4.3 (May 23rd, 2016) **Changes** -- Limit to html5lib >=0.999<0.99999999 because of impending change to +- Limit to html5lib ``>=0.999,<0.99999999`` because of impending change to sanitizer api. #195 @@ -37,7 +40,7 @@ Version 1.4.2 (September 11, 2015) **Changes** -- linkify: Fix hang in linkify with parse_email=True. #124 +- linkify: Fix hang in linkify with ``parse_email=True``. #124 - linkify: Fix crash in linkify when removing a link that is a first-child. #136 - Updated TLDs. - linkify: Don't remove exterior brackets when linkifying. #146 @@ -58,7 +61,7 @@ Version 1.4 (January 12, 2014) **Changes** - linkify: Update linkify to use etree type Treewalker instead of simpletree. -- Updated html5lib to version >= 0.999. +- Updated html5lib to version ``>=0.999``. - Update all code to be compatible with Python 3 and 2 using six. - Switch to Apache License. @@ -77,7 +80,7 @@ Version 1.2.2 (May 18, 2013) Version 1.2.1 (February 19, 2013) --------------------------------- -- clean() no longer considers "feed:" an acceptable protocol due to +- clean() no longer considers ``feed:`` an acceptable protocol due to inconsistencies in browser behavior. diff --git a/setup.py b/setup.py index 40b62a72..872970e0 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,9 @@ install_requires = [ 'six', - 'html5lib>=0.999,<0.99999999', # 3 9s to 8 9s + # 3 9s up to but not including 8 9s, but not 4 9s or 5 9s because they're + # busted + 'html5lib>=0.999,!=0.9999,!=0.99999,<0.99999999', ] try: diff --git a/tox.ini b/tox.ini index 3c0ba1f8..73c8511c 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py{26,27,32,33,34,35}-html5lib{999,9999,99999,999999,9999999},pypy-html5lib9999999 +envlist = py{26,27,32,33,34,35}-html5lib{999,999999,9999999},pypy-html5lib9999999 [testenv] basepython = @@ -17,8 +17,6 @@ basepython = deps = -rrequirements.txt html5lib999: html5lib==0.999 - html5lib9999: html5lib==0.9999 - html5lib99999: html5lib==0.99999 html5lib999999: html5lib==0.999999 html5lib9999999: html5lib==0.9999999 commands = From 93ac61f6969fb1c0290febaed2fc7e161dc02d7e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 2 Nov 2016 09:15:14 -0400 Subject: [PATCH 033/314] Add support for "python setup.py test" --- .gitignore | 3 ++- CHANGES | 1 + setup.cfg | 3 +++ setup.py | 12 ++++++++++++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 78421070..f5adb549 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ dist build .tox docs/_build/ -.cache +.cache/ +.eggs/ diff --git a/CHANGES b/CHANGES index 000054d6..ee4f2806 100644 --- a/CHANGES +++ b/CHANGES @@ -24,6 +24,7 @@ Version 1.5? (in progress) - Add test matrix for all supported Python and html5lib versions. #230 - Limit to html5lib ``>=0.999,!=0.9999,!=0.99999,<0.99999999`` because 0.9999 and 0.99999 are busted. +- Add support for ``python setup.py test``. 
#97 Version 1.4.3 (May 23rd, 2016) diff --git a/setup.cfg b/setup.cfg index 38f6166d..f3a416e4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,6 @@ +[aliases] +test=pytest + [flake8] ignore = E731,W503 diff --git a/setup.py b/setup.py index 872970e0..2a28da45 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,18 @@ import re +import sys from setuptools import setup, find_packages from distutils.util import convert_path +setup_requires = [] +if 'test' in sys.argv: + # Only add pytest-runner to setup_requires if running tests + setup_requires.append('pytest-runner>=2.0,<3dev') + +tests_require = [ + 'pytest==3.0.3', +] + install_requires = [ 'six', # 3 9s up to but not including 8 9s, but not 4 9s or 5 9s because they're @@ -52,6 +62,8 @@ def get_version(): package_data={'': ['README.rst']}, zip_safe=False, install_requires=install_requires, + setup_requires=setup_requires, + tests_require=tests_require, classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Web Environment', From 4e868219083a0bd6799e5a47b521d20847640661 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 2 Nov 2016 10:16:52 -0400 Subject: [PATCH 034/314] Clarify docs regarding appropriate use cases and goals This clarifies some of the guiding principles behind what bleach is for and not for so it's clearer to users whether their needs will be met. 
--- bleach/__init__.py | 25 +++++++++++++----- docs/goals.rst | 63 +++++++++++++++++++++++++++++++++------------- 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 3a53870d..097c0d93 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -104,7 +104,14 @@ def emit(self, record): def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, strip_comments=True): - """Clean an HTML fragment and return it + """Clean an HTML fragment of malicious content and return it + + This function is a security-focused function whose sole purpose is to + remove malicious content from a string such that it can be displayed as + content in a web page. + + This function is not designed to use to transform content to be used in + non-web-page contexts. :arg text: the text to clean :arg tags: whitelist of allowed tags; defaults to @@ -139,12 +146,18 @@ class s(BleachSanitizer): def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False, tokenizer=HTMLSanitizer): - """Convert URL-like strings in an HTML fragment to links. + """Convert URL-like strings in an HTML fragment to links + + ``linkify()`` converts strings that look like URLs or domain names in a + blob of text that may be an HTML fragment to links, while preserving: + + 1. links already in the string + 2. urls found in attributes + 3. email addresses + + ``linkify()`` does a best-effort approach and tries to recover from bad + situations due to crazy text. - linkify() converts strings that look like URLs or domain names in a - blob of text that may be an HTML fragment to links, while preserving - (a) links already in the string, (b) urls found in attributes, and - (c) email addresses. 
""" text = force_unicode(text) diff --git a/docs/goals.rst b/docs/goals.rst index d62d54b5..74d0b171 100644 --- a/docs/goals.rst +++ b/docs/goals.rst @@ -6,13 +6,15 @@ This document lists the goals and non-goals of Bleach. My hope is that by focusing on these goals and explicitly listing the non-goals, the project will evolve in a stronger direction. +.. contents:: + Goals ===== -Whitelisting ------------- +Always take a whitelist-based approach +-------------------------------------- Bleach should always take a whitelist-based approach to allowing any kind of content or markup. Blacklisting is error-prone and not future proof. @@ -22,8 +24,8 @@ not blacklist all the other ``on*`` attributes. Future versions of HTML may add new event handlers, like ``ontouch``, that old blacklists would not prevent. -Sanitizing Input ----------------- +Main goal is to sanitize input of malicious content +--------------------------------------------------- The primary goal of Bleach is to sanitize user input that is allowed to contain *some* HTML as markup and is to be included in the content of a larger page. @@ -43,8 +45,8 @@ content, and will use the HTML5 parsing algorithm to handle invalid markup. See the :ref:`chapter on clean() ` for more info. -Safely Creating Links ---------------------- +Safely create cinks +------------------- The secondary goal of Bleach is to provide a mechanism for finding or altering links (```` tags with ``href`` attributes, or things that look like URLs or @@ -62,18 +64,43 @@ Non-Goals Bleach is designed to work with fragments of HTML by untrusted users. Some non-goal use cases include: -* **Sanitizing complete HTML documents.** Once you're creating whole documents, - you have to allow so many tags that a blacklist approach (e.g. 
forbidding - ``& diff --git a/tests/data/1.test.out b/tests/data/1.test.out new file mode 100644 index 00000000..d89228ad --- /dev/null +++ b/tests/data/1.test.out @@ -0,0 +1 @@ +>"><script>alert("XSS")</script>& \ No newline at end of file diff --git a/tests/data/10.test b/tests/data/10.test new file mode 100644 index 00000000..268771bc --- /dev/null +++ b/tests/data/10.test @@ -0,0 +1 @@ + diff --git a/tests/data/10.test.out b/tests/data/10.test.out new file mode 100644 index 00000000..29998a1f --- /dev/null +++ b/tests/data/10.test.out @@ -0,0 +1 @@ +<img src="javascript:alert('XSS');"> \ No newline at end of file diff --git a/tests/data/11.test b/tests/data/11.test new file mode 100644 index 00000000..16a49c70 --- /dev/null +++ b/tests/data/11.test @@ -0,0 +1 @@ + diff --git a/tests/data/11.test.out b/tests/data/11.test.out new file mode 100644 index 00000000..52a02dc6 --- /dev/null +++ b/tests/data/11.test.out @@ -0,0 +1 @@ +<img src="javascript:alert('XSS')"> \ No newline at end of file diff --git a/tests/data/12.test b/tests/data/12.test new file mode 100644 index 00000000..d4b96e6f --- /dev/null +++ b/tests/data/12.test @@ -0,0 +1 @@ + diff --git a/tests/data/12.test.out b/tests/data/12.test.out new file mode 100644 index 00000000..fb0807ae --- /dev/null +++ b/tests/data/12.test.out @@ -0,0 +1 @@ +<img src="JaVaScRiPt:alert('XSS')"> \ No newline at end of file diff --git a/tests/data/13.test b/tests/data/13.test new file mode 100644 index 00000000..07279a83 --- /dev/null +++ b/tests/data/13.test @@ -0,0 +1 @@ +")> diff --git a/tests/data/13.test.out b/tests/data/13.test.out new file mode 100644 index 00000000..1c866507 --- /dev/null +++ b/tests/data/13.test.out @@ -0,0 +1 @@ +<img src="JaVaScRiPt:alert("XSS&lt;WBR">")> \ No newline at end of file diff --git a/tests/data/14.test b/tests/data/14.test new file mode 100644 index 00000000..b704c0b4 --- /dev/null +++ b/tests/data/14.test @@ -0,0 +1 @@ +#115;crip&#116;:a diff --git a/tests/data/14.test.out 
b/tests/data/14.test.out new file mode 100644 index 00000000..16445739 --- /dev/null +++ b/tests/data/14.test.out @@ -0,0 +1 @@ +<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr>#116;:a \ No newline at end of file diff --git a/tests/data/15.test b/tests/data/15.test new file mode 100644 index 00000000..b6a2de6b --- /dev/null +++ b/tests/data/15.test @@ -0,0 +1 @@ +le&#114;t('XS;S')> diff --git a/tests/data/15.test.out b/tests/data/15.test.out new file mode 100644 index 00000000..334f916b --- /dev/null +++ b/tests/data/15.test.out @@ -0,0 +1 @@ +le&<wbr>#114;t('XS<wbr>;S')> \ No newline at end of file diff --git a/tests/data/16.test b/tests/data/16.test new file mode 100644 index 00000000..d66b5921 --- /dev/null +++ b/tests/data/16.test @@ -0,0 +1 @@ +#0000118as&#0000099ri&#0000112t:&#0000097le&#0000114t(&#0000039XS&#0000083')> diff --git a/tests/data/16.test.out b/tests/data/16.test.out new file mode 100644 index 00000000..9c6ca965 --- /dev/null +++ b/tests/data/16.test.out @@ -0,0 +1 @@ +<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr>#0000099ri&<wbr>#0000112t:&<wbr>#0000097le&<wbr>#0000114t(&<wbr>#0000039XS&<wbr>#0000083')> \ No newline at end of file diff --git a/tests/data/17.test b/tests/data/17.test new file mode 100644 index 00000000..6e71b152 --- /dev/null +++ b/tests/data/17.test @@ -0,0 +1 @@ +#x63ript:&#x61lert(&#x27XSS')> diff --git a/tests/data/17.test.out b/tests/data/17.test.out new file mode 100644 index 00000000..dabfaa2d --- /dev/null +++ b/tests/data/17.test.out @@ -0,0 +1 @@ +<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr>#x61lert(&<wbr>#x27XSS')> \ No newline at end of file diff --git a/tests/data/18.test b/tests/data/18.test new file mode 100644 index 00000000..1c173723 --- /dev/null +++ b/tests/data/18.test @@ -0,0 +1 @@ + diff --git a/tests/data/18.test.out b/tests/data/18.test.out new file mode 100644 index 00000000..8046c715 --- /dev/null +++ b/tests/data/18.test.out @@ -0,0 +1 @@ +<img src="jav ascript:alert(&lt;WBR&gt;'XSS');"> 
\ No newline at end of file diff --git a/tests/data/19.test b/tests/data/19.test new file mode 100644 index 00000000..e6e79742 --- /dev/null +++ b/tests/data/19.test @@ -0,0 +1 @@ + diff --git a/tests/data/19.test.out b/tests/data/19.test.out new file mode 100644 index 00000000..8eb8794c --- /dev/null +++ b/tests/data/19.test.out @@ -0,0 +1,2 @@ +<img src="jav +ascript:alert(&lt;WBR&gt;'XSS');"> \ No newline at end of file diff --git a/tests/data/2.test b/tests/data/2.test new file mode 100644 index 00000000..21b93db3 --- /dev/null +++ b/tests/data/2.test @@ -0,0 +1 @@ +"> diff --git a/tests/data/2.test.out b/tests/data/2.test.out new file mode 100644 index 00000000..0b32b6a4 --- /dev/null +++ b/tests/data/2.test.out @@ -0,0 +1 @@ +"><style>@import"javascript:alert('XSS')";</style> \ No newline at end of file diff --git a/tests/data/3.test b/tests/data/3.test new file mode 100644 index 00000000..8dc3a4ee --- /dev/null +++ b/tests/data/3.test @@ -0,0 +1 @@ +>"'> diff --git a/tests/data/3.test.out b/tests/data/3.test.out new file mode 100644 index 00000000..20c3d0d4 --- /dev/null +++ b/tests/data/3.test.out @@ -0,0 +1 @@ +>"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> \ No newline at end of file diff --git a/tests/data/5.test b/tests/data/5.test new file mode 100644 index 00000000..0b03876b --- /dev/null +++ b/tests/data/5.test @@ -0,0 +1 @@ +>%22%27> diff --git a/tests/data/5.test.out b/tests/data/5.test.out new file mode 100644 index 00000000..1782eafb --- /dev/null +++ b/tests/data/5.test.out @@ -0,0 +1 @@ +>%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22> \ No newline at end of file diff --git a/tests/data/7.test b/tests/data/7.test new file mode 100644 index 00000000..827f9b9e --- /dev/null +++ b/tests/data/7.test @@ -0,0 +1 @@ +"> diff --git a/tests/data/7.test.out b/tests/data/7.test.out new file mode 
100644 index 00000000..41fd4322 --- /dev/null +++ b/tests/data/7.test.out @@ -0,0 +1 @@ +"> \ No newline at end of file diff --git a/tests/data/8.test b/tests/data/8.test new file mode 100644 index 00000000..ddf33a96 --- /dev/null +++ b/tests/data/8.test @@ -0,0 +1 @@ +>" diff --git a/tests/data/8.test.out b/tests/data/8.test.out new file mode 100644 index 00000000..bc1ffd44 --- /dev/null +++ b/tests/data/8.test.out @@ -0,0 +1 @@ +>" \ No newline at end of file diff --git a/tests/data/9.test b/tests/data/9.test new file mode 100644 index 00000000..9cf58659 --- /dev/null +++ b/tests/data/9.test @@ -0,0 +1 @@ +'';!--"=&{()} diff --git a/tests/data/9.test.out b/tests/data/9.test.out new file mode 100644 index 00000000..3a4d9b6c --- /dev/null +++ b/tests/data/9.test.out @@ -0,0 +1 @@ +'';!--"<xss>=&{()} \ No newline at end of file diff --git a/tests/test_security.py b/tests/test_security.py index 7ebb25cd..6ffaf449 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -1,5 +1,10 @@ """More advanced security tests""" +import os + +import pytest +import six + from bleach import clean @@ -137,3 +142,36 @@ def test_poster_attribute(): def test_feed_protocol(): assert clean('foo') == 'foo' + + +def get_tests(): + """Retrieves regression tests from data/ directory""" + datadir = os.path.join(os.path.dirname(__file__), 'data') + tests = [ + os.path.join(datadir, fn) for fn in os.listdir(datadir) + if fn.endswith('.test') + ] + # Sort numerically which makes it easier to iterate through them + tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0])) + return tests + + +@pytest.mark.parametrize('fn', get_tests()) +def test_regressions(fn): + """Regression tests for clean so we can see if there are issues""" + s = open(fn, 'r').read() + expected = six.text_type(open(fn + '.out', 'r').read()) + + # NOTE(willkg): This strips input and expected which makes it easier to + # maintain the files. 
If there comes a time when the input needs whitespace + # at the beginning or end, then we'll have to figure out something else. + assert clean(s.strip()) == expected.strip() + + +def test_regression_manually(): + """Regression tests for clean so we can see if there are issues""" + # NOTE(willkg): Have to do this one by hand because of the \r + s = """""" + expected = """<img src="jav\rascript:alert(&lt;WBR&gt;'XSS');">""" + + assert clean(s) == expected From 81e5bc737965fc59590f905107fb09096c4e239c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 Jan 2017 13:00:56 -0500 Subject: [PATCH 043/314] Add tesing for Python 3.6 --- .travis.yml | 1 + CHANGES | 4 ++++ tox.ini | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f88146d1..4e66cf1d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,7 @@ python: - "3.3" - "3.4" - "3.5" +- "3.6" - "pypy" env: - HTML5LIB=0.999 # 3 diff --git a/CHANGES b/CHANGES index c52b5485..ec3bc992 100644 --- a/CHANGES +++ b/CHANGES @@ -9,6 +9,10 @@ Version 2.0 (in development) - Removed support for Python 2.6. #206 - Removed support for Python 3.2. #224 +**Changes** + +- Added testing for Python 3.6. + Version 1.5 (November 4th, 2016) -------------------------------- diff --git a/tox.ini b/tox.ini index 02dde2d3..09ed488f 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py{27,33,34,35}-html5lib{999,999999,9999999},pypy-html5lib9999999 +envlist = py{27,33,34,35,36}-html5lib{999,999999,9999999},pypy-html5lib9999999 [testenv] basepython = @@ -12,6 +12,7 @@ basepython = py33: python3.3 py34: python3.4 py35: python3.5 + py36: python3.6 deps = -rrequirements.txt html5lib999: html5lib==0.999 From c94db9529d47943ddc8ab207108ed0b30f177f87 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 Jan 2017 13:02:03 -0500 Subject: [PATCH 044/314] Add Python 3.6 to classifiers list --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 908928e1..167658ff 100644 --- a/setup.py +++ b/setup.py @@ -68,6 +68,7 @@ def get_version(): 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Topic :: Software Development :: Libraries :: Python Modules', ] ) From ef0c48765c160f5724d915770060a095cbc69dda Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 17 Feb 2017 14:55:55 -0500 Subject: [PATCH 045/314] Update dev requirements --- requirements.txt | 7 ++++--- setup.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index a026c46e..6ec6bd90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,13 @@ -e . 
# Requirements to run the test suite: -pytest==3.0.3 -flake8==3.0.4 +pytest==3.0.6 +pytest-wholenodeid +flake8==3.3.0 tox==2.4.1 # Requirements for building docs -Sphinx==1.4.8 +Sphinx==1.5.2 # Requirements for updating package twine==1.8.1 diff --git a/setup.py b/setup.py index 167658ff..6c627a38 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup_requires.append('pytest-runner>=2.0,<3dev') tests_require = [ - 'pytest==3.0.3', + 'pytest>=3.0.0', ] install_requires = [ From 567eebb53e0716f6d267fa8951c548d63bf78a70 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 Jan 2017 17:02:03 -0500 Subject: [PATCH 046/314] Overhaul bleach to use html5lib >= 0.99999999 This is a bleach rewrite to use the new sanitizer API in html5lib 0.99999999. The new API happens as a filter when emitting the tree rather than in the tokenizer. Because of that, the output of .clean() and .linkify() are different than in previous versions of bleach. --- bleach/__init__.py | 53 +++++++----- bleach/sanitizer.py | 189 ++++++++++++++++++++--------------------- setup.py | 5 +- tests/test_basics.py | 8 +- tests/test_css.py | 74 ++++++++++------ tests/test_links.py | 4 +- tests/test_security.py | 24 ++++-- tox.ini | 7 +- 8 files changed, 204 insertions(+), 160 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 09dad637..c54dc72b 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -5,13 +5,14 @@ import re import html5lib -from html5lib.sanitizer import HTMLSanitizer -from html5lib.serializer.htmlserializer import HTMLSerializer +from html5lib.filters import sanitizer +from html5lib.filters.sanitizer import allowed_protocols +from html5lib.serializer import HTMLSerializer -from . 
import callbacks as linkify_callbacks -from .encoding import force_unicode -from .sanitizer import BleachSanitizer -from .version import __version__, VERSION # flake8: noqa +from bleach import callbacks as linkify_callbacks +from bleach.encoding import force_unicode +from bleach.sanitizer import BleachSanitizerFilter +from bleach.version import __version__, VERSION # flake8: noqa __all__ = ['clean', 'linkify'] @@ -60,7 +61,7 @@ # Make sure that .com doesn't get matched by .co first TLDS.reverse() -PROTOCOLS = HTMLSanitizer.acceptable_protocols +PROTOCOLS = allowed_protocols url_re = re.compile( r"""\(* # Match any opening parentheses. @@ -125,21 +126,34 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, text = force_unicode(text) - class s(BleachSanitizer): - allowed_elements = tags - allowed_attributes = attributes - allowed_css_properties = styles - allowed_protocols = protocols - strip_disallowed_elements = strip - strip_html_comments = strip_comments + parser = html5lib.HTMLParser(namespaceHTMLElements=False) + dom = parser.parseFragment(text) - parser = html5lib.HTMLParser(tokenizer=s) + walker = html5lib.getTreeWalker('etree') + filtered = BleachSanitizerFilter( + source=walker(dom), + allowed_attributes_map=attributes, - return _render(parser.parseFragment(text)) + allowed_elements=tags, + allowed_css_properties=styles, + allowed_protocols=protocols, + + allowed_svg_properties=[], + + strip_disallowed_elements=strip, + strip_html_comments=strip_comments + ) + s = HTMLSerializer( + quote_attr_values='always', + alphabetical_attributes=True, + omit_optional_tags=False + ) + return s.render(filtered) def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - parse_email=False, tokenizer=HTMLSanitizer): + # FIXME(willkg): parse_email=False, tokenizer=HTMLSanitizer): + parse_email=False): """Convert URL-like strings in an HTML fragment to links ``linkify()`` converts strings that look like URLs, domain names and email @@ -158,7 +172,8 @@ 
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, if not text: return '' - parser = html5lib.HTMLParser(tokenizer=tokenizer) + # FIXME(willkg): parser = html5lib.HTMLParser(tokenizer=tokenizer) + parser = html5lib.HTMLParser() forest = parser.parseFragment(text) _seen = set([]) @@ -427,7 +442,7 @@ def _render(tree): def _serialize(domtree): walker = html5lib.treewalkers.getTreeWalker('etree') stream = walker(domtree) - serializer = HTMLSerializer(quote_attr_values=True, + serializer = HTMLSerializer(quote_attr_values='always', alphabetical_attributes=True, omit_optional_tags=False) return serializer.render(stream) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index eec6659b..fb502b85 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -2,118 +2,125 @@ import re from xml.sax.saxutils import escape, unescape -from html5lib.constants import tokenTypes -from html5lib.sanitizer import HTMLSanitizerMixin -from html5lib.tokenizer import HTMLTokenizer +from html5lib.constants import namespaces +from html5lib.filters import sanitizer -PROTOS = HTMLSanitizerMixin.acceptable_protocols -PROTOS.remove('feed') +class BleachSanitizerFilter(sanitizer.Filter): + def __init__(self, source, allowed_attributes_map, + strip_disallowed_elements=False, strip_html_comments=True, + **kwargs): + if isinstance(allowed_attributes_map, dict): + self.wildcard_attributes = allowed_attributes_map.get('*', []) + self.allowed_attributes_map = allowed_attributes_map + else: + self.wildcard_attributes = allowed_attributes_map + self.allowed_attributes_map = {} -class BleachSanitizerMixin(HTMLSanitizerMixin): - """Mixin to replace sanitize_token() and sanitize_css().""" + self.strip_disallowed_elements = strip_disallowed_elements + self.strip_html_comments = strip_html_comments - allowed_svg_properties = [] + return super(BleachSanitizerFilter, self).__init__(source, **kwargs) def sanitize_token(self, token): """Sanitize a token either by HTML-encoding or dropping. 
- Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be - a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}. + Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag': + ['attribute', 'pairs'], 'tag': callable}. - Here callable is a function with two arguments of attribute name - and value. It should return true of false. + Here callable is a function with two arguments of attribute name and + value. It should return true of false. Also gives the option to strip tags instead of encoding. """ - if (getattr(self, 'wildcard_attributes', None) is None and - isinstance(self.allowed_attributes, dict)): - self.wildcard_attributes = self.allowed_attributes.get('*', []) - - if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'], - tokenTypes['EmptyTag']): + token_type = token['type'] + if token_type in ['StartTag', 'EndTag', 'EmptyTag']: if token['name'] in self.allowed_elements: - if 'data' in token: - if isinstance(self.allowed_attributes, dict): - allowed_attributes = self.allowed_attributes.get( - token['name'], []) - if not callable(allowed_attributes): - allowed_attributes += self.wildcard_attributes - else: - allowed_attributes = self.allowed_attributes - attrs = dict([(name, val) for name, val in - token['data'][::-1] - if (allowed_attributes(name, val) - if callable(allowed_attributes) - else name in allowed_attributes)]) - for attr in self.attr_val_is_uri: - if attr not in attrs: - continue - val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', - unescape(attrs[attr])).lower() - # Remove replacement characters from unescaped - # characters. 
- val_unescaped = val_unescaped.replace("\ufffd", "") - if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) - and (val_unescaped.split(':')[0] not in - self.allowed_protocols)): - del attrs[attr] - for attr in self.svg_attr_val_allows_ref: - if attr in attrs: - attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', - ' ', - unescape(attrs[attr])) - if (token['name'] in self.svg_allow_local_href and - 'xlink:href' in attrs and - re.search(r'^\s*[^#\s].*', attrs['xlink:href'])): - del attrs['xlink:href'] - if 'style' in attrs: - attrs['style'] = self.sanitize_css(attrs['style']) - token['data'] = [(name, val) for name, val in - attrs.items()] - return token + return self.allow_token(token) + elif self.strip_disallowed_elements: pass + else: - if token['type'] == tokenTypes['EndTag']: - token['data'] = ''.format(token['name']) - elif token['data']: - attr = ' {0!s}="{1!s}"' - attrs = ''.join([attr.format(k, escape(v)) for k, v in - token['data']]) - token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs) - else: - token['data'] = '<{0!s}>'.format(token['name']) - if token['selfClosing']: - token['data'] = token['data'][:-1] + '/>' - token['type'] = tokenTypes['Characters'] - del token["name"] - return token - elif token['type'] == tokenTypes['Comment']: + return self.disallowed_token(token) + + elif token_type == 'Comment': if not self.strip_html_comments: return token + else: return token - def sanitize_css(self, style): - """HTMLSanitizerMixin.sanitize_css replacement. - - HTMLSanitizerMixin.sanitize_css always whitelists background-*, - border-*, margin-*, and padding-*. We only whitelist what's in - the whitelist. 
+ def allow_token(self, token): + if 'data' in token: + allowed_attributes = self.allowed_attributes_map.get(token['name'], []) + if not callable(allowed_attributes): + allowed_attributes += self.wildcard_attributes + + # Drop any attributes that aren't allowed + attrs = {} + for namespaced_name, val in token['data'].items(): + namespace, name = namespaced_name + # FIXME(willkg): "name" used to be something like "xlink:href" + # but it's now (namespace['xlink'], 'href'). we should fix the + # name here so it's what the callable would expect. + if callable(allowed_attributes): + if allowed_attributes(name, val): + attrs[namespaced_name] = val + + elif name in allowed_attributes: + attrs[namespaced_name] = val + + # Go through all the uri-type attributes + for attr in self.attr_val_is_uri: + if attr not in attrs: + continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', + unescape(attrs[attr])).lower() + # Remove replacement characters from unescaped characters. + val_unescaped = val_unescaped.replace("\ufffd", "") + + if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and + (val_unescaped.split(':')[0] not in self.allowed_protocols)): + # It has a protocol, but it's not allowed--so drop it + del attrs[attr] + + # FIXME(willkg): is this right? + for attr in self.svg_attr_val_allows_ref: + if attr in attrs: + attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + + # FIXME(willkg): is this right? 
+ if (token['name'] in self.svg_allow_local_href and + (namespace['xlink'], 'href') in attrs and + re.search(r'^\s*[^#\s].*', attrs[(namespace['xlink'], 'href')])): + del attrs[(namespace['xlink'], 'href')] + + # Sanitize css in style attribute + if (None, u'style') in attrs: + attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')]) + + token['data'] = attrs + return token - """ + def sanitize_css(self, style): + """html5lib sanitizer filter replacement to fix issues""" # disallow urls style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) # gauntlet - # TODO: Make sure this does what it's meant to - I *think* it wants to - # validate style attribute contents. + + # Validate the css in the style tag and if it's not valid, then drop + # the whole thing. parts = style.split(';') - gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'""" - """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""") + gauntlet = re.compile( + r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""" + ) + for part in parts: if not gauntlet.match(part): return '' @@ -125,23 +132,11 @@ def sanitize_css(self, style): for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style): if not value: continue + if prop.lower() in self.allowed_css_properties: clean.append(prop + ': ' + value + ';') + elif prop.lower() in self.allowed_svg_properties: clean.append(prop + ': ' + value + ';') return ' '.join(clean) - - -class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin): - def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, - lowercaseElementName=True, lowercaseAttrName=True, **kwargs): - HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet, - lowercaseElementName, lowercaseAttrName, - **kwargs) - - def __iter__(self): - for token in HTMLTokenizer.__iter__(self): - token = self.sanitize_token(token) - if token: - yield token diff --git a/setup.py b/setup.py index 6c627a38..39fbb370 100644 --- a/setup.py +++ 
b/setup.py @@ -15,9 +15,8 @@ install_requires = [ 'six', - # 3 9s up to but not including 8 9s, but not 4 9s or 5 9s because they're - # busted - 'html5lib>=0.999,!=0.9999,!=0.99999,<0.99999999', + # >= 8 9s because of breaking API change + 'html5lib>=0.99999999', ] diff --git a/tests/test_basics.py b/tests/test_basics.py index 07d4d918..8e293ca6 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -1,5 +1,6 @@ -import six import html5lib +import pytest +import six import bleach @@ -234,6 +235,7 @@ def test_wildcard_attributes(): assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean +@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.""" dirty = 'Yeah right ' @@ -242,8 +244,8 @@ def test_sarcasm(): def test_user_defined_protocols_valid(): - valid_href = 'allowed href' - assert bleach.clean(valid_href, protocols=['my_protocol']) == valid_href + valid_href = 'allowed href' + assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href def test_user_defined_protocols_invalid(): diff --git a/tests/test_css.py b/tests/test_css.py index 0b92f40b..3d224fac 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -8,7 +8,7 @@ clean = partial(clean, tags=['p'], attributes=['style']) -@pytest.mark.parametrize('data,styles,expected', [ +@pytest.mark.parametrize('data, styles, expected', [ ( 'font-family: Arial; color: red; float: left; background-color: red;', ['color'], @@ -91,24 +91,40 @@ def test_valid_css(): def test_style_hang(): """The sanitizer should not hang on any inline styles""" - # TODO: Neaten this up. 
It's copypasta from MDN/Kuma to repro the bug - style = ("""margin-top: 0px; margin-right: 0px; margin-bottom: 1.286em; """ - """margin-left: 0px; padding-top: 15px; padding-right: 15px; """ - """padding-bottom: 15px; padding-left: 15px; border-top-width: """ - """1px; border-right-width: 1px; border-bottom-width: 1px; """ - """border-left-width: 1px; border-top-style: dotted; """ - """border-right-style: dotted; border-bottom-style: dotted; """ - """border-left-style: dotted; border-top-color: rgb(203, 200, """ - """185); border-right-color: rgb(203, 200, 185); """ - """border-bottom-color: rgb(203, 200, 185); border-left-color: """ - """rgb(203, 200, 185); background-image: initial; """ - """background-attachment: initial; background-origin: initial; """ - """background-clip: initial; background-color: """ - """rgb(246, 246, 242); overflow-x: auto; overflow-y: auto; """ - """font: normal normal normal 100%/normal 'Courier New', """ - """'Andale Mono', monospace; background-position: initial """ - """initial; background-repeat: initial initial;""") - html = '

Hello world

'.format(style) + style = [ + 'margin-top: 0px;', + 'margin-right: 0px;', + 'margin-bottom: 1.286em;', + 'margin-left: 0px;', + 'padding-top: 15px;', + 'padding-right: 15px;', + 'padding-bottom: 15px;', + 'padding-left: 15px;', + 'border-top-width: 1px;', + 'border-right-width: 1px;', + 'border-bottom-width: 1px;', + 'border-left-width: 1px;', + 'border-top-style: dotted;', + 'border-right-style: dotted;', + 'border-bottom-style: dotted;', + 'border-left-style: dotted;', + 'border-top-color: rgb(203, 200, 185);', + 'border-right-color: rgb(203, 200, 185);', + 'border-bottom-color: rgb(203, 200, 185);', + 'border-left-color: rgb(203, 200, 185);', + 'background-image: initial;', + 'background-attachment: initial;', + 'background-origin: initial;', + 'background-clip: initial;', + 'background-color: rgb(246, 246, 242);', + 'overflow-x: auto;', + 'overflow-y: auto;', + # FIXME(willkg): This fails the first regxp gauntlet in sanitize_css. + # 'font: italic small-caps bolder condensed 16px/3 cursive;', + 'background-position: initial initial;', + 'background-repeat: initial initial;' + ] + html = '

Hello world

' % ' '.join(style) styles = [ 'border', 'float', 'overflow', 'min-height', 'vertical-align', 'white-space', @@ -120,12 +136,18 @@ def test_style_hang(): 'font', 'font-size', 'font-weight', 'text-align', 'text-transform', ] - expected = ("""

""" - """Hello world

""") + expected = ( + '

Hello world

' + ) assert clean(html, styles=styles) == expected diff --git a/tests/test_links.py b/tests/test_links.py index ac38ee70..6b7a77eb 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -3,7 +3,7 @@ except ImportError: from urllib import quote_plus -from html5lib.tokenizer import HTMLTokenizer +# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC @@ -406,6 +406,7 @@ def test_end_of_clause(): ) +@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.""" assert linkify('Yeah right ') == 'Yeah right <sarcasm/>' @@ -498,6 +499,7 @@ def test_ports(data, expected_data): assert linkify(data) == out.format(*expected_data) +@pytest.mark.xfail(reason='html5lib >= 0.99999999: no access to tokenizer') def test_tokenizer(): """Linkify doesn't always have to sanitize.""" raw = 'test' diff --git a/tests/test_security.py b/tests/test_security.py index 6ffaf449..4fb30207 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -74,7 +74,9 @@ def test_invalid_href_attr(): def test_invalid_filter_attr(): IMG = ['img', ] - IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"} + IMG_ATTR = { + 'img': lambda n, v: n == 'src' and v == "http://example.com/" + } assert ( clean('', tags=IMG, attributes=IMG_ATTR) == @@ -145,7 +147,11 @@ def test_feed_protocol(): def get_tests(): - """Retrieves regression tests from data/ directory""" + """Retrieves regression tests from data/ directory + + :returns: list of ``(filename, filedata)`` tuples + + """ datadir = os.path.join(os.path.dirname(__file__), 'data') tests = [ os.path.join(datadir, fn) for fn in os.listdir(datadir) @@ -153,19 +159,23 @@ def get_tests(): ] # Sort numerically which makes it easier to iterate through them tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0])) - return tests + + testcases = [ + (fn, open(fn, 'r').read()) for fn in tests + ] 
+ + return testcases -@pytest.mark.parametrize('fn', get_tests()) -def test_regressions(fn): +@pytest.mark.parametrize('fn, text', get_tests()) +def test_regressions(fn, text): """Regression tests for clean so we can see if there are issues""" - s = open(fn, 'r').read() expected = six.text_type(open(fn + '.out', 'r').read()) # NOTE(willkg): This strips input and expected which makes it easier to # maintain the files. If there comes a time when the input needs whitespace # at the beginning or end, then we'll have to figure out something else. - assert clean(s.strip()) == expected.strip() + assert clean(text.strip()) == expected.strip() def test_regression_manually(): diff --git a/tox.ini b/tox.ini index 09ed488f..53c175c9 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py{27,33,34,35,36}-html5lib{999,999999,9999999},pypy-html5lib9999999 +envlist = py{27,33,34,35,36}-html5lib{99999999,999999999},pypy-html5lib99999999 [testenv] basepython = @@ -15,8 +15,7 @@ basepython = py36: python3.6 deps = -rrequirements.txt - html5lib999: html5lib==0.999 - html5lib999999: html5lib==0.999999 - html5lib9999999: html5lib==0.9999999 + html5lib99999999: html5lib==0.99999999 + html5lib999999999: html5lib==0.999999999 commands = py.test {posargs:-v} From 3db588a5ed8b43da8324547b23ad5670b459ee9e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 11:45:01 -0500 Subject: [PATCH 047/314] Update security regression tests I also moved test_nasty to the regression tests because it's just like those. 
--- tests/data/14.test.out | 2 +- tests/data/15.test.out | 2 +- tests/data/16.test.out | 2 +- tests/data/17.test.out | 2 +- tests/data/3.test.out | 2 +- tests/data/4.test | 1 + tests/data/4.test.out | 1 + tests/data/5.test.out | 2 +- tests/data/9.test.out | 2 +- tests/test_security.py | 35 +++++++++++++++++------------------ 10 files changed, 26 insertions(+), 25 deletions(-) create mode 100644 tests/data/4.test create mode 100644 tests/data/4.test.out diff --git a/tests/data/14.test.out b/tests/data/14.test.out index 16445739..8e5ff754 100644 --- a/tests/data/14.test.out +++ b/tests/data/14.test.out @@ -1 +1 @@ -<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr>#116;:a \ No newline at end of file +<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=&#106;&#97;&#118;&#97;&<wbr> \ No newline at end of file diff --git a/tests/data/15.test.out b/tests/data/15.test.out index 334f916b..8b90245f 100644 --- a/tests/data/15.test.out +++ b/tests/data/15.test.out @@ -1 +1 @@ -le&<wbr>#114;t('XS<wbr>;S')> \ No newline at end of file +le&<wbr></wbr>#114;t('XS<wbr></wbr>;S')> \ No newline at end of file diff --git a/tests/data/16.test.out b/tests/data/16.test.out index 9c6ca965..1ecb332b 100644 --- a/tests/data/16.test.out +++ b/tests/data/16.test.out @@ -1 +1 @@ -<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr>#0000099ri&<wbr>#0000112t:&<wbr>#0000097le&<wbr>#0000114t(&<wbr>#0000039XS&<wbr>#0000083')> \ No newline at end of file +<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr></wbr>#0000099ri&<wbr></wbr>#0000112t:&<wbr></wbr>#0000097le&<wbr></wbr>#0000114t(&<wbr></wbr>#0000039XS&<wbr></wbr>#0000083')></imgsrc=&#0000106&#0000097&<wbr> \ No newline at end of file diff --git a/tests/data/17.test.out b/tests/data/17.test.out index dabfaa2d..ae928a99 100644 --- a/tests/data/17.test.out +++ b/tests/data/17.test.out @@ -1 +1 @@ -<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr>#x61lert(&<wbr>#x27XSS')> \ No newline at end of file 
+<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr></wbr>#x61lert(&<wbr></wbr>#x27XSS')></imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr> \ No newline at end of file diff --git a/tests/data/3.test.out b/tests/data/3.test.out index 20c3d0d4..f0d69629 100644 --- a/tests/data/3.test.out +++ b/tests/data/3.test.out @@ -1 +1 @@ ->"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> \ No newline at end of file +>"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)></img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> \ No newline at end of file diff --git a/tests/data/4.test b/tests/data/4.test new file mode 100644 index 00000000..c4cf51cd --- /dev/null +++ b/tests/data/4.test @@ -0,0 +1 @@ +ipt type="text/javascript">alert("foo");script> diff --git a/tests/data/4.test.out b/tests/data/4.test.out new file mode 100644 index 00000000..88ea86b2 --- /dev/null +++ b/tests/data/4.test.out @@ -0,0 +1 @@ +<scr<script>ipt type="text/javascript">alert("foo");script<del></del>></scr<script> diff --git a/tests/data/5.test.out b/tests/data/5.test.out index 1782eafb..0d88a88a 100644 --- a/tests/data/5.test.out +++ b/tests/data/5.test.out @@ -1 +1 @@ ->%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22> \ No newline at end of file +>%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22></img%20src%3d%22javascript:alert(%27%20xss%27)%22> \ No newline at end of file diff --git a/tests/data/9.test.out b/tests/data/9.test.out index 3a4d9b6c..5c5eb6ba 100644 --- a/tests/data/9.test.out +++ b/tests/data/9.test.out @@ -1 +1 @@ -'';!--"<xss>=&{()} \ No newline at end 
of file +'';!--"<xss>=&{()}</xss> \ No newline at end of file diff --git a/tests/test_security.py b/tests/test_security.py index 4fb30207..356b1292 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -22,7 +22,7 @@ def test_nested_script_tag(): def test_nested_script_tag_r(): assert ( clean('>evil()>') == - '<script<script>>evil()</script<>>' + '<script<script>>evil()></script<script>' ) @@ -90,8 +90,11 @@ def test_invalid_filter_attr(): def test_invalid_tag_char(): assert ( - clean('') == - '<script xss="" src="http://xx.com/xss.js"></script>' + clean('') in + [ + '<script src="http://xx.com/xss.js" xss=""></script>', + '<script xss="" src="http://xx.com/xss.js"></script>' + ] ) assert ( clean('') == @@ -102,15 +105,21 @@ def test_invalid_tag_char(): def test_unclosed_tag(): assert ( clean('ipt type="text/javascript">alert("foo");script>') - expect = ('<scr<script></script>ipt type="text/javascript"' - '>alert("foo");</script>script<del></del>' - '>') - assert clean(test) == expect - - def test_poster_attribute(): """Poster attributes should not allow javascript.""" tags = ['video'] From 02facd34b02cd9d9e16b547e07b07ddd3db40455 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 13:23:00 -0500 Subject: [PATCH 048/314] Minor code cleanup * address FIXMEs * minor cleanup to code and comments --- bleach/__init__.py | 8 ++++---- bleach/sanitizer.py | 14 ++++++-------- tests/test_basics.py | 15 ++++++++++----- tests/test_css.py | 6 +++--- tests/test_links.py | 9 --------- tests/test_security.py | 2 +- 6 files changed, 24 insertions(+), 30 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index c54dc72b..d1a82cde 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -120,9 +120,11 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, :arg strip: whether or not to strip disallowed elements :arg strip_comments: whether or not to strip HTML comments + :returns: cleaned text as unicode + """ if not 
text: - return '' + return u'' text = force_unicode(text) @@ -152,7 +154,6 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - # FIXME(willkg): parse_email=False, tokenizer=HTMLSanitizer): parse_email=False): """Convert URL-like strings in an HTML fragment to links @@ -170,9 +171,8 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, text = force_unicode(text) if not text: - return '' + return u'' - # FIXME(willkg): parser = html5lib.HTMLParser(tokenizer=tokenizer) parser = html5lib.HTMLParser() forest = parser.parseFragment(text) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index fb502b85..0701def2 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -63,9 +63,7 @@ def allow_token(self, token): attrs = {} for namespaced_name, val in token['data'].items(): namespace, name = namespaced_name - # FIXME(willkg): "name" used to be something like "xlink:href" - # but it's now (namespace['xlink'], 'href'). we should fix the - # name here so it's what the callable would expect. + if callable(allowed_attributes): if allowed_attributes(name, val): attrs[namespaced_name] = val @@ -73,12 +71,14 @@ def allow_token(self, token): elif name in allowed_attributes: attrs[namespaced_name] = val - # Go through all the uri-type attributes + # Handle attributes that have uri values for attr in self.attr_val_is_uri: if attr not in attrs: continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower() + # Remove replacement characters from unescaped characters. val_unescaped = val_unescaped.replace("\ufffd", "") @@ -87,17 +87,15 @@ def allow_token(self, token): # It has a protocol, but it's not allowed--so drop it del attrs[attr] - # FIXME(willkg): is this right? for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', unescape(attrs[attr])) - # FIXME(willkg): is this right? 
if (token['name'] in self.svg_allow_local_href and - (namespace['xlink'], 'href') in attrs and - re.search(r'^\s*[^#\s].*', attrs[(namespace['xlink'], 'href')])): + (namespaces['xlink'], 'href') in attrs and + re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): del attrs[(namespace['xlink'], 'href')] # Sanitize css in style attribute diff --git a/tests/test_basics.py b/tests/test_basics.py index 8e293ca6..49148592 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -71,12 +71,17 @@ def test_function_arguments(): def test_named_arguments(): ATTRS = {'a': ['rel', 'href']} - s = ('xx.com', - 'xx.com') - assert bleach.clean(s[0]) == 'xx.com' - # FIXME: This might not be needed if attribute order is stable now. - assert bleach.clean(s[0], attributes=ATTRS) in s + text = 'xx.com' + + assert bleach.clean(text) == 'xx.com' + assert ( + bleach.clean(text, attributes=ATTRS) in + [ + 'xx.com', + 'xx.com' + ] + ) def test_disallowed_html(): diff --git a/tests/test_css.py b/tests/test_css.py index 3d224fac..d8880d78 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -119,8 +119,7 @@ def test_style_hang(): 'background-color: rgb(246, 246, 242);', 'overflow-x: auto;', 'overflow-y: auto;', - # FIXME(willkg): This fails the first regxp gauntlet in sanitize_css. - # 'font: italic small-caps bolder condensed 16px/3 cursive;', + 'font: italic small-caps bolder condensed 16px/3 cursive;', 'background-position: initial initial;', 'background-repeat: initial initial;' ] @@ -146,7 +145,8 @@ def test_style_hang(): 'padding-right: 15px; ' 'padding-bottom: 15px; ' 'padding-left: 15px; ' - 'background-color: rgb(246, 246, 242);' + 'background-color: rgb(246, 246, 242); ' + 'font: italic small-caps bolder condensed 16px/3 cursive;' '">Hello world

' ) diff --git a/tests/test_links.py b/tests/test_links.py index 6b7a77eb..53d60e5c 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -3,7 +3,6 @@ except ImportError: from urllib import quote_plus -# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC @@ -499,14 +498,6 @@ def test_ports(data, expected_data): assert linkify(data) == out.format(*expected_data) -@pytest.mark.xfail(reason='html5lib >= 0.99999999: no access to tokenizer') -def test_tokenizer(): - """Linkify doesn't always have to sanitize.""" - raw = 'test' - assert linkify(raw) == 'test<x></x>' - assert linkify(raw, tokenizer=HTMLTokenizer) == raw - - def test_ignore_bad_protocols(): assert ( linkify('foohttp://bar') == diff --git a/tests/test_security.py b/tests/test_security.py index 356b1292..2aac0200 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -75,7 +75,7 @@ def test_invalid_href_attr(): def test_invalid_filter_attr(): IMG = ['img', ] IMG_ATTR = { - 'img': lambda n, v: n == 'src' and v == "http://example.com/" + 'img': lambda attr, val: attr == 'src' and val == "http://example.com/" } assert ( From 10852231012ae3eece9c0a2af5c6c7c8e2e5212f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 15:37:00 -0500 Subject: [PATCH 049/314] Update CHANGES and docs --- CHANGES | 19 +++++++++++++++---- README.rst | 2 -- docs/clean.rst | 5 ++++- docs/conf.py | 3 +-- docs/dev.rst | 4 ++-- docs/goals.rst | 4 ++-- docs/linkify.rst | 17 +---------------- 7 files changed, 25 insertions(+), 29 deletions(-) diff --git a/CHANGES b/CHANGES index ec3bc992..e8e49a8d 100644 --- a/CHANGES +++ b/CHANGES @@ -8,10 +8,19 @@ Version 2.0 (in development) - Removed support for Python 2.6. #206 - Removed support for Python 3.2. #224 +- Bleach no longer supports html5lib < 0.99999999 (8 9s). 
+ + This version represents a rewrite to use the new sanitizing API since + the old one was dropped in html5lib 0.99999999 (8 9s). + +- linkify no longer accepts a tokenizer argument. +- clean output is different than in previous versions; particularly this version + will add end tags even if the tag will be escaped. **Changes** -- Added testing for Python 3.6. +- Supports Python 3.6. +- Supports html5lib >= 0.99999999 (8 9s). Version 1.5 (November 4th, 2016) @@ -20,9 +29,11 @@ Version 1.5 (November 4th, 2016) **Backwards incompatible changes** - clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and - mailto. Previously it was a long list of protocols something like ed2k, ftp, - http, https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, - feed, urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149 + mailto. + + Previously it was a long list of protocols something like ed2k, ftp, http, + https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, feed, + urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149 **Changes** diff --git a/README.rst b/README.rst index 8f9ce05d..3bd87573 100644 --- a/README.rst +++ b/README.rst @@ -101,5 +101,3 @@ The simplest way to use Bleach is: .. _GitHub: https://github.com/mozilla/bleach .. _ReadTheDocs: https://bleach.readthedocs.io/ .. _PyPI: http://pypi.python.org/pypi/bleach - - diff --git a/docs/clean.rst b/docs/clean.rst index ebd82055..a988a81a 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -162,7 +162,7 @@ For example, this sets allowed protocols to http, https and smb: u'allowed protocol' -This adds smb to the bleach-specified set of allowed protocols: +This adds smb to the Bleach-specified set of allowed protocols: .. doctest:: @@ -187,6 +187,7 @@ whitelist and invalid markup. For example: .. 
doctest:: >>> import bleach + >>> bleach.clean('is not allowed') u'<span>is not allowed</span>' >>> bleach.clean('is not allowed', tags=['b']) @@ -199,6 +200,7 @@ If you would rather Bleach stripped this markup entirely, you can pass .. doctest:: >>> import bleach + >>> bleach.clean('is not allowed', strip=True) u'is not allowed' >>> bleach.clean('is not allowed', tags=['b'], strip=True) @@ -214,6 +216,7 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set .. doctest:: >>> import bleach + >>> html = 'my html' >>> bleach.clean(html) diff --git a/docs/conf.py b/docs/conf.py index 00b9c239..e186c827 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,8 +27,7 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.pngmath', 'sphinx.ext.viewcode', - 'sphinx.ext.doctest'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.doctest'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/dev.rst b/docs/dev.rst index 027a0a76..02f8d44a 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -5,7 +5,7 @@ Bleach development Docs ==== -Docs are in ``docs/``. We use Sphinx. Docs are pushed to readthedocs +Docs are in ``docs/``. We use Sphinx. Docs are pushed to ReadTheDocs via a GitHub webhook. @@ -16,7 +16,7 @@ Run:: $ tox -That'll run bleach tests in all the supported Python environments. Note +That'll run Bleach tests in all the supported Python environments. Note that you need the necessary Python binaries for them all to be tested. Tests are run in Travis CI via a GitHub webhook. 
diff --git a/docs/goals.rst b/docs/goals.rst index 01f63a94..632c222c 100644 --- a/docs/goals.rst +++ b/docs/goals.rst @@ -91,10 +91,10 @@ Make malicious content look pretty or sane ------------------------------------------ Malicious content is designed to be malicious. Making it safe is a design goal -of bleach. Making it pretty or sane-looking is not. +of Bleach. Making it pretty or sane-looking is not. If you want your malicious content to look pretty, you should pass it through -bleach to make it safe and then do your own transform afterwards. +Bleach to make it safe and then do your own transform afterwards. Allow arbitrary styling diff --git a/docs/linkify.rst b/docs/linkify.rst index b7449c34..705000c2 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -9,7 +9,7 @@ control how and when those links are rendered:: def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - parse_email=False, tokenizer=HTMLSanitizer): + parse_email=False): """Convert URL-like strings in an HTML fragment to links. ``linkify()`` works by building a document tree, so it's guaranteed never to do @@ -194,19 +194,4 @@ they are newly created or already in the text, so be careful when writing callbacks that may need to behave differently if the protocol is ``mailto:``. -``tokenizer`` -============= - -``linkify()`` uses the ``html5lib.sanitizer.HTMLSanitizer`` tokenizer by -default. This has the effect of scrubbing some tags and attributes. To use a -more lenient, or totally different, tokenizer, you can specify the tokenizer -class here. (See the implementation of :ref:`clean() ` for an -example of building a custom tokenizer.) - -:: - - from html5lib.tokenizer import HTMLTokenizer - linked_text = linkify(text, tokenizer=HTMLTokenizer) - - .. 
_Crate: https://crate.io/ From 066631af96ee9c16ddcf9d132bd8537a4af17da6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 20:49:21 -0500 Subject: [PATCH 050/314] Update .travis.yml --- .travis.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4e66cf1d..318dfa7d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,9 +11,8 @@ python: - "3.6" - "pypy" env: -- HTML5LIB=0.999 # 3 -- HTML5LIB=0.999999 # 6 -- HTML5LIB=0.9999999 # 7 +- HTML5LIB=0.99999999 # 8 +- HTML5LIB=0.999999999 # 9 install: - pip install -r requirements.txt - pip install html5lib==$HTML5LIB From 30772dd20f16a8ec75f7c1ddb7b76c9ff6fd97d2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 20:57:13 -0500 Subject: [PATCH 051/314] Another .travis.yml fix --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 318dfa7d..a5db65b1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ env: - HTML5LIB=0.99999999 # 8 - HTML5LIB=0.999999999 # 9 install: +- pip install -U pip - pip install -r requirements.txt - pip install html5lib==$HTML5LIB script: From 85cc802584f4290556b14e50384c2ed6bb6959b6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 21:05:59 -0500 Subject: [PATCH 052/314] One more .travis.yml fix --- .travis.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a5db65b1..14015378 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,9 +14,11 @@ env: - HTML5LIB=0.99999999 # 8 - HTML5LIB=0.999999999 # 9 install: -- pip install -U pip -- pip install -r requirements.txt -- pip install html5lib==$HTML5LIB + # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5 + - pip install -U pip setuptools>=18.5 + - pip install -r requirements.txt + # stomp on html5lib install with the specified one + - pip install html5lib==$HTML5LIB script: - py.test - flake8 bleach/ From 
4cca43b5f41a4cc157fa2e6f5dd5185f70cb7a6c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 21:10:40 -0500 Subject: [PATCH 053/314] Fix flake8 issues --- bleach/sanitizer.py | 4 ++-- setup.cfg | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 0701def2..68438e9a 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals import re -from xml.sax.saxutils import escape, unescape +from xml.sax.saxutils import unescape from html5lib.constants import namespaces from html5lib.filters import sanitizer @@ -83,7 +83,7 @@ def allow_token(self, token): val_unescaped = val_unescaped.replace("\ufffd", "") if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and - (val_unescaped.split(':')[0] not in self.allowed_protocols)): + (val_unescaped.split(':')[0] not in self.allowed_protocols)): # It has a protocol, but it's not allowed--so drop it del attrs[attr] diff --git a/setup.cfg b/setup.cfg index f3a416e4..950364a7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,6 +3,7 @@ test=pytest [flake8] ignore = E731,W503 +max-line-length = 100 [wheel] universal=1 From 49f35dd27317019e47febed1c0c507177297a8af Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 12:21:38 -0500 Subject: [PATCH 054/314] Add tests, fix alphabetizing, code cleanup * this adds some missing tests to add more coverage * html5lib 0.99999999 and 0.999999999 have an alphabeticalattributes filter that doesn't work when the attributes set has some items with a namespace and some without in Python 3; this rolls alphabetizing into the Bleach sanitizer * remove some dead code and clean some other code up --- bleach/__init__.py | 17 ++++----- bleach/sanitizer.py | 43 +++++++++++++++++----- tests/test_basics.py | 86 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 17 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 
d1a82cde..ae96a925 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -61,8 +61,6 @@ # Make sure that .com doesn't get matched by .co first TLDS.reverse() -PROTOCOLS = allowed_protocols - url_re = re.compile( r"""\(* # Match any opening parentheses. \b(?"]*)? # /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) - """.format('|'.join(PROTOCOLS), '|'.join(TLDS)), + """.format('|'.join(allowed_protocols), '|'.join(TLDS)), re.IGNORECASE | re.VERBOSE | re.UNICODE) proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) @@ -87,8 +85,6 @@ """, re.IGNORECASE | re.MULTILINE | re.VERBOSE) -NODE_TEXT = 4 # The numeric ID of a text node in simpletree. - ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) # a simple routine that returns the tag name with the namespace prefix # as returned by etree's Element.tag attribute @@ -147,8 +143,13 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, ) s = HTMLSerializer( quote_attr_values='always', - alphabetical_attributes=True, - omit_optional_tags=False + omit_optional_tags=False, + + # Bleach has its own sanitizer, so don't use the html5lib one + sanitize=False, + + # Bleach sanitizer alphabetizes already, so don't use the html5lib one + alphabetical_attributes=False, ) return s.render(filtered) @@ -176,7 +177,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parser = html5lib.HTMLParser() forest = parser.parseFragment(text) - _seen = set([]) + _seen = set() def replace_nodes(tree, new_frag, node, index=0): """Doesn't really replace nodes, but inserts the nodes contained in diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 68438e9a..c12b8d24 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +from collections import OrderedDict import re from xml.sax.saxutils import unescape @@ -6,6 +7,19 @@ from html5lib.filters import sanitizer +def _attr_key(attr): + 
"""Returns appropriate key for sorting attribute names + + Attribute names are a tuple of ``(namespace, name)`` where namespace can be + ``None`` or a string. These can't be compared in Python 3, so we conver the + ``None`` to an empty string. + + """ + key = (attr[0][0] or ''), attr[0][1] + print(key) + return key + + class BleachSanitizerFilter(sanitizer.Filter): def __init__(self, source, allowed_attributes_map, strip_disallowed_elements=False, strip_html_comments=True, @@ -87,22 +101,33 @@ def allow_token(self, token): # It has a protocol, but it's not allowed--so drop it del attrs[attr] + # Drop values in svg attrs with non-local IRIs for attr in self.svg_attr_val_allows_ref: if attr in attrs: - attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', - ' ', - unescape(attrs[attr])) - - if (token['name'] in self.svg_allow_local_href and - (namespaces['xlink'], 'href') in attrs and - re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): - del attrs[(namespace['xlink'], 'href')] + new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + new_val = new_val.strip() + if not new_val: + del attrs[attr] + else: + attrs[attr] = new_val + + # Drop href and xlink:href attr for svg elements with non-local IRIs + if (None, token['name']) in self.svg_allow_local_href: + for href_attr in [(None, 'href'), (namespaces['xlink'], 'href')]: + if href_attr in attrs: + if re.search(r'^\s*[^#\s]', attrs[href_attr]): + del attrs[href_attr] # Sanitize css in style attribute if (None, u'style') in attrs: attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')]) - token['data'] = attrs + # Alphabetize attributes + token['data'] = OrderedDict( + [(key, val) for key, val in sorted(attrs.items(), key=_attr_key)] + ) return token def sanitize_css(self, style): diff --git a/tests/test_basics.py b/tests/test_basics.py index 49148592..bae1506e 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -240,6 +240,92 @@ def 
test_wildcard_attributes(): assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean +def test_callable_attributes(): + """Verify callable attributes work and get correct arg values""" + def img_test(attr, val): + return attr == 'src' and val.startswith('https') + + ATTR = { + 'img': img_test, + } + TAGS = ['img'] + + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + + +def test_svg_attr_val_allows_ref(): + """Unescape values in svg attrs that allow url references""" + # Local IRI, so keep it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + # Non-local IRI, so drop it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + # NOTE(willkg): Bug in html5lib serializer drops the xlink part + '' + ), +]) +def test_svg_allow_local_href(text, expected): + """Keep local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + '' + ), +]) +def test_svg_allow_local_href_nonlocal(text, expected): + """Drop non-local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + + + @pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.""" From 70d96e390c1525d8508436d8582daa49357fdf3e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 12:49:12 -0500 Subject: [PATCH 055/314] Alphabetize before escaping 
disallowed tokens Making this change means the output is stable since attributes will always happen in the same order. Seems like maybe it's not a great idea, but stable seems good. If it turns out this is terrible, someone will complain with a compelling use case and we can undo it. I also went through and removed a bunch of the "the output is either this or that" in the tests. --- bleach/sanitizer.py | 6 ++++++ tests/test_basics.py | 31 +++++++++++++------------------ tests/test_unicode.py | 7 ++----- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index c12b8d24..789d89e6 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -58,6 +58,12 @@ def sanitize_token(self, token): pass else: + if 'data' in token: + # Alphabetize the attributes before calling .disallowed_token() + # so that the resulting string is stable + token['data'] = OrderedDict( + [(key, val) for key, val in sorted(token['data'].items(), key=_attr_key)] + ) return self.disallowed_token(token) elif token_type == 'Comment': diff --git a/tests/test_basics.py b/tests/test_basics.py index bae1506e..c42ccc62 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -76,11 +76,8 @@ def test_named_arguments(): assert bleach.clean(text) == 'xx.com' assert ( - bleach.clean(text, attributes=ATTRS) in - [ - 'xx.com', - 'xx.com' - ] + bleach.clean(text, attributes=ATTRS) == + 'xx.com' ) @@ -199,25 +196,22 @@ def test_idempotent(): clean = bleach.clean(dirty) assert bleach.clean(clean) == clean - possible_outs = ( - 'invalid & < extra http://link.com', + linked = bleach.linkify(dirty) + assert ( + bleach.linkify(linked) == 'invalid & < extra http://link.com' ) - linked = bleach.linkify(dirty) - assert bleach.linkify(linked) in possible_outs def test_rel_already_there(): """Make sure rel attribute is updated not replaced""" linked = ('Click ' 'here.') - link_good = (('Click ' - 'here.'), - ('Click ' - 'here.')) - assert bleach.linkify(linked) 
in link_good - assert bleach.linkify(link_good[0]) in link_good + link_good = 'Click here.' + + assert bleach.linkify(linked) == link_good + assert bleach.linkify(link_good) == link_good def test_lowercase_html(): @@ -235,9 +229,10 @@ def test_wildcard_attributes(): TAG = ['img', 'em'] dirty = ('both can have ' '') - clean = ('both can have ', - 'both can have ') - assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean + assert ( + bleach.clean(dirty, tags=TAG, attributes=ATTR) == + 'both can have ' + ) def test_callable_attributes(): diff --git a/tests/test_unicode.py b/tests/test_unicode.py index b8b670e8..08ab3f4e 100644 --- a/tests/test_unicode.py +++ b/tests/test_unicode.py @@ -27,11 +27,8 @@ def test_mixed(): def test_mixed_linkify(): assert ( - linkify('Домашняя http://example.com ヘルプとチュートリアル') in - ( - 'Домашняя http://example.com ヘルプとチュートリアル', - 'Домашняя http://example.com ヘルプとチュートリアル' - ) + linkify('Домашняя http://example.com ヘルプとチュートリアル') == + 'Домашняя http://example.com ヘルプとチュートリアル' ) From 86dea93c17d96a237acd2543264ba27d83f15d79 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 13:09:20 -0500 Subject: [PATCH 056/314] Cosmetic fix for readability --- bleach/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index ae96a925..a645ac7e 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -130,16 +130,18 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, walker = html5lib.getTreeWalker('etree') filtered = BleachSanitizerFilter( source=walker(dom), + + # Bleach-sanitizer-specific things allowed_attributes_map=attributes, + strip_disallowed_elements=strip, + strip_html_comments=strip_comments, + # html5lib-sanitizer things allowed_elements=tags, allowed_css_properties=styles, allowed_protocols=protocols, - allowed_svg_properties=[], - strip_disallowed_elements=strip, - strip_html_comments=strip_comments ) s = HTMLSerializer( 
quote_attr_values='always', From 14ce11293e9f74009e4c9e55bec3039ac8106540 Mon Sep 17 00:00:00 2001 From: Greg Guthe Date: Wed, 22 Feb 2017 19:37:49 -0500 Subject: [PATCH 057/314] add test website and scripts --- website/.gitignore | 1 + website/README.txt | 16 +++++++++ website/data_to_json.py | 53 ++++++++++++++++++++++++++++ website/index.html | 74 +++++++++++++++++++++++++++++++++++++++ website/open_test_page.py | 34 ++++++++++++++++++ website/server.py | 42 ++++++++++++++++++++++ 6 files changed, 220 insertions(+) create mode 100644 website/.gitignore create mode 100644 website/README.txt create mode 100755 website/data_to_json.py create mode 100644 website/index.html create mode 100755 website/open_test_page.py create mode 100755 website/server.py diff --git a/website/.gitignore b/website/.gitignore new file mode 100644 index 00000000..765417a3 --- /dev/null +++ b/website/.gitignore @@ -0,0 +1 @@ +testcases.json diff --git a/website/README.txt b/website/README.txt new file mode 100644 index 00000000..2223e7ab --- /dev/null +++ b/website/README.txt @@ -0,0 +1,16 @@ +Scripts for a Bleach demo/test website + +Usage: + +from the project root: + +# generate testcases.json +python website/data_to_json.py tests/data > testcases.json + +# run the test server +cd website && python server.py & + +# open the page in browsers python can find +python open_test_page.py + +# inspect bleached html and iframe diff --git a/website/data_to_json.py b/website/data_to_json.py new file mode 100755 index 00000000..ffd346f5 --- /dev/null +++ b/website/data_to_json.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +""" +Util to write a directory of test cases with input filenames +.test and output filenames .test.out as JSON to +stdout. 
+ +example: + +python tests/data_to_json.py tests/data > testcases.json +""" + +import argparse +import fnmatch +import json +import os +import os.path + +import bleach + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('data_dir', + help='directory containing test cases with input files' + ' named .test and output .test.out') + + args = parser.parse_args() + + filenames = os.listdir(args.data_dir) + ins = [os.path.join(args.data_dir, f) for f in filenames if fnmatch.fnmatch(f, '*.test')] + outs = [os.path.join(args.data_dir, f) for f in filenames if fnmatch.fnmatch(f, '*.test.out')] + + testcases = [] + for infn, outfn in zip(ins, outs): + case_name = infn.rsplit('.test', 1)[0] + + with open(infn, 'r') as fin, open(outfn, 'r') as fout: + payload = fin.read()[:-1] + testcases.append({ + "title": case_name, + "input_filename": infn, + "output_filename": outfn, + "payload": payload, + "actual": bleach.clean(payload), + "expected": fout.read(), + }) + + print(json.dumps(testcases, indent=4, sort_keys=True)) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/website/index.html b/website/index.html new file mode 100644 index 00000000..6ff43871 --- /dev/null +++ b/website/index.html @@ -0,0 +1,74 @@ + + + + + Python Bleach 2.0.0 + + +

Python Bleach 2.0.0

+

+ pypi version + Build Status +

+

+ This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. + The textarea below contains sample-payload - you can also add your own. Watch it sanitize in the textarea and iframe below. +

+
+

+

clean on change

+
+

+ +

+ +

+ + + + + diff --git a/website/open_test_page.py b/website/open_test_page.py new file mode 100755 index 00000000..b812de92 --- /dev/null +++ b/website/open_test_page.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +import webbrowser + +TEST_BROWSERS = set([ + # 'mozilla', + 'firefox', + # 'netscape', + # 'galeon', + # 'epiphany', + # 'skipstone', + # 'kfmclient', + # 'konqueror', + # 'kfm', + # 'mosaic', + # 'opera', + # 'grail', + # 'links', + # 'elinks', + # 'lynx', + # 'w3m', + 'windows-default', + # 'macosx', + 'safari', + # 'google-chrome', + 'chrome', + # 'chromium', + # 'chromium-browser', +]) +REGISTERED_BROWSERS = set(webbrowser._browsers.keys()) + +if __name__ == '__main__': + for b in TEST_BROWSERS & REGISTERED_BROWSERS: + webbrowser.get(b).open_new_tab('http://localhost:8080') diff --git a/website/server.py b/website/server.py new file mode 100755 index 00000000..83fcf84a --- /dev/null +++ b/website/server.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +""" +Simple Test/Demo Server for running bleach.clean output +on various desktops. 
+ +Usage: + +python server.py +""" + +import SimpleHTTPServer +import SocketServer +import json + +import bleach + + +PORT = 8080 + +class BleachCleanHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): + + def do_POST(self): + content_len = int(self.headers.getheader('content-length', 0)) + body = self.rfile.read(content_len) + print("read %s bytes: %s" % (content_len, body)) + cleaned = bleach.clean(body) + print("cleaned %s" % cleaned) + + self.send_response(200) + self.send_header('Content-Length', len(cleaned)) + self.send_header('Content-Type', 'text/plain;charset=UTF-8') + self.end_headers() + + self.wfile.write(cleaned) + + +if __name__ == '__main__': + SocketServer.TCPServer.allow_reuse_address = True # Prevent 'cannot bind to address' errors on restart + httpd = SocketServer.TCPServer(('127.0.0.1', PORT), BleachCleanHandler) + print("listening on localhost port %d" % PORT) + httpd.serve_forever() From 41349916a756c74ef1e249e5a93ce7e789b89a59 Mon Sep 17 00:00:00 2001 From: Greg Guthe Date: Fri, 24 Feb 2017 14:25:39 -0500 Subject: [PATCH 058/314] isolate in-browser tests; add option to insert unsafe --- website/index.html | 117 +++++++++++++++++++++++++++++++++------------ 1 file changed, 87 insertions(+), 30 deletions(-) diff --git a/website/index.html b/website/index.html index 6ff43871..6cd1ef10 100644 --- a/website/index.html +++ b/website/index.html @@ -3,6 +3,17 @@ Python Bleach 2.0.0 +

Python Bleach 2.0.0

@@ -12,39 +23,91 @@

Python Bleach 2.0.0

This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. - The textarea below contains sample-payload - you can also add your own. Watch it sanitize in the textarea and iframe below. + Enter a sample payload in the textarea below and watch it sanitize in the textarea and iframe below.


-

-

clean on change

+

+

+

clean when dirty HTML changes


-

- -

- -

- + +
+

Demo

+ +

+ +

+ +

+ +
+ + From 13dadf8e990f7773758bdac662ff0c9e87f345e0 Mon Sep 17 00:00:00 2001 From: Greg Guthe Date: Fri, 24 Feb 2017 14:59:26 -0500 Subject: [PATCH 059/314] unsafe and safe insert testcase vectors into dom --- website/index.html | 88 ++++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/website/index.html b/website/index.html index 6cd1ef10..21ece427 100644 --- a/website/index.html +++ b/website/index.html @@ -13,43 +13,54 @@ .test-case > .ifr { height: 50px; } + .test-case, .demo { + padding-bottom: 15px; + border-bottom: 2px solid gray; + } -

Python Bleach 2.0.0

+

Python Bleach 2.0.0

pypi version Build Status

-

- This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. - Enter a sample payload in the textarea below and watch it sanitize in the textarea and iframe below. -

-
-

-

-

clean when dirty HTML changes

-
-
-

Demo

+
+

Demo

+

+ This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. + Enter a sample payload in the textarea below and watch it sanitize in the textarea and iframe below. +

+ +
+

+

+

clean when dirty HTML changes

- +

- -

- + +

+
@@ -60,23 +71,29 @@

Demo

ifr.contentDocument.close(); }; - var sanitize = function() { - var xhr = new XMLHttpRequest(); - xhr.open('POST', '/sanitize'); - xhr.setRequestHeader("Content-Type", "text/plain;charset=UTF-8"); - xhr.onload = function() { - var sanitized = xhr.responseText; - document.querySelector('label[for=ifr]').textContent = "Clean DOM"; - writeToIframe(document.getElementById('ifr'), sanitized); - clean.value = sanitized; + var sanitize = function(event) { + if (event && event.target && event.target.className.indexOf("clean-and-write") !== -1) { + var grandParent = event.target.parentElement.parentElement; + var xhr = new XMLHttpRequest(); + xhr.open('POST', '/sanitize'); + xhr.setRequestHeader("Content-Type", "text/plain;charset=UTF-8"); + xhr.onload = function() { + var sanitized = xhr.responseText; + grandParent.querySelector("label.dom-label[for=ifr]").textContent = "Clean DOM"; + writeToIframe(grandParent.querySelector('.ifr'), sanitized); + grandParent.querySelector("textarea.clean").value = sanitized; + } + xhr.send(grandParent.querySelector("textarea.dirty").value); } - xhr.send(dirty.value); }; - var unsafeWrite = function() { - clean.value = "N/A"; - document.querySelector('label[for=ifr]').textContent = "Dirty DOM"; - writeToIframe(document.getElementById('ifr'), dirty.value); + var unsafeWrite = function(event) { + if (event.target.className.indexOf("unsafe-write") !== -1) { + var grandParent = event.target.parentElement.parentElement; + grandParent.querySelector("textarea.clean").value = "N/A"; + grandParent.querySelector("label.dom-label[for=ifr]").textContent = "Dirty DOM"; + writeToIframe(grandParent.querySelector('.ifr'), grandParent.querySelector("textarea.dirty").value); + } }; var addTest = function (test, index) { @@ -108,16 +125,17 @@

Demo

xhr.send(null); }; + // TODO: debounce input events? document.getElementById('dirty') .addEventListener('input', function () { var autocleanEl = document.getElementById('autoclean'); if (autocleanEl.checked) { - sanitize(); + sanitize({target: document.getElementsByClassName('clean-and-write')[0]}); } }); - document.getElementById('unsafe-write').addEventListener('click', unsafeWrite, false); - document.getElementById('clean-and-write').addEventListener('click', sanitize, false); + document.addEventListener('click', unsafeWrite, false); + document.addEventListener('click', sanitize, false); document.addEventListener('DOMContentLoaded', function() { loadTests('/testcases.json', function (responseText) { From 19be3f56ecee18c2d43c8bb32bd5b070506449b8 Mon Sep 17 00:00:00 2001 From: Vadim Kotov Date: Sun, 26 Feb 2017 15:50:18 +0400 Subject: [PATCH 060/314] Fixed pypi badge URL Capital letters dont seem to be working with badge.fury.io --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 3bd87573..403ff9b6 100644 --- a/README.rst +++ b/README.rst @@ -5,8 +5,8 @@ Bleach .. image:: https://travis-ci.org/mozilla/bleach.png?branch=master :target: https://travis-ci.org/mozilla/bleach -.. image:: https://badge.fury.io/py/Bleach.svg - :target: http://badge.fury.io/py/Bleach +.. image:: https://badge.fury.io/py/bleach.svg + :target: http://badge.fury.io/py/bleach Bleach is a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. From a2c015bf23d239deed61341694c52839a5fa68bb Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 10:22:40 -0500 Subject: [PATCH 061/314] Refactor allow_token This redoes the innards of allow_token so that it's a single loop across all the attributes in a token rather than a bunch of little passes. This has less looping, so theoretically it's more optimal, but I didn't spend any time testing that theory. 
--- bleach/sanitizer.py | 81 +++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 789d89e6..f5244b49 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -84,51 +84,60 @@ def allow_token(self, token): for namespaced_name, val in token['data'].items(): namespace, name = namespaced_name + # See if we should dump the attribute if callable(allowed_attributes): - if allowed_attributes(name, val): - attrs[namespaced_name] = val + if not allowed_attributes(name, val): + # DROP! + continue - elif name in allowed_attributes: - attrs[namespaced_name] = val - - # Handle attributes that have uri values - for attr in self.attr_val_is_uri: - if attr not in attrs: + elif name not in allowed_attributes: + # DROP! continue - val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', - unescape(attrs[attr])).lower() - - # Remove replacement characters from unescaped characters. - val_unescaped = val_unescaped.replace("\ufffd", "") - - if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and - (val_unescaped.split(':')[0] not in self.allowed_protocols)): - # It has a protocol, but it's not allowed--so drop it - del attrs[attr] - - # Drop values in svg attrs with non-local IRIs - for attr in self.svg_attr_val_allows_ref: - if attr in attrs: + # Look at attributes that have uri values + if namespaced_name in self.attr_val_is_uri: + val_unescaped = re.sub( + "[`\000-\040\177-\240\s]+", + '', + unescape(val)).lower() + + # Remove replacement characters from unescaped characters. + val_unescaped = val_unescaped.replace("\ufffd", "") + + # Drop attributes with uri values that have protocols that + # aren't allowed + if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and + (val_unescaped.split(':')[0] not in self.allowed_protocols)): + # DROP! 
+ continue + + # Drop values in svg attrs with non-local IRIs + if namespaced_name in self.svg_attr_val_allows_ref: new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', - unescape(attrs[attr])) + unescape(val)) new_val = new_val.strip() if not new_val: - del attrs[attr] + # DROP! + continue + else: - attrs[attr] = new_val - - # Drop href and xlink:href attr for svg elements with non-local IRIs - if (None, token['name']) in self.svg_allow_local_href: - for href_attr in [(None, 'href'), (namespaces['xlink'], 'href')]: - if href_attr in attrs: - if re.search(r'^\s*[^#\s]', attrs[href_attr]): - del attrs[href_attr] - - # Sanitize css in style attribute - if (None, u'style') in attrs: - attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')]) + # Replace the val with the unescaped version because + # it's a iri + val = new_val + + # Drop href and xlink:href attr for svg elements with non-local IRIs + if (None, token['name']) in self.svg_allow_local_href: + if namespaced_name in [(None, 'href'), (namespaces['xlink'], 'href')]: + if re.search(r'^\s*[^#\s]', val): + continue + + # If it's a style attribute, sanitize it + if namespaced_name == (None, u'style'): + val = self.sanitize_css(val) + + # At this point, we want to keep the attribute, so add it in + attrs[namespaced_name] = val # Alphabetize attributes token['data'] = OrderedDict( From 62eb131f48745264b1a0b1ccf5495d91349957ec Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 10:27:51 -0500 Subject: [PATCH 062/314] Fix linting error --- bleach/sanitizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index f5244b49..50fc2c55 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -141,7 +141,7 @@ def allow_token(self, token): # Alphabetize attributes token['data'] = OrderedDict( - [(key, val) for key, val in sorted(attrs.items(), key=_attr_key)] + [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)] ) return token 
From 73a205a01fef5d962ff9a9daee6ed041625ae828 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 10:50:42 -0500 Subject: [PATCH 063/314] Clean up code comments to be clearer --- bleach/sanitizer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 50fc2c55..b7162eed 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -79,19 +79,22 @@ def allow_token(self, token): if not callable(allowed_attributes): allowed_attributes += self.wildcard_attributes - # Drop any attributes that aren't allowed + # Loop through all the attributes and drop the ones that are not + # allowed, are unsafe or break other rules. Additionally, fix + # attribute values that need fixing. + # + # At the end of this loop, we have the final set of attributes + # we're keeping. attrs = {} for namespaced_name, val in token['data'].items(): namespace, name = namespaced_name - # See if we should dump the attribute + # Drop attributes that are not explicitly allowed if callable(allowed_attributes): if not allowed_attributes(name, val): - # DROP! continue elif name not in allowed_attributes: - # DROP! continue # Look at attributes that have uri values @@ -108,7 +111,6 @@ def allow_token(self, token): # aren't allowed if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols)): - # DROP! continue # Drop values in svg attrs with non-local IRIs @@ -118,7 +120,6 @@ def allow_token(self, token): unescape(val)) new_val = new_val.strip() if not new_val: - # DROP! 
continue else: @@ -143,6 +144,7 @@ def allow_token(self, token): token['data'] = OrderedDict( [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)] ) + return token def sanitize_css(self, style): From 564b040d2e66a8af0ac97e5d4b28cc4aea4f24cb Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 10:53:55 -0500 Subject: [PATCH 064/314] Add Cleaner Bleach.clean() is often used on batches of content. This allows you to create a Cleaner class that encapsulates all the clean arguments into a single instance and ald reuses html5lib parser, walker and serializer. --- bleach/__init__.py | 154 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 120 insertions(+), 34 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index a645ac7e..296448ee 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -92,6 +92,100 @@ DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] +class Cleaner(object): + """Cleaner for cleaning HTML fragments of malicious content + + This cleaner is a security-focused function whose sole purpose is to remove + malicious content from a string such that it can be displayed as content in + a web page. + + This cleaner is not designed to use to transform content to be used in + non-web-page contexts. 
+ + To use:: + + from bleach import Cleaner + + cleaner = Cleaner() + + for text in all_the_yucky_things: + sanitized = cleaner.clean(text) + + """ + + def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, + styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, + strip_comments=True): + """ + :arg tags: whitelist of allowed tags; defaults to + ``bleach.ALLOWED_TAGS`` + + :arg attributes: whitelist of allowed attributes; defaults to + ``bleach.ALLOWED_ATTRIBUTES`` + + :arg styles: whitelist of allowed css; defaults to + ``bleach.ALLOWED_STYLES`` + + :arg protocols: whitelist of allowed protocols for links; defaults + to ``bleach.ALLOWED_PROTOCOLS`` + + :arg strip: whether or not to strip disallowed elements + + :arg strip_comments: whether or not to strip HTML comments + + """ + self.tags = tags + self.attributes = attributes + self.styles = styles + self.protocols = protocols + self.strip = strip + self.strip_comments = strip_comments + + self.parser = html5lib.HTMLParser(namespaceHTMLElements=False) + self.walker = html5lib.getTreeWalker('etree') + self.serializer = HTMLSerializer( + quote_attr_values='always', + omit_optional_tags=False, + + # Bleach has its own sanitizer, so don't use the html5lib one + sanitize=False, + + # Bleach sanitizer alphabetizes already, so don't use the html5lib one + alphabetical_attributes=False, + ) + + def clean(self, text): + """Cleans text and returns sanitized result as unicode + + :arg str text: text to be cleaned + + :returns: sanitized text as unicode + + """ + if not text: + return u'' + + text = force_unicode(text) + + dom = self.parser.parseFragment(text) + filtered = BleachSanitizerFilter( + source=self.walker(dom), + + # Bleach-sanitizer-specific things + allowed_attributes_map=self.attributes, + strip_disallowed_elements=self.strip, + strip_html_comments=self.strip_comments, + + # html5lib-sanitizer things + allowed_elements=self.tags, + allowed_css_properties=self.styles, + 
allowed_protocols=self.protocols, + allowed_svg_properties=[], + ) + + return self.serializer.render(filtered) + + def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, strip_comments=True): @@ -104,56 +198,48 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, This function is not designed to use to transform content to be used in non-web-page contexts. + Example:: + + import bleach + + better_text = bleach.clean(yucky_text) + + + .. Note:: + + If you're cleaning a lot of text and passing the same argument + values, consider caching a ``Cleaner`` instance. + :arg text: the text to clean + :arg tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS`` + :arg attributes: whitelist of allowed attributes; defaults to ``bleach.ALLOWED_ATTRIBUTES`` + :arg styles: whitelist of allowed css; defaults to ``bleach.ALLOWED_STYLES`` + :arg protocols: whitelist of allowed protocols for links; defaults to ``bleach.ALLOWED_PROTOCOLS`` + :arg strip: whether or not to strip disallowed elements + :arg strip_comments: whether or not to strip HTML comments :returns: cleaned text as unicode """ - if not text: - return u'' - - text = force_unicode(text) - - parser = html5lib.HTMLParser(namespaceHTMLElements=False) - dom = parser.parseFragment(text) - - walker = html5lib.getTreeWalker('etree') - filtered = BleachSanitizerFilter( - source=walker(dom), - - # Bleach-sanitizer-specific things - allowed_attributes_map=attributes, - strip_disallowed_elements=strip, - strip_html_comments=strip_comments, - - # html5lib-sanitizer things - allowed_elements=tags, - allowed_css_properties=styles, - allowed_protocols=protocols, - allowed_svg_properties=[], - - ) - s = HTMLSerializer( - quote_attr_values='always', - omit_optional_tags=False, - - # Bleach has its own sanitizer, so don't use the html5lib one - sanitize=False, - - # Bleach sanitizer alphabetizes already, so don't use the html5lib one 
- alphabetical_attributes=False, + cleaner = Cleaner( + tags=tags, + attributes=attributes, + styles=styles, + protocols=protocols, + strip=strip, + strip_comments=strip_comments, ) - return s.render(filtered) + return cleaner.clean(text) def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, From a97f3eade7157b22b9d76ccb105f100347b0fe2f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 10:58:28 -0500 Subject: [PATCH 065/314] Add docs for Cleaner --- docs/clean.rst | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/clean.rst b/docs/clean.rst index a988a81a..63f0427e 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -5,10 +5,7 @@ ``bleach.clean()`` ================== -``clean()`` is Bleach's HTML sanitization method. - -.. autofunction:: bleach.clean - +:py:func:`bleach.clean`` is Bleach's HTML sanitization method. Given a fragment of HTML, Bleach will parse it according to the HTML5 parsing algorithm and sanitize any disallowed tags or attributes. This algorithm also @@ -19,6 +16,14 @@ takes care of things like unclosed and (some) misnested tags. always return ``unicode``. +If you're cleaning a lot of text, you might want to create a +:py:class:`bleach.Cleaner` instance. + +.. autofunction:: bleach.clean + +.. autoclass:: bleach.Cleaner + + Tag Whitelist ============= From a0b88285cf6bcf8a7050b96ba0632eecf998d6f1 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 11:11:56 -0500 Subject: [PATCH 066/314] Cosmetic: Reorganize tests This is entirely cosmetic reorganization of tests so it's easier to see test coverage for clean vs. linkify. 
--- tests/test_basics.py | 600 +++++++++++++++++++++---------------------- 1 file changed, 288 insertions(+), 312 deletions(-) diff --git a/tests/test_basics.py b/tests/test_basics.py index c42ccc62..790ad559 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -5,141 +5,289 @@ import bleach -def test_empty(): - assert bleach.clean('') == '' - - -def test_nbsp(): - if six.PY3: - expected = '\xa0test string\xa0' - else: - expected = six.u('\\xa0test string\\xa0') - - assert bleach.clean(' test string ') == expected - - -def test_comments_only(): - comment = '' - open_comment = ''.format(open_comment) - ) - - -def test_with_comments(): - html = 'Just text' - assert 'Just text', bleach.clean(html) == 'Just text' - assert bleach.clean(html, strip_comments=False) == html - - -def test_no_html(): - assert bleach.clean('no html string') == 'no html string' - - -def test_allowed_html(): - assert ( - bleach.clean('an allowed tag') == - 'an allowed tag' - ) - assert ( - bleach.clean('another good tag') == - 'another good tag' - ) - - -def test_bad_html(): - assert ( - bleach.clean('a fixed tag') == - 'a fixed tag' - ) - - -def test_function_arguments(): - TAGS = ['span', 'br'] - ATTRS = {'span': ['style']} - - assert ( - bleach.clean('a
test', - tags=TAGS, attributes=ATTRS) == - 'a
test' - ) - - -def test_named_arguments(): - ATTRS = {'a': ['rel', 'href']} - - text = 'xx.com' - - assert bleach.clean(text) == 'xx.com' - assert ( - bleach.clean(text, attributes=ATTRS) == - 'xx.com' - ) - - -def test_disallowed_html(): - assert ( - bleach.clean('a test') == - 'a <script>safe()</script> test' - ) - assert ( - bleach.clean('a test') == - 'a <style>body{}</style> test' - ) - - -def test_bad_href(): - assert ( - bleach.clean('no link') == - 'no link' - ) - - -def test_bare_entities(): - assert ( - bleach.clean('an & entity') == - 'an & entity' - ) - assert ( - bleach.clean('an < entity') == - 'an < entity' - ) - - assert ( - bleach.clean('tag < and entity') == - 'tag < and entity' - ) - - assert ( - bleach.clean('&') == - '&' - ) - - -def test_escaped_entities(): - s = '<em>strong</em>' - assert bleach.clean(s) == s - - -def test_serializer(): - s = '
' - assert bleach.clean(s, tags=['table']) == s - assert bleach.linkify('test
') == 'test
' - assert bleach.clean('

test

', tags=['p']) == '

test

' - - -def test_no_href_links(): - s = 'x' - assert bleach.linkify(s) == s - - -def test_weird_strings(): - s = '' + open_comment = ''.format(open_comment) + ) + + def test_with_comments(self): + html = 'Just text' + assert 'Just text', bleach.clean(html) == 'Just text' + assert bleach.clean(html, strip_comments=False) == html + + def test_no_html(self): + assert bleach.clean('no html string') == 'no html string' + + def test_allowed_html(self): + assert ( + bleach.clean('an allowed tag') == + 'an allowed tag' + ) + assert ( + bleach.clean('another good tag') == + 'another good tag' + ) + + def test_bad_html(self): + assert ( + bleach.clean('a fixed tag') == + 'a fixed tag' + ) + + def test_function_arguments(self): + TAGS = ['span', 'br'] + ATTRS = {'span': ['style']} + + assert ( + bleach.clean('a
test', + tags=TAGS, attributes=ATTRS) == + 'a
test' + ) + + def test_named_arguments(self): + ATTRS = {'a': ['rel', 'href']} + + text = 'xx.com' + + assert bleach.clean(text) == 'xx.com' + assert ( + bleach.clean(text, attributes=ATTRS) == + 'xx.com' + ) + + def test_disallowed_html(self): + assert ( + bleach.clean('a test') == + 'a <script>safe()</script> test' + ) + assert ( + bleach.clean('a test') == + 'a <style>body{}</style> test' + ) + + def test_bad_href(self): + assert ( + bleach.clean('no link') == + 'no link' + ) + + def test_bare_entities(self): + assert ( + bleach.clean('an & entity') == + 'an & entity' + ) + assert ( + bleach.clean('an < entity') == + 'an < entity' + ) + + assert ( + bleach.clean('tag < and entity') == + 'tag < and entity' + ) + + assert ( + bleach.clean('&') == + '&' + ) + + def test_escaped_entities(self): + s = '<em>strong</em>' + assert bleach.clean(s) == s + + def test_weird_strings(self): + s = 'with
html tags', strip=True) == + 'a test with html tags' + ) + assert ( + bleach.clean('a test with html tags', + strip=True) == + 'a test with html tags' + ) + + s = '

link text

' + assert ( + bleach.clean(s, tags=['p'], strip=True) == + '

link text

' + ) + s = '

multiply nested text

' + assert ( + bleach.clean(s, tags=['p'], strip=True) == + '

multiply nested text

' + ) + + s = ('

' + '

') + assert ( + bleach.clean(s, tags=['p', 'a'], strip=True) == + '

' + ) + + def test_allowed_styles(self): + ATTR = ['style'] + STYLE = ['color'] + blank = '' + s = '' + assert bleach.clean('', attributes=ATTR) == blank + assert bleach.clean(s, attributes=ATTR, styles=STYLE) == s + assert ( + bleach.clean('', attributes=ATTR, styles=STYLE) == + s + ) + + def test_lowercase_html(self): + """We should output lowercase HTML.""" + dirty = 'BAR' + clean = 'BAR' + assert bleach.clean(dirty, attributes=['class']) == clean + + def test_wildcard_attributes(self): + ATTR = { + '*': ['id'], + 'img': ['src'], + } + TAG = ['img', 'em'] + dirty = ('both can have ' + '') + assert ( + bleach.clean(dirty, tags=TAG, attributes=ATTR) == + 'both can have ' + ) + + def test_callable_attributes(self): + """Verify callable attributes work and get correct arg values""" + def img_test(attr, val): + return attr == 'src' and val.startswith('https') + + ATTR = { + 'img': img_test, + } + TAGS = ['img'] + + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + + def test_svg_attr_val_allows_ref(self): + """Unescape values in svg attrs that allow url references""" + # Local IRI, so keep it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + # Non-local IRI, so drop it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + @pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + # NOTE(willkg): Bug in html5lib serializer drops the xlink part + '' + ), + ]) + def test_svg_allow_local_href(self, text, expected): + """Keep local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + @pytest.mark.parametrize('text, 
expected', [ + ( + '', + '' + ), + ( + '', + '' + ), + ]) + def test_svg_allow_local_href_nonlocal(self, text, expected): + """Drop non-local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + @pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') + def test_sarcasm(self): + """Jokes should crash.""" + dirty = 'Yeah right ' + clean = 'Yeah right <sarcasm/>' + assert bleach.clean(dirty) == clean + + def test_user_defined_protocols_valid(self): + valid_href = 'allowed href' + assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href + + def test_user_defined_protocols_invalid(self): + invalid_href = 'invalid href' + cleaned_href = 'invalid href' + assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href + + +class TestLinkify: + def test_no_href_links(self): + s = 'x' + assert bleach.linkify(s) == s + + def test_rel_already_there(self): + """Make sure rel attribute is updated not replaced""" + linked = ('Click ' + 'here.') + + link_good = 'Click here.' + + assert bleach.linkify(linked) == link_good + assert bleach.linkify(link_good) == link_good def test_xml_render(): @@ -147,48 +295,6 @@ def test_xml_render(): assert bleach._render(parser.parseFragment('')) == '' -def test_stripping(): - assert ( - bleach.clean('a test with html tags', strip=True) == - 'a test with html tags' - ) - assert ( - bleach.clean('a test with html tags', strip=True) == - 'a test with html tags' - ) - - s = '

link text

' - assert ( - bleach.clean(s, tags=['p'], strip=True) == - '

link text

' - ) - s = '

multiply nested text

' - assert ( - bleach.clean(s, tags=['p'], strip=True) == - '

multiply nested text

' - ) - - s = ('

' - '

') - assert ( - bleach.clean(s, tags=['p', 'a'], strip=True) == - '

' - ) - - -def test_allowed_styles(): - ATTR = ['style'] - STYLE = ['color'] - blank = '' - s = '' - assert bleach.clean('', attributes=ATTR) == blank - assert bleach.clean(s, attributes=ATTR, styles=STYLE) == s - assert ( - bleach.clean('', attributes=ATTR, styles=STYLE) == - s - ) - - def test_idempotent(): """Make sure that applying the filter twice doesn't change anything.""" dirty = 'invalid & < extra http://link.com' @@ -203,138 +309,8 @@ def test_idempotent(): ) -def test_rel_already_there(): - """Make sure rel attribute is updated not replaced""" - linked = ('Click ' - 'here.') - - link_good = 'Click here.' - - assert bleach.linkify(linked) == link_good - assert bleach.linkify(link_good) == link_good - - -def test_lowercase_html(): - """We should output lowercase HTML.""" - dirty = 'BAR' - clean = 'BAR' - assert bleach.clean(dirty, attributes=['class']) == clean - - -def test_wildcard_attributes(): - ATTR = { - '*': ['id'], - 'img': ['src'], - } - TAG = ['img', 'em'] - dirty = ('both can have ' - '') - assert ( - bleach.clean(dirty, tags=TAG, attributes=ATTR) == - 'both can have ' - ) - - -def test_callable_attributes(): - """Verify callable attributes work and get correct arg values""" - def img_test(attr, val): - return attr == 'src' and val.startswith('https') - - ATTR = { - 'img': img_test, - } - TAGS = ['img'] - - assert ( - bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == - u'foo baz' - ) - assert ( - bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == - u'foo baz' - ) - - -def test_svg_attr_val_allows_ref(): - """Unescape values in svg attrs that allow url references""" - # Local IRI, so keep it - text = '' - TAGS = ['svg', 'rect'] - ATTRS = { - 'rect': ['fill'], - } - assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == - '' - ) - - # Non-local IRI, so drop it - text = '' - TAGS = ['svg', 'rect'] - ATTRS = { - 'rect': ['fill'], - } - assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == - '' - ) - - 
-@pytest.mark.parametrize('text, expected', [ - ( - '', - '' - ), - ( - '', - # NOTE(willkg): Bug in html5lib serializer drops the xlink part - '' - ), -]) -def test_svg_allow_local_href(text, expected): - """Keep local hrefs for svg elements""" - TAGS = ['svg', 'pattern'] - ATTRS = { - 'pattern': ['id', 'href'], - } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected - - -@pytest.mark.parametrize('text, expected', [ - ( - '', - '' - ), - ( - '', - '' - ), -]) -def test_svg_allow_local_href_nonlocal(text, expected): - """Drop non-local hrefs for svg elements""" - TAGS = ['svg', 'pattern'] - ATTRS = { - 'pattern': ['id', 'href'], - } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected - - - - -@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') -def test_sarcasm(): - """Jokes should crash.""" - dirty = 'Yeah right ' - clean = 'Yeah right <sarcasm/>' - assert bleach.clean(dirty) == clean - - -def test_user_defined_protocols_valid(): - valid_href = 'allowed href' - assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href - - -def test_user_defined_protocols_invalid(): - invalid_href = 'invalid href' - cleaned_href = 'invalid href' - assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href +def test_serializer(): + s = '
' + assert bleach.clean(s, tags=['table']) == s + assert bleach.linkify('test
') == 'test
' + assert bleach.clean('

test

', tags=['p']) == '

test

' From b46c7ae058c1ad5f2d351f47e6e57a4dfa1591c3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 11:19:31 -0500 Subject: [PATCH 067/314] Remove unneeded module --- tests/tools.py | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 tests/tools.py diff --git a/tests/tools.py b/tests/tools.py deleted file mode 100644 index 3ae047e9..00000000 --- a/tests/tools.py +++ /dev/null @@ -1,7 +0,0 @@ - - -def in_(l, a, msg=None): - """Shorthand for 'assert a in l, "%r not in %r" % (a, l) - """ - if a not in l: - raise AssertionError(msg or "%r not in %r" % (a, l)) From bd239771493c1ff6493fcff324e6512bbbb2028d Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 11:19:38 -0500 Subject: [PATCH 068/314] Add test for Cleaner and fix module import issue --- bleach/__init__.py | 2 +- tests/test_basics.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 296448ee..12788eb1 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -14,7 +14,7 @@ from bleach.sanitizer import BleachSanitizerFilter from bleach.version import __version__, VERSION # flake8: noqa -__all__ = ['clean', 'linkify'] +__all__ = ['Cleaner', 'clean', 'linkify'] log = logging.getLogger(__name__) log.addHandler(logging.NullHandler()) diff --git a/tests/test_basics.py b/tests/test_basics.py index 790ad559..919c8678 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -274,6 +274,19 @@ def test_user_defined_protocols_invalid(self): assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href +class TestCleaner: + def test_basics(self): + TAGS = ['span', 'br'] + ATTRS = {'span': ['style']} + + cleaner = bleach.Cleaner(tags=TAGS, attributes=ATTRS) + + assert ( + cleaner.clean('a
test') == + 'a
test' + ) + + class TestLinkify: def test_no_href_links(self): s = 'x' From c0602651f187af1d1b927e1cbf219af0a46ad7c3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 12:02:12 -0500 Subject: [PATCH 069/314] Implement ability to use Filters in cleaning This lets users extend cleaning more easily by specifying an html5lib Filter. --- bleach/__init__.py | 33 ++++++++++++-- bleach/sanitizer.py | 1 - docs/clean.rst | 102 +++++++++++++++++++++++++++++++++++-------- tests/test_basics.py | 34 ++++++++++++--- 4 files changed, 142 insertions(+), 28 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 12788eb1..04d69b0a 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -115,9 +115,8 @@ class Cleaner(object): def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, - strip_comments=True): - """ - :arg tags: whitelist of allowed tags; defaults to + strip_comments=True, filters=None): + """:arg tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS`` :arg attributes: whitelist of allowed attributes; defaults to @@ -133,6 +132,16 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, :arg strip_comments: whether or not to strip HTML comments + :arg filters: list of html5lib Filter classes to pass streamed content through + + See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters + + .. Warning:: + + Using filters changes the output of + :py:method:`bleach.Cleaner.clean`. Make sure the way the filters + change the output are secure. 
+ """ self.tags = tags self.attributes = attributes @@ -140,6 +149,7 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, self.protocols = protocols self.strip = strip self.strip_comments = strip_comments + self.filters = filters or [] self.parser = html5lib.HTMLParser(namespaceHTMLElements=False) self.walker = html5lib.getTreeWalker('etree') @@ -183,12 +193,16 @@ def clean(self, text): allowed_svg_properties=[], ) + # Apply any filters after the BleachSanitizerFilter + for filter_class in self.filters: + filtered = filter_class(source=filtered) + return self.serializer.render(filtered) def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, - strip_comments=True): + strip_comments=True, filters=None): """Clean an HTML fragment of malicious content and return it This function is a security-focused function whose sole purpose is to @@ -228,6 +242,16 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, :arg strip_comments: whether or not to strip HTML comments + :arg filters: list of html5lib Filter classes to pass streamed content through + + See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters + + .. Warning:: + + Using filters changes the output of + `bleach.Cleaner.clean`. Make sure the way the filters + change the output are secure. 
+ :returns: cleaned text as unicode """ @@ -238,6 +262,7 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, protocols=protocols, strip=strip, strip_comments=strip_comments, + filters=filters, ) return cleaner.clean(text) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index b7162eed..62bbf648 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -16,7 +16,6 @@ def _attr_key(attr): """ key = (attr[0][0] or ''), attr[0][1] - print(key) return key diff --git a/docs/clean.rst b/docs/clean.rst index 63f0427e..8e310f59 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -5,7 +5,7 @@ ``bleach.clean()`` ================== -:py:func:`bleach.clean`` is Bleach's HTML sanitization method. +:py:func:`bleach.clean` is Bleach's HTML sanitization method. Given a fragment of HTML, Bleach will parse it according to the HTML5 parsing algorithm and sanitize any disallowed tags or attributes. This algorithm also @@ -48,13 +48,41 @@ The default value is a relatively conservative list found in ``bleach.ALLOWED_TAGS``. -Attribute Whitelist -=================== +Allowed Attributes +================== + +The ``attributes`` kwarg lets you specify which attributes are allowed. + +The default value is also a conservative dict found in +``bleach.ALLOWED_ATTRIBUTES``. + + +As a list +--------- + +The ``attributes`` value can be a list, in which case the attributes are allowed +for any tag. + +For example: + +.. doctest:: + + >>> import bleach + + >>> bleach.clean( + ... u'

blah blah blah

', + ... tags=['p'], + ... attributes=['style'], + ... styles=['color'], + ... ) + u'

blah blah blah

' + + +As a dict +--------- -The ``attributes`` kwarg is a whitelist of attributes. It can be a list, in -which case the attributes are allowed for any tag, or a dictionary, in which -case the keys are tag names (or a wildcard: ``*`` for all tags) and the values -are lists of allowed attributes. +The ``attributes`` value can be a dict, in which case the keys are tag names (or +a wildcard: ``*`` for all tags) and the values are lists of allowed attributes. For example: @@ -80,23 +108,19 @@ In this case, ``class`` is allowed on any allowed element (from the ``tags`` argument), ```` tags are allowed to have ``href`` and ``rel`` attributes, and so on. -The default value is also a conservative dict found in -``bleach.ALLOWED_ATTRIBUTES``. - -Callable Filters ----------------- +Using functions +--------------- -You can also use a callable (instead of a list) in the ``attributes`` kwarg. If -the callable returns ``True``, the attribute is allowed. Otherwise, it is -stripped. For example: +You can also use callables. If the callable returns ``True``, the attribute is +allowed. Otherwise, it is stripped. For example: .. doctest:: >>> from urlparse import urlparse >>> import bleach - >>> def filter_src(name, value): + >>> def allow_src(name, value): ... if name in ('alt', 'height', 'width'): ... return True ... if name == 'src': @@ -108,7 +132,7 @@ stripped. For example: ... u'an example', ... tags=['img'], ... attributes={ - ... 'img': filter_src + ... 'img': allow_src ... } ... ) u'an example' @@ -229,3 +253,47 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set >>> bleach.clean(html, strip_comments=False) u'my html' + + +html5lib Filters +================ + +Bleach sanitizing is implemented as an html5lib Filter. The consequence of this +is that we can pass the streamed content through additional specified filters +after the :py:class:`bleach.sanitizer.BleachSanitizingFilter` filter has run. 
+ +This lets you add data, drop data and change data as it is being serialized back +to a unicode. + +Documentation on html5lib Filters is here: +http://html5lib.readthedocs.io/en/latest/movingparts.html#filters + +Trivial Filter example: + +.. doctest:: + + >>> import bleach + >>> from html5lib.filters.base import Filter + + >>> class MooFilter(Filter): + ... def __iter__(self): + ... for token in Filter.__iter__(self): + ... if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: + ... for attr, value in token['data'].items(): + ... token['data'][attr] = 'moo' + ... yield token + ... + >>> ATTRS = { + ... 'img': ['rel', 'src'] + ... } + ... + >>> TAGS = ['img'] + >>> dirty = 'this is cute! ' + >>> bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter]) + u'this is cute! ' + + +.. Warning:: + + Filters change the output of cleaning. Make sure that whatever changes the + filter is applying maintain the safety guarantees of the output. diff --git a/tests/test_basics.py b/tests/test_basics.py index 919c8678..620a42da 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -1,4 +1,5 @@ import html5lib +from html5lib.filters.base import Filter import pytest import six @@ -147,14 +148,14 @@ def test_stripping(self): ) def test_allowed_styles(self): - ATTR = ['style'] + ATTRS = ['style'] STYLE = ['color'] blank = '' s = '' - assert bleach.clean('', attributes=ATTR) == blank - assert bleach.clean(s, attributes=ATTR, styles=STYLE) == s + assert bleach.clean('', attributes=ATTRS) == blank + assert bleach.clean(s, attributes=ATTRS, styles=STYLE) == s assert ( - bleach.clean('', attributes=ATTR, styles=STYLE) == + bleach.clean('', attributes=ATTRS, styles=STYLE) == s ) @@ -165,7 +166,7 @@ def test_lowercase_html(self): assert bleach.clean(dirty, attributes=['class']) == clean def test_wildcard_attributes(self): - ATTR = { + ATTRS = { '*': ['id'], 'img': ['src'], } @@ -173,7 +174,7 @@ def test_wildcard_attributes(self): dirty = ('both can have ' 
'') assert ( - bleach.clean(dirty, tags=TAG, attributes=ATTR) == + bleach.clean(dirty, tags=TAG, attributes=ATTRS) == 'both can have ' ) @@ -273,6 +274,27 @@ def test_user_defined_protocols_invalid(self): cleaned_href = 'invalid href' assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href + def test_filters(self): + # Create a Filter that changes all the attr values to "moo" + class MooFilter(Filter): + def __iter__(self): + for token in Filter.__iter__(self): + if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: + for attr, value in token['data'].items(): + token['data'][attr] = 'moo' + + yield token + + ATTRS = { + 'img': ['rel', 'src'] + } + TAGS = ['img'] + dirty = 'this is cute! ' + assert ( + bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter]) == + 'this is cute! ' + ) + class TestCleaner: def test_basics(self): From 40ebdcda9833cd0d93418a943d4616c62c8cf043 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 12:20:51 -0500 Subject: [PATCH 070/314] Minor fixes that should have been in last PR --- bleach/__init__.py | 11 ++++++----- docs/clean.rst | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 04d69b0a..b71eeb89 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -116,7 +116,9 @@ class Cleaner(object): def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, strip_comments=True, filters=None): - """:arg tags: whitelist of allowed tags; defaults to + """Initializes a Cleaner + + :arg tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS`` :arg attributes: whitelist of allowed attributes; defaults to @@ -138,9 +140,8 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, .. Warning:: - Using filters changes the output of - :py:method:`bleach.Cleaner.clean`. 
Make sure the way the filters - change the output are secure. + Using filters changes the output of ``bleach.Cleaner.clean``. + Make sure the way the filters change the output are secure. """ self.tags = tags @@ -249,7 +250,7 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, .. Warning:: Using filters changes the output of - `bleach.Cleaner.clean`. Make sure the way the filters + ``bleach.Cleaner.clean``. Make sure the way the filters change the output are secure. :returns: cleaned text as unicode diff --git a/docs/clean.rst b/docs/clean.rst index 8e310f59..a65d8b93 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -22,6 +22,7 @@ If you're cleaning a lot of text, you might want to create a .. autofunction:: bleach.clean .. autoclass:: bleach.Cleaner + :members: Tag Whitelist From 313478f17e7107fc659243be4309e6e0c9b86e30 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 15:18:27 -0500 Subject: [PATCH 071/314] Reimplement linkify as an html5lib Filter This reimplements linkify as an html5lib Filter. This has a few advantages: 1. it just has to consume a token stream which involves fewer dance steps to get things done 2. it can be used in conjunction with other Filters 3. 
it can now be used as a post-clean Filter for Cleaner letting you clean and linkify in one tree traversal --- bleach/__init__.py | 339 +++------------------------------------- bleach/callbacks.py | 19 ++- bleach/linkifier.py | 356 +++++++++++++++++++++++++++++++++++++++++++ bleach/sanitizer.py | 22 +-- bleach/utils.py | 23 +++ setup.cfg | 6 +- tests/test_basics.py | 17 +-- tests/test_links.py | 56 +++++-- 8 files changed, 472 insertions(+), 366 deletions(-) create mode 100644 bleach/linkifier.py create mode 100644 bleach/utils.py diff --git a/bleach/__init__.py b/bleach/__init__.py index b71eeb89..0155a127 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -11,6 +11,7 @@ from bleach import callbacks as linkify_callbacks from bleach.encoding import force_unicode +from bleach.linkifier import LinkifyFilter from bleach.sanitizer import BleachSanitizerFilter from bleach.version import __version__, VERSION # flake8: noqa @@ -44,47 +45,6 @@ ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] -TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az - ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat - cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk - dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg - gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il - im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp - kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk - ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne - net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post - pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl - sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to - tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws - xn xxx ye yt yu za zm zw""".split() - -# Make sure that .com doesn't get matched by .co first -TLDS.reverse() - -url_re = 
re.compile( - r"""\(* # Match any opening parentheses. - \b(?"]*)? - # /path/zz (excluding "unsafe" chars from RFC 1738, - # except for # and ~, which happen in practice) - """.format('|'.join(allowed_protocols), '|'.join(TLDS)), - re.IGNORECASE | re.VERBOSE | re.UNICODE) - -proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) - -punct_re = re.compile(r'([\.,]+)$') - -email_re = re.compile( - r"""(? tag replaced by the text within it - adj = replace_nodes(tree, _text, node, current_child) - # pull back current_child by 1 to scan the new nodes - # again. - current_child -= 1 - else: - text = force_unicode(attrs.pop('_text')) - for attr_key, attr_val in attrs.items(): - node.set(attr_key, attr_val) - - for n in reversed(list(node)): - node.remove(n) - text = parser.parseFragment(text) - node.text = text.text - for n in text: - node.append(n) - _seen.add(node) - - elif current_child >= 0: - if node.tag == ETREE_TAG('pre') and skip_pre: - linkify_nodes(node, False) - elif not (node in _seen): - linkify_nodes(node, parse_text) - - current_child += 1 - - def email_repl(match): - addr = match.group(0).replace('"', '"') - link = { - '_text': addr, - 'href': 'mailto:{0!s}'.format(addr), - } - link = apply_callbacks(link, True) - - if link is None: - return addr - - _href = link.pop('href') - _text = link.pop('_text') - - repl = '{2!s}' - attr = '{0!s}="{1!s}"' - attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - return repl.format(_href, attribs, _text) - - def link_repl(match): - url = match.group(0) - open_brackets = close_brackets = 0 - if url.startswith('('): - _wrapping = strip_wrapping_parentheses(url) - url, open_brackets, close_brackets = _wrapping - if url.endswith(')') and '(' not in url: - # This is a clumsy handling for the case where we have something - # like (foo http://example.com) and the ) gets picked up by the - # url_re but we don't want it part of the link. 
- new_url = url.rstrip(')') - close_brackets += len(url) - len(new_url) - url = new_url - - end = '' - m = re.search(punct_re, url) - if m: - end = m.group(0) - url = url[0:m.start()] - if re.search(proto_re, url): - href = url - else: - href = ''.join(['http://', url]) - - link = { - '_text': url, - 'href': href, - } - - link = apply_callbacks(link, True) - - if link is None: - return '(' * open_brackets + url + ')' * close_brackets - - _text = link.pop('_text') - _href = link.pop('href') - - repl = '{0!s}{3!s}{4!s}{5!s}' - attr = '{0!s}="{1!s}"' - attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - - return repl.format('(' * open_brackets, - _href, attribs, _text, end, - ')' * close_brackets) - - try: - linkify_nodes(forest) - except RuntimeError as e: - # If we hit the max recursion depth, just return what we've got. - log.exception('Probable recursion error: {0!r}'.format(e)) - - return _render(forest) - - -def _render(tree): - """Try rendering as HTML, then XML, then give up.""" - return force_unicode(_serialize(tree)) - - -def _serialize(domtree): - walker = html5lib.treewalkers.getTreeWalker('etree') - stream = walker(domtree) - serializer = HTMLSerializer(quote_attr_values='always', - alphabetical_attributes=True, - omit_optional_tags=False) - return serializer.render(stream) + dom = parser.parseFragment(text) + filtered = LinkifyFilter( + source=walker(dom), + callbacks=callbacks, + skip_pre=skip_pre, + parse_email=parse_email + ) + return serializer.render(filtered) diff --git a/bleach/callbacks.py b/bleach/callbacks.py index 3cb82c25..d2ba1014 100644 --- a/bleach/callbacks.py +++ b/bleach/callbacks.py @@ -3,18 +3,23 @@ def nofollow(attrs, new=False): - if attrs['href'].startswith('mailto:'): + href_key = (None, u'href') + if href_key not in attrs or attrs[href_key].startswith(u'mailto:'): return attrs - rel = [x for x in attrs.get('rel', '').split(' ') if x] - if 'nofollow' not in [x.lower() for x in rel]: - rel.append('nofollow') - 
attrs['rel'] = ' '.join(rel) + + rel_key = (None, u'rel') + rel_values = [val for val in attrs.get(rel_key, u'').split(u' ') if val] + if u'nofollow' not in [rel_val.lower() for rel_val in rel_values]: + rel_values.append(u'nofollow') + attrs[rel_key] = u' '.join(rel_values) return attrs def target_blank(attrs, new=False): - if attrs['href'].startswith('mailto:'): + href_key = (None, u'href') + if attrs[href_key].startswith(u'mailto:'): return attrs - attrs['target'] = '_blank' + + attrs[(None, u'target')] = u'_blank' return attrs diff --git a/bleach/linkifier.py b/bleach/linkifier.py new file mode 100644 index 00000000..b4ba2ea8 --- /dev/null +++ b/bleach/linkifier.py @@ -0,0 +1,356 @@ +from __future__ import unicode_literals +import re + +from html5lib.filters.base import Filter + +from bleach import allowed_protocols +from bleach.encoding import force_unicode +from bleach.utils import alphabetize_attributes + + +# FIXME(willkg): Move this to a constants module. +TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az + ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat + cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk + dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg + gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il + im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp + kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk + ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne + net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post + pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl + sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to + tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws + xn xxx ye yt yu za zm zw""".split() + +# Make sure that .com doesn't get matched by .co first +TLDS.reverse() + + +url_re = re.compile( + r"""\(* # 
Match any opening parentheses. + \b(?"]*)? + # /path/zz (excluding "unsafe" chars from RFC 1738, + # except for # and ~, which happen in practice) + """.format('|'.join(allowed_protocols), '|'.join(TLDS)), + re.IGNORECASE | re.VERBOSE | re.UNICODE) + + +proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) + +punct_re = re.compile(r'([\.,]+)$') + +email_re = re.compile( + r"""(? end: + new_tokens.append( + {u'type': u'Characters', u'data': text[end:match.start()]} + ) + + # Run attributes through the callbacks to see what we + # should do with this match + attrs = { + (None, u'href'): u'mailto:%s' % match.group(0), + u'_text': match.group(0) + } + attrs = self.apply_callbacks(attrs, True) + + if attrs is None: + # Just add the text + new_tokens.append( + {u'type': u'Characters', u'data': match.group(0)} + ) + + else: + # Add a "a" tag + _text = attrs.pop(u'_text', '') + attrs = alphabetize_attributes(attrs) + new_tokens.extend([ + {u'type': u'StartTag', u'name': u'a', u'data': attrs}, + {u'type': u'Characters', u'data': force_unicode(_text)}, + {u'type': u'EndTag', u'name': 'a'} + ]) + end = match.end() + + if new_tokens: + if end < len(text): + new_tokens.append({u'type': u'Characters', u'data': text[end:]}) + + for new_token in new_tokens: + yield new_token + + continue + + yield token + + def strip_wrapping_parentheses(self, fragment): + """Strips wrapping parentheses""" + openp = closep = 0 + + # Count consecutive opening parentheses at the beginning of the + # fragment (string) + for char in fragment: + if char == '(': + openp += 1 + else: + break + + if openp: + newer_frag = '' + # Cut the consecutive opening brackets from the fragment + fragment = fragment[openp:] + + # Reverse the fragment for easier detection of parentheses + # inside the URL + reverse_fragment = fragment[::-1] + skip = False + for char in reverse_fragment: + if char == ')' and closep < openp and not skip: + # Remove the closing parentheses if it has a matching + # opening parentheses 
(they are balanced). + closep += 1 + continue + + elif char != ')': + # Do not remove ')' from the URL itself. + skip = True + + newer_frag += char + + # Reverse fragment back + fragment = newer_frag[::-1] + + return fragment, u'(' * openp, u')' * closep + + def strip_punctuation(self, fragment): + match = re.search(punct_re, fragment) + if match: + return fragment[0:match.start()], match.group(0) + else: + return fragment, '' + + def handle_links(self, src_iter): + """Handle links in character tokens""" + for token in src_iter: + if token['type'] == 'Characters': + text = token['data'] + new_tokens = [] + end = 0 + + for match in url_re.finditer(text): + if match.start() > end: + new_tokens.append( + {u'type': u'Characters', u'data': text[end:match.start()]} + ) + + url = match.group(0) + prefix = suffix = '' + + # Sometimes we pick up ( and ), so drop them from the url + if url.startswith('('): + url, prefix, suffix = self.strip_wrapping_parentheses(url) + + if url.endswith(u')') and u'(' not in url: + new_url = url.rstrip(u')') + suffix = url[len(new_url):] + suffix + url = new_url + + # Sometimes we pick up . and , at the end of the url that's + # part of the sentence and not the url so drop it + url, punct_suffix = self.strip_punctuation(url) + if punct_suffix: + suffix = suffix + punct_suffix + + # If there's no protocol, add one + if re.search(proto_re, url): + href = url + else: + href = u'http://%s' % url + + attrs = { + (None, u'href'): href, + u'_text': url + } + attrs = self.apply_callbacks(attrs, True) + + if prefix: + new_tokens.append( + {u'type': u'Characters', u'data': prefix} + ) + + if attrs is None: + # Just add the text + new_tokens.append( + {u'type': u'Characters', u'data': url} + ) + + else: + # Add an "a" tag! 
+ _text = attrs.pop(u'_text', '') + attrs = alphabetize_attributes(attrs) + + new_tokens.extend([ + {u'type': u'StartTag', u'name': u'a', u'data': attrs}, + {u'type': u'Characters', u'data': force_unicode(_text)}, + {u'type': u'EndTag', u'name': 'a'}, + ]) + + if suffix: + new_tokens.append( + {u'type': u'Characters', u'data': suffix} + ) + + end = match.end() + + if new_tokens: + if end < len(text): + new_tokens.append({u'type': u'Characters', u'data': text[end:]}) + + for new_token in new_tokens: + yield new_token + + continue + + yield token + + def __iter__(self): + in_a = False + in_pre = False + + token_buffer = [] + + for token in super(LinkifyFilter, self).__iter__(): + if in_a: + # Handle the case where we're in an "a" tag--we want to buffer tokens + # until we hit an end "a" tag. + if token['type'] == 'EndTag' and token['name'] == 'a': + # We're no longer in an "a" tag, so we get all the things we + # need to apply callbacks and then figure out what to do with + # this "a" tag. + in_a = False + a_token = token_buffer[0] + if a_token['data']: + attrs = a_token['data'] + else: + attrs = {} + + text = self.extract_character_data(token_buffer) + attrs['_text'] = text + + attrs = self.apply_callbacks(attrs, False) + if attrs is None: + # We're dropping the "a" tag and everything else and replacing + # it with character data. So emit that token. 
+ yield {'type': 'Characters', 'data': text} + + else: + new_text = attrs.pop('_text', '') + # FIXME(willkg): add nofollow here + a_token['data'] = alphabetize_attributes(attrs) + + if text == new_text: + # The callbacks didn't change the text, so we yield the + # new "a" token, then whatever else was there, then the + # end "a" token + yield a_token + for mem in token_buffer[1:]: + yield mem + yield token + + else: + # If the callbacks changed the text, then we're going + # to drop all the tokens between the start and end "a" + # tags and replace it with the new text + yield a_token + yield {'type': 'Characters', 'data': force_unicode(new_text)} + yield token + + token_buffer = [] + continue + + else: + token_buffer.append(token) + continue + + elif token['type'] in ['StartTag', 'EmptyTag']: + if token['name'] == 'pre' and self.skip_pre: + # The "pre" tag starts a "special mode" where we don't linkify + # anything. + in_pre = True + + elif token['name'] == 'a': + # The "a" tag is special--we switch to a slurp mode and + # slurp all the tokens until the end "a" tag and then + # figure out what to do with them there. + in_a = True + token_buffer.append(token) + + # We buffer the start tag, so we don't want to yield it, + # yet + continue + + elif in_pre: + # NOTE(willkg): We put this clause here since in_a and + # switching in and out of is_a takes precedence. 
+ if token['type'] == 'EndTag' and token['name'] == 'pre': + in_pre = False + + elif not in_a and not in_pre and token['type'] == 'Characters': + new_stream = iter([token]) + if self.parse_email: + new_stream = self.handle_email_addresses(new_stream) + + new_stream = self.handle_links(new_stream) + + for token in new_stream: + yield token + + # We've already yielded this token, so continue + continue + + yield token diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 62bbf648..610dd903 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,22 +1,11 @@ from __future__ import unicode_literals -from collections import OrderedDict import re from xml.sax.saxutils import unescape from html5lib.constants import namespaces from html5lib.filters import sanitizer - -def _attr_key(attr): - """Returns appropriate key for sorting attribute names - - Attribute names are a tuple of ``(namespace, name)`` where namespace can be - ``None`` or a string. These can't be compared in Python 3, so we conver the - ``None`` to an empty string. 
- - """ - key = (attr[0][0] or ''), attr[0][1] - return key +from bleach.utils import alphabetize_attributes class BleachSanitizerFilter(sanitizer.Filter): @@ -60,9 +49,7 @@ def sanitize_token(self, token): if 'data' in token: # Alphabetize the attributes before calling .disallowed_token() # so that the resulting string is stable - token['data'] = OrderedDict( - [(key, val) for key, val in sorted(token['data'].items(), key=_attr_key)] - ) + token['data'] = alphabetize_attributes(token['data']) return self.disallowed_token(token) elif token_type == 'Comment': @@ -139,10 +126,7 @@ def allow_token(self, token): # At this point, we want to keep the attribute, so add it in attrs[namespaced_name] = val - # Alphabetize attributes - token['data'] = OrderedDict( - [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)] - ) + token['data'] = alphabetize_attributes(attrs) return token diff --git a/bleach/utils.py b/bleach/utils.py new file mode 100644 index 00000000..d9c211fc --- /dev/null +++ b/bleach/utils.py @@ -0,0 +1,23 @@ +from collections import OrderedDict + + +def _attr_key(attr): + """Returns appropriate key for sorting attribute names + + Attribute names are a tuple of ``(namespace, name)`` where namespace can be + ``None`` or a string. These can't be compared in Python 3, so we conver the + ``None`` to an empty string. 
+ + """ + key = (attr[0][0] or ''), attr[0][1] + return key + + +def alphabetize_attributes(attrs): + """Takes a dict of attributes (or None) and returns them alphabetized""" + if not attrs: + return attrs + + return OrderedDict( + [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)] + ) diff --git a/setup.cfg b/setup.cfg index 950364a7..69c6d1f2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,11 @@ test=pytest [flake8] -ignore = E731,W503 +ignore = + # E731: do not assign a lambda expression, use a def + E731, + # W503: line break occurred before a binary operator + W503 max-line-length = 100 [wheel] diff --git a/tests/test_basics.py b/tests/test_basics.py index 620a42da..e3f5d2da 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -1,4 +1,3 @@ -import html5lib from html5lib.filters.base import Filter import pytest import six @@ -183,17 +182,19 @@ def test_callable_attributes(self): def img_test(attr, val): return attr == 'src' and val.startswith('https') - ATTR = { + ATTRS = { 'img': img_test, } TAGS = ['img'] assert ( - bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + bleach.clean('foo blah baz', tags=TAGS, + attributes=ATTRS) == u'foo baz' ) assert ( - bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + bleach.clean('foo blah baz', tags=TAGS, + attributes=ATTRS) == u'foo baz' ) @@ -325,11 +326,6 @@ def test_rel_already_there(self): assert bleach.linkify(link_good) == link_good -def test_xml_render(): - parser = html5lib.HTMLParser() - assert bleach._render(parser.parseFragment('')) == '' - - def test_idempotent(): """Make sure that applying the filter twice doesn't change anything.""" dirty = 'invalid & < extra http://link.com' @@ -340,7 +336,8 @@ def test_idempotent(): linked = bleach.linkify(dirty) assert ( bleach.linkify(linked) == - 'invalid & < extra http://link.com' + 'invalid & < extra http://link.com' ) diff --git a/tests/test_links.py b/tests/test_links.py index 53d60e5c..1712d199 100644 --- 
a/tests/test_links.py +++ b/tests/test_links.py @@ -5,7 +5,8 @@ import pytest -from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC +from bleach import linkify, DEFAULT_CALLBACKS as DC +from bleach.linkifier import url_re def test_url_re(): @@ -51,8 +52,9 @@ def test_trailing_slash(): def test_mangle_link(): """We can muck with the href attribute of the link.""" def filter_url(attrs, new=False): - quoted = quote_plus(attrs['href']) - attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted) + if not attrs.get((None, 'href'), '').startswith('http://bouncer'): + quoted = quote_plus(attrs[(None, 'href')]) + attrs[(None, 'href')] = 'http://bouncer/?u={0!s}'.format(quoted) return attrs assert ( @@ -188,7 +190,7 @@ def test_set_attrs(): """We can set random attributes on links.""" def set_attr(attrs, new=False): - attrs['rev'] = 'canonical' + attrs[(None, u'rev')] = u'canonical' return attrs assert ( @@ -214,7 +216,7 @@ def only_proto(attrs, new=False): def test_stop_email(): """Returning None should prevent a link from being created.""" def no_email(attrs, new=False): - if attrs['href'].startswith('mailto:'): + if attrs[(None, 'href')].startswith('mailto:'): return None return attrs text = 'do not link james@example.com' @@ -276,14 +278,16 @@ def test_add_rel_nofollow(): def test_url_with_path(): assert ( linkify('http://example.com/path/to/file') == - 'http://example.com/path/to/file' + '' + 'http://example.com/path/to/file' ) def test_link_ftp(): assert ( linkify('ftp://ftp.mozilla.org/some/file') == - 'ftp://ftp.mozilla.org/some/file' + '' + 'ftp://ftp.mozilla.org/some/file' ) @@ -325,10 +329,8 @@ def test_escaped_html(): def test_link_http_complete(): assert ( linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f') == - ( - '' - 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f' - ) + '' + 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f' ) @@ -348,7 +350,8 @@ def test_unsafe_url(): """Any unsafe char ({}[]<>, etc.) 
in the path should end URL scanning.""" assert ( linkify('All your{"xx.yy.com/grover.png"}base are') == - 'All your{"xx.yy.com/grover.png"}base are' + 'All your{"xx.yy.com/grover.png"}' + 'base are' ) @@ -556,8 +559,35 @@ def test_remove_first_childlink(): def test_drop_link_tags(): """Verify that dropping link tags *just* drops the tag and not the content""" - html = """first second third fourth fifth""" + html = ( + 'first second third ' + 'fourth fifth' + ) assert ( linkify(html, callbacks=[lambda attrs, new: None]) == 'first second third fourth fifth' ) + + +@pytest.mark.parametrize('text, expected', [ + (u'<br>', u'<br>'), + ( + u'<br> http://example.com', + u'<br> http://example.com' + ), + ( + u'<br>
http://example.com', + u'<br>
http://example.com' + ) +]) +def test_naughty_unescaping(text, expected): + """Verify that linkify is not unescaping things it shouldn't be""" + assert linkify(text) == expected + + +def test_hang(): + """This string would hang linkify. Issue #200""" + assert ( + linkify("an@email.com", parse_email=True) == + 'an@email.com' + ) From f4333240eeb543b323e7f26d81dff58ce112fcb6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 21:03:21 -0500 Subject: [PATCH 072/314] Update CHANGES --- CHANGES | 67 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/CHANGES b/CHANGES index e8e49a8d..c2b91c1a 100644 --- a/CHANGES +++ b/CHANGES @@ -6,21 +6,68 @@ Version 2.0 (in development) **Backwards incompatible changes** -- Removed support for Python 2.6. #206 -- Removed support for Python 3.2. #224 -- Bleach no longer supports html5lib < 0.99999999 (8 9s). +* Removed support for Python 2.6. #206 - This version represents a rewrite to use the new sanitizing API since - the old one was dropped in html5lib 0.99999999 (8 9s). +* Removed support for Python 3.2. #224 + +* Bleach no longer supports html5lib < 0.99999999 (8 9s). + + This version is a rewrite to use the new sanitizing API since the old + one was dropped in html5lib 0.99999999 (8 9s). + +* ``bleach.clean`` and friends were rewritten + + ``clean`` is now implemented as an html5lib Filter and happens at a different + step in the HTML parsing -> traversing -> serializing process. Because of + that, there are some differences in clean's output as compared with previous + versions. + + Amongst other things, this version will add end tags even if the tag in + question is to be escaped. + +* ``bleach.linkify`` was rewritten + + ``linkify`` was reimplemented as an html5lib Filter. As such, it no longer + accepts a ``tokenizer`` argument. + + The callback functions for adjusting link attributes now takes a namespaced + attribute. 
+ + Previously you'd do something like this:: + + def check_protocol(attrs): + if not attrs.get('href', '').startswith('http:', 'https:')): + return None + return attrs + + Now it's more like this:: + + def check_protocol(attrs): + if not attrs.get((None, 'href'), '').startswith(('http:', 'https:')): + # ^^^^^^^^^^^^^^ + return None + return attrs -- linkify no longer accepts a tokenizer argument. -- clean output is different than in previous versions; particularly this version - will add end tags even if the tag will be escaped. **Changes** -- Supports Python 3.6. -- Supports html5lib >= 0.99999999 (8 9s). +* Supports Python 3.6. + +* Supports html5lib >= 0.99999999 (8 9s). + +* There's a ``bleach.Cleaner`` class that you can instantiate with your + favorite clean settings and reuse it. + +* There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib Filter. + +* You can pass ``bleach.linkifier.LinkifyFilter`` as a Filter to + ``bleach.Cleaner`` allowing you to clean and linkify in one pass. + +* Lots of bug fixes. + +* Test cleanup. + +* Documentation fixes. 
Version 1.5 (November 4th, 2016) From 1199a6323e4e632d6ec984646d8a1eb1aee937ec Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 21:33:15 -0500 Subject: [PATCH 073/314] Update documentation --- CHANGES | 4 +- LICENSE | 2 +- docs/clean.rst | 65 +++++++++++++--------- docs/conf.py | 2 +- docs/linkify.rst | 141 ++++++++++++++++++++++++++++++++--------------- 5 files changed, 140 insertions(+), 74 deletions(-) diff --git a/CHANGES b/CHANGES index c2b91c1a..d7d9b0d2 100644 --- a/CHANGES +++ b/CHANGES @@ -35,14 +35,14 @@ Version 2.0 (in development) Previously you'd do something like this:: - def check_protocol(attrs): + def check_protocol(attrs, is_new): if not attrs.get('href', '').startswith('http:', 'https:')): return None return attrs Now it's more like this:: - def check_protocol(attrs): + def check_protocol(attrs, is_new): if not attrs.get((None, 'href'), '').startswith(('http:', 'https:')): # ^^^^^^^^^^^^^^ return None diff --git a/LICENSE b/LICENSE index 90a2cb9b..467c38e4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2014-2016, Mozilla Foundation +Copyright (c) 2014-2017, Mozilla Foundation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/docs/clean.rst b/docs/clean.rst index a65d8b93..e281e2ca 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -1,9 +1,9 @@ .. _clean-chapter: .. highlightlang:: python -================== -``bleach.clean()`` -================== +========================= +Sanitizing text fragments +========================= :py:func:`bleach.clean` is Bleach's HTML sanitization method. @@ -16,21 +16,15 @@ takes care of things like unclosed and (some) misnested tags. always return ``unicode``. -If you're cleaning a lot of text, you might want to create a -:py:class:`bleach.Cleaner` instance. - .. autofunction:: bleach.clean -.. 
autoclass:: bleach.Cleaner - :members: +Allowed tags (``tags``) +======================= -Tag Whitelist -============= - -The ``tags`` kwarg is a whitelist of allowed HTML tags. It should be a list, -tuple, or other iterable. Any other HTML tags will be escaped or stripped from -the text. +The ``tags`` kwarg specifies the allowed set of HTML tags. It should be a list, +tuple, or other iterable. Any HTML tags not in this list will be escaped or +stripped from the text. For example: @@ -49,8 +43,8 @@ The default value is a relatively conservative list found in ``bleach.ALLOWED_TAGS``. -Allowed Attributes -================== +Allowed Attributes (``attributes``) +=================================== The ``attributes`` kwarg lets you specify which attributes are allowed. @@ -139,8 +133,8 @@ allowed. Otherwise, it is stripped. For example: u'an example' -Styles Whitelist -================ +Allowed styles (``styles``) +=========================== If you allow the ``style`` attribute, you will also need to whitelist styles users are allowed to set, for example ``color`` and ``background-color``. @@ -172,8 +166,8 @@ For example, to allow users to set the color and font-weight of text: Default styles are stored in ``bleach.ALLOWED_STYLES``. -Protocol Whitelist -================== +Allowed protocols (``protocols``) +================================= If you allow tags that have attributes containing a URI value (like the ``href`` attribute of an anchor tag, you may want to adapt the accepted protocols. The @@ -208,8 +202,8 @@ This adds smb to the Bleach-specified set of allowed protocols: Default protocols are in ``bleach.ALLOWED_PROTOCOLS``. -Stripping Markup -================ +Stripping markup (``strip``) +============================ By default, Bleach *escapes* tags that aren't specified in the tags whitelist and invalid markup. 
For example: @@ -237,8 +231,8 @@ If you would rather Bleach stripped this markup entirely, you can pass u'is not allowed' -Stripping Comments -================== +Stripping comments (``strip_comments``) +======================================= By default, Bleach will strip out HTML comments. To disable this behavior, set ``strip_comments=False``: @@ -256,8 +250,8 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set u'my html' -html5lib Filters -================ +html5lib Filters (``filters``) +============================== Bleach sanitizing is implemented as an html5lib Filter. The consequence of this is that we can pass the streamed content through additional specified filters @@ -298,3 +292,22 @@ Trivial Filter example: Filters change the output of cleaning. Make sure that whatever changes the filter is applying maintain the safety guarantees of the output. + + +Using ``bleach.Cleaner`` +======================== + +If you're cleaning a lot of text, you might want to create a +:py:class:`bleach.Cleaner` instance. + +.. autoclass:: bleach.Cleaner + :members: + + +Using ``bleach.sanitizer.BleachSanitizerFilter`` +================================================ + +``bleach.clean`` creates a ``bleach.Cleaner`` which creates a +``bleach.sanitizer.BleachSanitizerFilter`` which does the sanitizing work. +``BleachSanitizerFilter`` is an html5lib Filter and can be used anywhere you can +use an html5lib Filter. diff --git a/docs/conf.py b/docs/conf.py index e186c827..1d257d01 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -43,7 +43,7 @@ # General information about the project. 
project = u'Bleach' -copyright = u'2012-2015, James Socol; 2015-2016, Mozilla Foundation' +copyright = u'2012-2015, James Socol; 2015-2017, Mozilla Foundation' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/docs/linkify.rst b/docs/linkify.rst index 705000c2..6fe032ed 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -1,32 +1,31 @@ .. _linkify-chapter: .. highlightlang:: python -==================== -``bleach.linkify()`` -==================== +========================= +Linkifying text fragments +========================= ``linkify()`` searches text for links, URLs, and email addresses and lets you -control how and when those links are rendered:: - - def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - parse_email=False): - """Convert URL-like strings in an HTML fragment to links. +control how and when those links are rendered. ``linkify()`` works by building a document tree, so it's guaranteed never to do weird things to URLs in attribute values, can modify the value of attributes on ```` tags, and can even do things like skip ``
`` sections.
 
-By default, ``linkify()`` will perform some sanitization, only allowing a set
-of "safe" tags. Because it uses the HTML5 parsing algorithm, it will always
-handle things like unclosed tags.
+By default, ``linkify()`` will perform some sanitization, only allowing a set of
+"safe" tags. Because it uses the HTML5 parsing algorithm, it will always handle
+things like unclosed tags.
 
 .. note::
+
    You may pass a ``string`` or ``unicode`` object, but Bleach will always
    return ``unicode``.
 
+.. autofunction:: bleach.linkify
+
 
-Callbacks
-=========
+Callbacks for adjusting attributes (``callbacks``)
+==================================================
 
 The second argument to ``linkify()`` is a list or other iterable of callback
 functions. These callbacks can modify links that exist and links that are being
@@ -36,20 +35,23 @@ Each callback will get the following arguments::
 
     def my_callback(attrs, new=False):
 
-The ``attrs`` argument is a dict of attributes of the ```` tag. The ``new``
-argument is a boolean indicating if the link is new (e.g. an email address or
-URL found in the text) or already existed (e.g. an ```` tag found in the
-text). The ``attrs`` dict also contains a ``_text`` key, which is the innerText
-of the ```` tag.
+The ``attrs`` argument is a dict of attributes of the ```` tag. Keys of the
+``attrs`` dict are namespaced attr names. For example ``(None, 'href')``. The
+``attrs`` dict also contains a ``_text`` key, which is the innerText of the
+```` tag.
 
-The callback must return a dict of attributes (including ``_text``) or
-``None``. The new dict of attributes will be passed to the next callback in the
-list. If any callback returns ``None``, the link will not be created and the
-original text left in place, or will be removed, and its original innerText
-left in place.
+The ``new`` argument is a boolean indicating if the link is new (e.g. an email
+address or URL found in the text) or already existed (e.g. an ```` tag found
+in the text).
 
-The default value is simply to add ``rel="nofollow"``. See ``bleach.callbacks``
-for some included callback functions.
+The callback must return a dict of attributes (including ``_text``) or ``None``.
+The new dict of attributes will be passed to the next callback in the list.
+
+If any callback returns ``None``, new links will not be created and existing
+links will be removed, leaving the innerText in its place.
+
+The default callback adds ``rel="nofollow"``. See ``bleach.callbacks`` for some
+included callback functions.
 
 
 Setting Attributes
@@ -59,22 +61,24 @@ For example, to set ``rel="nofollow"`` on all links found in the text, a simple
 (and included) callback might be::
 
     def set_nofollow(attrs, new=False):
-        attrs['rel'] = 'nofollow'
+        attrs[(None, 'rel')] = 'nofollow'
         return attrs
 
-This would overwrite the value of the ``rel`` attribute if it was set.
 
-You could also make external links open in a new tab, or set a class::
+This would set the value of the ``rel`` attribute, stomping on a previous value
+if there was one.
+
+You could also make external links open in a new tab or set a class::
 
     from urlparse import urlparse
 
     def set_target(attrs, new=False):
-        p = urlparse(attrs['href'])
+        p = urlparse(attrs[(None, 'href')])
         if p.netloc not in ['my-domain.com', 'other-domain.com']:
-            attrs['target'] = '_blank'
-            attrs['class'] = 'external'
+            attrs[(None, 'target')] = '_blank'
+            attrs[(None, 'class')] = 'external'
         else:
-            attrs.pop('target', None)
+            attrs.pop((None, 'target'), None)
         return attrs
 
 
@@ -89,18 +93,20 @@ sanitizing attributes.)
 
     def allowed_attributes(attrs, new=False):
         """Only allow href, target, rel and title."""
-        allowed = ['href', 'target', 'rel', 'title']
+        allowed = [(None, 'href'), (None, 'target'),
+                   (None, 'rel'), (None, 'title')]
         return dict((k, v) for k, v in attrs.items() if k in allowed)
 
+
 Or you could remove a specific attribute, if it exists::
 
     def remove_title1(attrs, new=False):
-        attrs.pop('title', None)
+        attrs.pop((None, 'title'), None)
         return attrs
 
     def remove_title2(attrs, new=False):
-        if 'title' in attrs:
-            del attrs['title']
+        if (None, 'title') in attrs:
+            del attrs[(None, 'title')]
         return attrs
 
 
@@ -117,6 +123,7 @@ limit the length of text inside an ```` tag.
         """Shorten overly-long URLs in the text."""
         if not new:  # Only looking at newly-created links.
             return attrs
+
         # _text will be the same as the URL for new links.
         text = attrs['_text']
         if len(text) > 25:
@@ -130,10 +137,10 @@ limit the length of text inside an ```` tag.
 
     def outgoing_bouncer(attrs, new=False):
         """Send outgoing links through a bouncer."""
-        p = urlparse(attrs['href'])
+        p = urlparse(attrs[(None, 'href')])
         if p.netloc not in ['my-domain.com', 'www.my-domain.com', '']:
             bouncer = 'http://outgoing.my-domain.com/?destination=%s'
-            attrs['href'] = bouncer % quote(attrs['href'])
+            attrs[(None, 'href')] = bouncer % quote(attrs[(None, 'href')])
         return attrs
 
 
@@ -151,7 +158,7 @@ write the following callback::
             return attrs
 
         # If the TLD is '.py', make sure it starts with http: or https:
-        href = attrs['href']
+        href = attrs[(None, 'href')]
         if href.endswith('.py') and not href.startswith(('http:', 'https:')):
             # This looks like a Python file, not a URL. Don't make a link.
             return None
@@ -168,13 +175,13 @@ If you want to remove certain links, even if they are written in the text with
 
     def remove_mailto(attrs, new=False):
         """Remove any mailto: links."""
-        if attrs['href'].startswith('mailto:'):
+        if attrs[(None, 'href')].startswith('mailto:'):
             return None
         return attrs
 
 
-``skip_pre``
-============
+Skipping links in pre blocks (``skip_pre``)
+===========================================
 
 ``
`` tags are often special, literal sections. If you don't want to create
 any new links within a ``
`` section, pass ``skip_pre=True``.
@@ -184,8 +191,8 @@ any new links within a ``
`` section, pass ``skip_pre=True``.
    tags will still be passed through all the callbacks.
 
 
-``parse_email``
-===============
+Linkifying email addresses (``parse_email``)
+============================================
 
 By default, ``linkify()`` does not create ``mailto:`` links for email
 addresses, but if you pass ``parse_email=True``, it will. ``mailto:`` links
@@ -194,4 +201,50 @@ they are newly created or already in the text, so be careful when writing
 callbacks that may need to behave differently if the protocol is ``mailto:``.
 
 
+Using ``bleach.linkifier.LinkifyFilter``
+========================================
+
+``bleach.linkify`` works by parsing an HTML fragment and then running it through
+the ``bleach.linkifier.LinkifyFilter`` when walking the tree and serializing it
+back into text.
+
+You can use this filter wherever you can use an html5lib Filter. For example, you
+could use it with ``bleach.Cleaner`` to clean and linkify in one step.
+
+For example, using all the defaults:
+
+.. doctest::
+
+   >>> from functools import partial
+
+   >>> from bleach import Cleaner
+   >>> from bleach.linkifier import LinkifyFilter
+
+   >>> cleaner = Cleaner(tags=['pre'])
+   >>> cleaner.clean('
http://example.com
') + u'
http://example.com
' + + >>> cleaner = Cleaner(tags=['pre'], filters=[LinkifyFilter]) + >>> cleaner.clean('
http://example.com
') + u'
http://example.com
' + + +And passing parameters to ``LinkifyFilter``: + +.. doctest:: + + >>> from functools import partial + + >>> from bleach import Cleaner + >>> from bleach.linkifier import LinkifyFilter + + >>> cleaner = Cleaner( + ... tags=['pre'], + ... filters=[partial(LinkifyFilter, skip_pre=True)] + ... ) + ... + >>> cleaner.clean('
http://example.com
') + u'
http://example.com
' + + .. _Crate: https://crate.io/ From ddc39ec4a30c5a378976ca97664939c67420ebe5 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 4 Mar 2017 09:44:23 -0500 Subject: [PATCH 074/314] Minor code cleanup and comments --- bleach/linkifier.py | 159 ++++++++++++++++++++++++++++---------------- bleach/sanitizer.py | 8 ++- 2 files changed, 110 insertions(+), 57 deletions(-) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index b4ba2ea8..c6a8486a 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -54,6 +54,18 @@ class LinkifyFilter(Filter): + """html5lib filter that linkifies text + + This will do the following: + + * convert email addresses into links + * convert urls into links + * edit existing links by running them through callbacks--the default is to + add a ``rel="nofollow"`` + + This filter can be used anywhere html5lib filters can be used. + + """ def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False): super(LinkifyFilter, self).__init__(source) @@ -62,6 +74,13 @@ def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False): self.parse_email = parse_email def apply_callbacks(self, attrs, is_new): + """Given an attrs dict and an is_new bool, runs through callbacks + + Callbacks can return an adjusted attrs dict or None. In the case of + None, we stop going through callbacks and return that and the link gets + dropped. + + """ for cb in self.callbacks: attrs = cb(attrs, is_new) if attrs is None: @@ -70,6 +89,23 @@ def apply_callbacks(self, attrs, is_new): def extract_character_data(self, token_list): """Extracts and squashes character sequences in a token stream""" + # FIXME(willkg): This is a terrible idea. What it does is drop all the + # tags from the token list and merge the Characters and SpaceCharacters + # tokens into a single text. + # + # So something like this:: + # + # "" "" "some text" "" "" + # + # gets converted to "some text". 
+ # + # This gets used to figure out the ``_text`` fauxttribute value for + # linkify callables. + # + # I'm not really sure how else to support that ``_text`` fauxttribute and + # maintain some modicum of backwards compatability with previous versions + # of Bleach. + out = [] for token in token_list: token_type = token['type'] @@ -86,6 +122,7 @@ def handle_email_addresses(self, src_iter): new_tokens = [] end = 0 + # For each email address we find in the text for match in email_re.finditer(text): if match.start() > end: new_tokens.append( @@ -101,13 +138,13 @@ def handle_email_addresses(self, src_iter): attrs = self.apply_callbacks(attrs, True) if attrs is None: - # Just add the text + # Just add the text--but not as a link new_tokens.append( {u'type': u'Characters', u'data': match.group(0)} ) else: - # Add a "a" tag + # Add an "a" tag for the new link _text = attrs.pop(u'_text', '') attrs = alphabetize_attributes(attrs) new_tokens.extend([ @@ -118,6 +155,8 @@ def handle_email_addresses(self, src_iter): end = match.end() if new_tokens: + # Yield the adjusted set of tokens and then continue + # through the loop if end < len(text): new_tokens.append({u'type': u'Characters', u'data': text[end:]}) @@ -128,46 +167,58 @@ def handle_email_addresses(self, src_iter): yield token - def strip_wrapping_parentheses(self, fragment): - """Strips wrapping parentheses""" + def strip_parentheses(self, fragment): + """Strips parentheses from before and after url""" openp = closep = 0 # Count consecutive opening parentheses at the beginning of the # fragment (string) - for char in fragment: - if char == '(': - openp += 1 - else: - break - - if openp: - newer_frag = '' - # Cut the consecutive opening brackets from the fragment - fragment = fragment[openp:] - - # Reverse the fragment for easier detection of parentheses - # inside the URL - reverse_fragment = fragment[::-1] - skip = False - for char in reverse_fragment: - if char == ')' and closep < openp and not skip: - # Remove the 
closing parentheses if it has a matching - # opening parentheses (they are balanced). - closep += 1 - continue - - elif char != ')': - # Do not remove ')' from the URL itself. - skip = True - - newer_frag += char - - # Reverse fragment back - fragment = newer_frag[::-1] + if fragment.startswith(u'('): + for char in fragment: + if char == '(': + openp += 1 + else: + break + + if openp: + newer_frag = '' + + # Cut the consecutive opening brackets from the fragment + fragment = fragment[openp:] + + # Reverse the fragment for easier detection of parentheses + # inside the URL + reverse_fragment = fragment[::-1] + skip = False + for char in reverse_fragment: + if char == ')' and closep < openp and not skip: + # Remove the closing parentheses if it has a matching + # opening parentheses (they are balanced). + closep += 1 + continue + + elif char != ')': + # Do not remove ')' from the URL itself. + skip = True + + newer_frag += char + + # Reverse fragment back + fragment = newer_frag[::-1] + + # Sometimes we pick up ) at the end of a url, but the url is in a + # parenthesized phrase like: + # + # "i looked at the site (at http://example.com)" + if fragment.endswith(u')') and u'(' not in fragment: + new_fragment = fragment.rstrip(u')') + closep += (len(fragment) - len(new_fragment)) + fragment = new_fragment return fragment, u'(' * openp, u')' * closep def strip_punctuation(self, fragment): + """Strips punctuation at the end of a url match""" match = re.search(punct_re, fragment) if match: return fragment[0:match.start()], match.group(0) @@ -192,19 +243,12 @@ def handle_links(self, src_iter): prefix = suffix = '' # Sometimes we pick up ( and ), so drop them from the url - if url.startswith('('): - url, prefix, suffix = self.strip_wrapping_parentheses(url) - - if url.endswith(u')') and u'(' not in url: - new_url = url.rstrip(u')') - suffix = url[len(new_url):] + suffix - url = new_url + url, prefix, suffix = self.strip_parentheses(url) # Sometimes we pick up . 
and , at the end of the url that's # part of the sentence and not the url so drop it url, punct_suffix = self.strip_punctuation(url) - if punct_suffix: - suffix = suffix + punct_suffix + suffix = suffix + punct_suffix # If there's no protocol, add one if re.search(proto_re, url): @@ -218,19 +262,20 @@ def handle_links(self, src_iter): } attrs = self.apply_callbacks(attrs, True) - if prefix: - new_tokens.append( - {u'type': u'Characters', u'data': prefix} - ) - if attrs is None: # Just add the text new_tokens.append( - {u'type': u'Characters', u'data': url} + {u'type': u'Characters', u'data': prefix + url + suffix} ) else: - # Add an "a" tag! + # Add the "a" tag! + + if prefix: + new_tokens.append( + {u'type': u'Characters', u'data': prefix} + ) + _text = attrs.pop(u'_text', '') attrs = alphabetize_attributes(attrs) @@ -240,14 +285,16 @@ def handle_links(self, src_iter): {u'type': u'EndTag', u'name': 'a'}, ]) - if suffix: - new_tokens.append( - {u'type': u'Characters', u'data': suffix} - ) + if suffix: + new_tokens.append( + {u'type': u'Characters', u'data': suffix} + ) end = match.end() if new_tokens: + # Yield the adjusted set of tokens and then continue + # through the loop if end < len(text): new_tokens.append({u'type': u'Characters', u'data': text[end:]}) @@ -334,9 +381,9 @@ def __iter__(self): # yet continue - elif in_pre: + elif in_pre and self.skip_pre: # NOTE(willkg): We put this clause here since in_a and - # switching in and out of is_a takes precedence. + # switching in and out of in_a takes precedence. if token['type'] == 'EndTag' and token['name'] == 'pre': in_pre = False diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 610dd903..18ce49f4 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -9,6 +9,11 @@ class BleachSanitizerFilter(sanitizer.Filter): + """html5lib Filter that sanitizes text + + This filter can be used anywhere html5lib filters can be used. 
+ + """ def __init__(self, source, allowed_attributes_map, strip_disallowed_elements=False, strip_html_comments=True, **kwargs): @@ -60,6 +65,7 @@ def sanitize_token(self, token): return token def allow_token(self, token): + """Handles the case where we're allowing the tag""" if 'data' in token: allowed_attributes = self.allowed_attributes_map.get(token['name'], []) if not callable(allowed_attributes): @@ -131,7 +137,7 @@ def allow_token(self, token): return token def sanitize_css(self, style): - """html5lib sanitizer filter replacement to fix issues""" + """Sanitizes css in style tags""" # disallow urls style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) From 6968e5d8eec7be235ea167d97c73ea2937ad0a59 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 4 Mar 2017 09:52:37 -0500 Subject: [PATCH 075/314] Add tests for alphabetize_attributes --- tests/test_utils.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tests/test_utils.py diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..076617df --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,44 @@ +from collections import OrderedDict + +from bleach.utils import alphabetize_attributes + + +class TestAlphabeticalAttributes: + def test_empty_cases(self): + assert alphabetize_attributes(None) is None + + assert alphabetize_attributes({}) == {} + + def test_ordering(self): + assert ( + alphabetize_attributes({ + (None, 'a'): 1, + (None, 'b'): 2 + }) == + OrderedDict([ + ((None, 'a'), 1), + ((None, 'b'), 2) + ]) + ) + assert ( + alphabetize_attributes({ + (None, 'b'): 1, + (None, 'a'): 2} + ) == + OrderedDict([ + ((None, 'a'), 2), + ((None, 'b'), 1) + ]) + ) + + def test_different_namespaces(self): + assert ( + alphabetize_attributes({ + ('xlink', 'href'): 'abc', + (None, 'alt'): '123' + }) == + OrderedDict([ + ((None, 'alt'), '123'), + (('xlink', 'href'), 'abc') + ]) + ) From 
a1a85e9226e2be45a4eded6680b5ede4b2fa1e4c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 4 Mar 2017 10:37:37 -0500 Subject: [PATCH 076/314] Fix handling for over-eager url matching --- bleach/linkifier.py | 100 +++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 58 deletions(-) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index c6a8486a..04ab8275 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -40,8 +40,6 @@ proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) -punct_re = re.compile(r'([\.,]+)$') - email_re = re.compile( r"""(? Date: Sat, 4 Mar 2017 10:41:20 -0500 Subject: [PATCH 077/314] Add tests from #78 --- tests/test_links.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_links.py b/tests/test_links.py index 1712d199..8e166543 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -467,6 +467,14 @@ def test_sarcasm(): '(http://en.wikipedia.org/wiki/)Test_(assessment', ('(', 'en.wikipedia.org/wiki/)Test_(assessment', 'http://en.wikipedia.org/wiki/)Test_(assessment', '') + ), + ( + 'hello (http://www.mu.de/blah.html) world', + ('hello (', 'www.mu.de/blah.html', 'http://www.mu.de/blah.html', ') world') + ), + ( + 'hello (http://www.mu.de/blah.html). world', + ('hello (', 'www.mu.de/blah.html', 'http://www.mu.de/blah.html', '). world') ) ]) def test_wrapping_parentheses(data, expected_data): From bb44c173700853c0da33a1cbb43632e95f54e885 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 4 Mar 2017 10:53:43 -0500 Subject: [PATCH 078/314] Move "a" tag handling to a method --- bleach/linkifier.py | 92 +++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index 04ab8275..a3e46009 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -254,7 +254,6 @@ def handle_links(self, src_iter): else: # Add the "a" tag! 
- if prefix: new_tokens.append( {u'type': u'Characters', u'data': prefix} @@ -289,6 +288,49 @@ def handle_links(self, src_iter): yield token + def handle_a_tag(self, token_buffer): + """Handle the "a" tag + + This could adjust the link or drop it altogether depending on what the + callbacks return. + + This yields the new set of tokens. + + """ + a_token = token_buffer[0] + if a_token['data']: + attrs = a_token['data'] + else: + attrs = {} + text = self.extract_character_data(token_buffer) + attrs['_text'] = text + + attrs = self.apply_callbacks(attrs, False) + + if attrs is None: + # We're dropping the "a" tag and everything else and replacing + # it with character data. So emit that token. + yield {'type': 'Characters', 'data': text} + + else: + new_text = attrs.pop('_text', '') + a_token['data'] = alphabetize_attributes(attrs) + + if text == new_text: + # The callbacks didn't change the text, so we yield the new "a" + # token, then whatever else was there, then the end "a" token + yield a_token + for mem in token_buffer[1:]: + yield mem + + else: + # If the callbacks changed the text, then we're going to drop + # all the tokens between the start and end "a" tags and replace + # it with the new text + yield a_token + yield {'type': 'Characters', 'data': force_unicode(new_text)} + yield token_buffer[-1] + def __iter__(self): in_a = False in_pre = False @@ -300,47 +342,15 @@ def __iter__(self): # Handle the case where we're in an "a" tag--we want to buffer tokens # until we hit an end "a" tag. if token['type'] == 'EndTag' and token['name'] == 'a': - # We're no longer in an "a" tag, so we get all the things we - # need to apply callbacks and then figure out what to do with - # this "a" tag. 
- in_a = False - a_token = token_buffer[0] - if a_token['data']: - attrs = a_token['data'] - else: - attrs = {} - - text = self.extract_character_data(token_buffer) - attrs['_text'] = text - - attrs = self.apply_callbacks(attrs, False) - if attrs is None: - # We're dropping the "a" tag and everything else and replacing - # it with character data. So emit that token. - yield {'type': 'Characters', 'data': text} - - else: - new_text = attrs.pop('_text', '') - # FIXME(willkg): add nofollow here - a_token['data'] = alphabetize_attributes(attrs) - - if text == new_text: - # The callbacks didn't change the text, so we yield the - # new "a" token, then whatever else was there, then the - # end "a" token - yield a_token - for mem in token_buffer[1:]: - yield mem - yield token - - else: - # If the callbacks changed the text, then we're going - # to drop all the tokens between the start and end "a" - # tags and replace it with the new text - yield a_token - yield {'type': 'Characters', 'data': force_unicode(new_text)} - yield token + # Add the end tag to the token buffer and then handle them + # and yield anything returned + token_buffer.append(token) + for new_token in self.handle_a_tag(token_buffer): + yield new_token + # Clear "a" related state and continue since we've yielded all + # the tokens we're going to yield + in_a = False token_buffer = [] continue From 460aa6d3a95f5cc89f9589e8398491f3a5e2180d Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 6 Mar 2017 13:55:05 -0500 Subject: [PATCH 079/314] Restructure linkify; clean up __init__; update docs This is a ton of changes all in one commit. Sorry. * Restructures linkify so it mirrors clean. This has the nicety in that the two are parallel and can be used the same. 
* Rework how url_re and email_re work so that it's possible to override them and/or provide your own list of allowed procotols and TLDs * Overhaul the docs including converting linkify examples to doctest * Update CHANGES --- CHANGES | 23 ++-- bleach/__init__.py | 245 +++++++++--------------------------------- bleach/linkifier.py | 100 ++++++++++++++--- bleach/sanitizer.py | 142 ++++++++++++++++++++++++ docs/clean.rst | 38 ++++--- docs/linkify.rst | 250 +++++++++++++++++++++++++++++-------------- tests/test_basics.py | 8 +- tests/test_links.py | 44 ++++++-- 8 files changed, 518 insertions(+), 332 deletions(-) diff --git a/CHANGES b/CHANGES index d7d9b0d2..9afe859f 100644 --- a/CHANGES +++ b/CHANGES @@ -17,7 +17,7 @@ Version 2.0 (in development) * ``bleach.clean`` and friends were rewritten - ``clean`` is now implemented as an html5lib Filter and happens at a different + ``clean`` was reimplemented as an html5lib filter and happens at a different step in the HTML parsing -> traversing -> serializing process. Because of that, there are some differences in clean's output as compared with previous versions. @@ -43,11 +43,14 @@ Version 2.0 (in development) Now it's more like this:: def check_protocol(attrs, is_new): - if not attrs.get((None, 'href'), '').startswith(('http:', 'https:')): + if not attrs.get((None, u'href'), u'').startswith(('http:', 'https:')): # ^^^^^^^^^^^^^^ return None return attrs + Further, you need to make sure you're always using unicode values. If you + don't then html5lib will raise an assertion error that the value is not + unicode. **Changes** @@ -55,17 +58,19 @@ Version 2.0 (in development) * Supports html5lib >= 0.99999999 (8 9s). -* There's a ``bleach.Cleaner`` class that you can instantiate with your - favorite clean settings and reuse it. +* There's a ``bleach.sanitizer.Cleaner`` class that you can instantiate with your + favorite clean settings for easy reuse. -* There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib Filter. 
+* There's a ``bleach.linkifier.Linker`` class that you can instantiate with your + favorite linkify settings for easy reuse. -* You can pass ``bleach.linkifier.LinkifyFilter`` as a Filter to - ``bleach.Cleaner`` allowing you to clean and linkify in one pass. +* There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib filter that + you can pass as a filter to ``bleach.Cleaner`` allowing you to clean and + linkify in one pass. -* Lots of bug fixes. +* Tons of bug fixes. -* Test cleanup. +* Cleaned up tests. * Documentation fixes. diff --git a/bleach/__init__.py b/bleach/__init__.py index 0155a127..07b5075c 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -1,169 +1,28 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import logging -import re - -import html5lib -from html5lib.filters import sanitizer -from html5lib.filters.sanitizer import allowed_protocols -from html5lib.serializer import HTMLSerializer - -from bleach import callbacks as linkify_callbacks -from bleach.encoding import force_unicode -from bleach.linkifier import LinkifyFilter -from bleach.sanitizer import BleachSanitizerFilter -from bleach.version import __version__, VERSION # flake8: noqa - -__all__ = ['Cleaner', 'clean', 'linkify'] - -log = logging.getLogger(__name__) -log.addHandler(logging.NullHandler()) - -ALLOWED_TAGS = [ - 'a', - 'abbr', - 'acronym', - 'b', - 'blockquote', - 'code', - 'em', - 'i', - 'li', - 'ol', - 'strong', - 'ul', -] - -ALLOWED_ATTRIBUTES = { - 'a': ['href', 'title'], - 'abbr': ['title'], - 'acronym': ['title'], -} - -ALLOWED_STYLES = [] - -ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] - -ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) -# a simple routine that returns the tag name with the namespace prefix -# as returned by etree's Element.tag attribute - -DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] - - -class Cleaner(object): - """Cleaner for cleaning HTML fragments of malicious content - - This cleaner is a 
security-focused function whose sole purpose is to remove - malicious content from a string such that it can be displayed as content in - a web page. - - This cleaner is not designed to use to transform content to be used in - non-web-page contexts. - - To use:: - - from bleach import Cleaner - - cleaner = Cleaner() - - for text in all_the_yucky_things: - sanitized = cleaner.clean(text) - - """ - - def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, - styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, - strip_comments=True, filters=None): - """Initializes a Cleaner - - :arg tags: whitelist of allowed tags; defaults to - ``bleach.ALLOWED_TAGS`` - - :arg attributes: whitelist of allowed attributes; defaults to - ``bleach.ALLOWED_ATTRIBUTES`` - - :arg styles: whitelist of allowed css; defaults to - ``bleach.ALLOWED_STYLES`` - - :arg protocols: whitelist of allowed protocols for links; defaults - to ``bleach.ALLOWED_PROTOCOLS`` - - :arg strip: whether or not to strip disallowed elements - - :arg strip_comments: whether or not to strip HTML comments - - :arg filters: list of html5lib Filter classes to pass streamed content through - - See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters - .. Warning:: - - Using filters changes the output of ``bleach.Cleaner.clean``. - Make sure the way the filters change the output are secure. 
- - """ - self.tags = tags - self.attributes = attributes - self.styles = styles - self.protocols = protocols - self.strip = strip - self.strip_comments = strip_comments - self.filters = filters or [] - - self.parser = html5lib.HTMLParser(namespaceHTMLElements=False) - self.walker = html5lib.getTreeWalker('etree') - self.serializer = HTMLSerializer( - quote_attr_values='always', - omit_optional_tags=False, - - # Bleach has its own sanitizer, so don't use the html5lib one - sanitize=False, - - # Bleach sanitizer alphabetizes already, so don't use the html5lib one - alphabetical_attributes=False, - ) - - def clean(self, text): - """Cleans text and returns sanitized result as unicode - - :arg str text: text to be cleaned - - :returns: sanitized text as unicode - - """ - if not text: - return u'' - - text = force_unicode(text) - - dom = self.parser.parseFragment(text) - filtered = BleachSanitizerFilter( - source=self.walker(dom), - - # Bleach-sanitizer-specific things - allowed_attributes_map=self.attributes, - strip_disallowed_elements=self.strip, - strip_html_comments=self.strip_comments, - - # html5lib-sanitizer things - allowed_elements=self.tags, - allowed_css_properties=self.styles, - allowed_protocols=self.protocols, - allowed_svg_properties=[], - ) - - # Apply any filters after the BleachSanitizerFilter - for filter_class in self.filters: - filtered = filter_class(source=filtered) +from bleach.linkifier import ( + DEFAULT_CALLBACKS, + Linker, + LinkifyFilter, +) +from bleach.sanitizer import ( + ALLOWED_ATTRIBUTES, + ALLOWED_PROTOCOLS, + ALLOWED_STYLES, + ALLOWED_TAGS, + BleachSanitizerFilter, + Cleaner, +) +from bleach.version import __version__, VERSION # flake8: noqa - return self.serializer.render(filtered) +__all__ = ['clean', 'linkify'] def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, - strip_comments=True, filters=None): + strip_comments=True): """Clean an HTML fragment of 
malicious content and return it This function is a security-focused function whose sole purpose is to @@ -182,36 +41,27 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, .. Note:: - If you're cleaning a lot of text and passing the same argument - values, consider caching a ``Cleaner`` instance. + If you're cleaning a lot of text and passing the same argument values or + you want more configurability, consider using a + :py:class:`bleach.sanitizer.Cleaner` instance. - :arg text: the text to clean + :arg str text: the text to clean - :arg tags: whitelist of allowed tags; defaults to + :arg list tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS`` - :arg attributes: whitelist of allowed attributes; defaults to + :arg dict attributes: whitelist of allowed attributes; defaults to ``bleach.ALLOWED_ATTRIBUTES`` - :arg styles: whitelist of allowed css; defaults to + :arg list styles: whitelist of allowed css; defaults to ``bleach.ALLOWED_STYLES`` - :arg protocols: whitelist of allowed protocols for links; defaults + :arg list protocols: whitelist of allowed protocols for links; defaults to ``bleach.ALLOWED_PROTOCOLS`` - :arg strip: whether or not to strip disallowed elements - - :arg strip_comments: whether or not to strip HTML comments - - :arg filters: list of html5lib Filter classes to pass streamed content through - - See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters + :arg bool strip: whether or not to strip disallowed elements - .. Warning:: - - Using filters changes the output of - ``bleach.Cleaner.clean``. Make sure the way the filters - change the output are secure. 
+ :arg bool strip_comments: whether or not to strip HTML comments :returns: cleaned text as unicode @@ -223,7 +73,6 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, protocols=protocols, strip=strip, strip_comments=strip_comments, - filters=filters, ) return cleaner.clean(text) @@ -231,40 +80,42 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False): """Convert URL-like strings in an HTML fragment to links - ``linkify()`` converts strings that look like URLs, domain names and email + This function converts strings that look like URLs, domain names and email addresses in text that may be an HTML fragment to links, while preserving: 1. links already in the string 2. urls found in attributes 3. email addresses - ``linkify()`` does a best-effort approach and tries to recover from bad + linkify does a best-effort approach and tries to recover from bad situations due to crazy text. - """ - parser = html5lib.HTMLParser(namespaceHTMLElements=False) - walker = html5lib.getTreeWalker('etree') - serializer = HTMLSerializer( - quote_attr_values='always', - omit_optional_tags=False, + .. Note:: - # Bleach has its own sanitizer, so don't use the html5lib one - sanitize=False, + If you're linking a lot of text and passing the same argument values or + you want more configurability, consider using a + :py:class:`bleach.linkifier.Linker` instance. - # Bleach sanitizer alphabetizes already, so don't use the html5lib one - alphabetical_attributes=False, - ) + .. Note:: + + If you have text that you want to clean and then linkify, consider using + the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean + pass. That way you're not parsing the HTML twice. 
+ + :arg str text: the text to linkify - text = force_unicode(text) + :arg list callbacks: list of callbacks to run when adjusting tag attributes - if not text: - return u'' + :arg bool skip_pre: whether or not to skip linkifying text in a ``pre`` tag - dom = parser.parseFragment(text) - filtered = LinkifyFilter( - source=walker(dom), + :arg bool parse_email: whether or not to linkify email addresses + + :returns: linkified text as unicode + + """ + linker = Linker( callbacks=callbacks, skip_pre=skip_pre, parse_email=parse_email ) - return serializer.render(filtered) + return linker.linkify(text) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index a3e46009..1396f056 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -1,14 +1,19 @@ from __future__ import unicode_literals import re +import html5lib from html5lib.filters.base import Filter +from html5lib.filters.sanitizer import allowed_protocols +from html5lib.serializer import HTMLSerializer -from bleach import allowed_protocols +from bleach import callbacks as linkify_callbacks from bleach.encoding import force_unicode from bleach.utils import alphabetize_attributes -# FIXME(willkg): Move this to a constants module. +DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] + + TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk @@ -27,20 +32,37 @@ TLDS.reverse() -url_re = re.compile( - r"""\(* # Match any opening parentheses. - \b(?"]*)? 
- # /path/zz (excluding "unsafe" chars from RFC 1738, - # except for # and ~, which happen in practice) - """.format('|'.join(allowed_protocols), '|'.join(TLDS)), - re.IGNORECASE | re.VERBOSE | re.UNICODE) +def build_url_re(tlds=TLDS, protocols=allowed_protocols): + """Builds the url regex used by linkifier + + If you want a different set of tlds or allowed protocols, pass those in + and stomp on the existing ``url_re``:: + + from bleach import linkifier + + my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols) + + linker = LinkifyFilter(url_re=my_url_re) + + """ + return re.compile( + r"""\(* # Match any opening parentheses. + \b(?"]*)? + # /path/zz (excluding "unsafe" chars from RFC 1738, + # except for # and ~, which happen in practice) + """.format('|'.join(protocols), '|'.join(tlds)), + re.IGNORECASE | re.VERBOSE | re.UNICODE) + + +URL_RE = build_url_re() + +PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) -proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) -email_re = re.compile( +EMAIL_RE = re.compile( r"""(? 
end: new_tokens.append( {u'type': u'Characters', u'data': text[end:match.start()]} @@ -221,7 +287,7 @@ def handle_links(self, src_iter): new_tokens = [] end = 0 - for match in url_re.finditer(text): + for match in self.url_re.finditer(text): if match.start() > end: new_tokens.append( {u'type': u'Characters', u'data': text[end:match.start()]} @@ -235,7 +301,7 @@ def handle_links(self, src_iter): url, prefix, suffix = self.strip_non_url_bits(url) # If there's no protocol, add one - if re.search(proto_re, url): + if PROTO_RE.search(url): href = url else: href = u'http://%s' % url diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 18ce49f4..fcbcd915 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -2,12 +2,154 @@ import re from xml.sax.saxutils import unescape +import html5lib from html5lib.constants import namespaces from html5lib.filters import sanitizer +from html5lib.serializer import HTMLSerializer +from bleach.encoding import force_unicode from bleach.utils import alphabetize_attributes +ALLOWED_TAGS = [ + 'a', + 'abbr', + 'acronym', + 'b', + 'blockquote', + 'code', + 'em', + 'i', + 'li', + 'ol', + 'strong', + 'ul', +] + +ALLOWED_ATTRIBUTES = { + 'a': ['href', 'title'], + 'abbr': ['title'], + 'acronym': ['title'], +} + +ALLOWED_STYLES = [] + +ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] + +ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) +# a simple routine that returns the tag name with the namespace prefix +# as returned by etree's Element.tag attribute + + +class Cleaner(object): + """Cleaner for cleaning HTML fragments of malicious content + + This cleaner is a security-focused function whose sole purpose is to remove + malicious content from a string such that it can be displayed as content in + a web page. + + This cleaner is not designed to use to transform content to be used in + non-web-page contexts. 
+ + To use:: + + from bleach import Cleaner + + cleaner = Cleaner() + + for text in all_the_yucky_things: + sanitized = cleaner.clean(text) + + """ + + def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, + styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, + strip_comments=True, filters=None): + """Initializes a Cleaner + + :arg tags: whitelist of allowed tags; defaults to + ``bleach.ALLOWED_TAGS`` + + :arg attributes: whitelist of allowed attributes; defaults to + ``bleach.ALLOWED_ATTRIBUTES`` + + :arg styles: whitelist of allowed css; defaults to + ``bleach.ALLOWED_STYLES`` + + :arg protocols: whitelist of allowed protocols for links; defaults + to ``bleach.ALLOWED_PROTOCOLS`` + + :arg strip: whether or not to strip disallowed elements + + :arg strip_comments: whether or not to strip HTML comments + + :arg filters: list of html5lib Filter classes to pass streamed content through + + See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters + + .. Warning:: + + Using filters changes the output of ``bleach.Cleaner.clean``. + Make sure the way the filters change the output are secure. 
+ + """ + self.tags = tags + self.attributes = attributes + self.styles = styles + self.protocols = protocols + self.strip = strip + self.strip_comments = strip_comments + self.filters = filters or [] + + self.parser = html5lib.HTMLParser(namespaceHTMLElements=False) + self.walker = html5lib.getTreeWalker('etree') + self.serializer = HTMLSerializer( + quote_attr_values='always', + omit_optional_tags=False, + + # Bleach has its own sanitizer, so don't use the html5lib one + sanitize=False, + + # Bleach sanitizer alphabetizes already, so don't use the html5lib one + alphabetical_attributes=False, + ) + + def clean(self, text): + """Cleans text and returns sanitized result as unicode + + :arg str text: text to be cleaned + + :returns: sanitized text as unicode + + """ + if not text: + return u'' + + text = force_unicode(text) + + dom = self.parser.parseFragment(text) + filtered = BleachSanitizerFilter( + source=self.walker(dom), + + # Bleach-sanitizer-specific things + allowed_attributes_map=self.attributes, + strip_disallowed_elements=self.strip, + strip_html_comments=self.strip_comments, + + # html5lib-sanitizer things + allowed_elements=self.tags, + allowed_css_properties=self.styles, + allowed_protocols=self.protocols, + allowed_svg_properties=[], + ) + + # Apply any filters after the BleachSanitizerFilter + for filter_class in self.filters: + filtered = filter_class(source=filtered) + + return self.serializer.render(filtered) + + class BleachSanitizerFilter(sanitizer.Filter): """html5lib Filter that sanitizes text diff --git a/docs/clean.rst b/docs/clean.rst index e281e2ca..161e4357 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -214,6 +214,7 @@ whitelist and invalid markup. 
For example: >>> bleach.clean('is not allowed') u'<span>is not allowed</span>' + >>> bleach.clean('is not allowed', tags=['b']) u'<span>is not allowed</span>' @@ -227,6 +228,7 @@ If you would rather Bleach stripped this markup entirely, you can pass >>> bleach.clean('is not allowed', strip=True) u'is not allowed' + >>> bleach.clean('is not allowed', tags=['b'], strip=True) u'is not allowed' @@ -250,10 +252,20 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set u'my html' +Using ``bleach.sanitizer.Cleaner`` +================================== + +If you're cleaning a lot of text or you need better control of things, you +should create a :py:class:`bleach.sanitizer.Cleaner` instance. + +.. autoclass:: bleach.sanitizer.Cleaner + :members: + + html5lib Filters (``filters``) -============================== +------------------------------ -Bleach sanitizing is implemented as an html5lib Filter. The consequence of this +Bleach sanitizing is implemented as an html5lib filter. The consequence of this is that we can pass the streamed content through additional specified filters after the :py:class:`bleach.sanitizer.BleachSanitizingFilter` filter has run. @@ -267,7 +279,7 @@ Trivial Filter example: .. doctest:: - >>> import bleach + >>> from bleach.sanitizer import Cleaner >>> from html5lib.filters.base import Filter >>> class MooFilter(Filter): @@ -283,8 +295,9 @@ Trivial Filter example: ... } ... >>> TAGS = ['img'] + >>> cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) >>> dirty = 'this is cute! ' - >>> bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter]) + >>> cleaner.clean(dirty) u'this is cute! ' @@ -294,20 +307,11 @@ Trivial Filter example: filter is applying maintain the safety guarantees of the output. -Using ``bleach.Cleaner`` -======================== - -If you're cleaning a lot of text, you might want to create a -:py:class:`bleach.Cleaner` instance. - -.. 
autoclass:: bleach.Cleaner - :members: - - Using ``bleach.sanitizer.BleachSanitizerFilter`` ================================================ -``bleach.clean`` creates a ``bleach.Cleaner`` which creates a +``bleach.clean`` creates a ``bleach.sanitizer.Cleaner`` which creates a ``bleach.sanitizer.BleachSanitizerFilter`` which does the sanitizing work. -``BleachSanitizerFilter`` is an html5lib Filter and can be used anywhere you can -use an html5lib Filter. + +``BleachSanitizerFilter`` is an html5lib filter and can be used anywhere you can +use an html5lib filter. diff --git a/docs/linkify.rst b/docs/linkify.rst index 6fe032ed..a468830c 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -5,22 +5,19 @@ Linkifying text fragments ========================= -``linkify()`` searches text for links, URLs, and email addresses and lets you -control how and when those links are rendered. +:py:func:`bleach.linkify` searches text for links, URLs, and email addresses and +lets you control how and when those links are rendered. -``linkify()`` works by building a document tree, so it's guaranteed never to do -weird things to URLs in attribute values, can modify the value of attributes on -```` tags, and can even do things like skip ``
`` sections.
-
-By default, ``linkify()`` will perform some sanitization, only allowing a set of
-"safe" tags. Because it uses the HTML5 parsing algorithm, it will always handle
-things like unclosed tags.
+It works by building a document tree, so it's guaranteed never to do weird
+things to URLs in attribute values, can modify the value of attributes on
+``<a>`` tags and can even do things like skip ``<pre>`` sections.
 
 .. note::
 
    You may pass a ``string`` or ``unicode`` object, but Bleach will always
    return ``unicode``.
 
+
 .. autofunction:: bleach.linkify
 
 
@@ -57,29 +54,44 @@ included callback functions.
 Setting Attributes
 ------------------
 
-For example, to set ``rel="nofollow"`` on all links found in the text, a simple
-(and included) callback might be::
+For example, you could add a ``title`` attribute to all links:
+
+.. doctest::
+
+   >>> from bleach.linkifier import Linker
 
-    def set_nofollow(attrs, new=False):
-        attrs[(None, 'rel')] = 'nofollow'
-        return attrs
+   >>> def set_title(attrs, new=False):
+   ...     attrs[(None, u'title')] = u'link in user text'
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[set_title])
+   >>> linker.linkify('abc http://example.com def')
+   u'abc <a href="http://example.com" title="link in user text">http://example.com</a> def'
 
 
 This would set the value of the ``rel`` attribute, stomping on a previous value
 if there was one.
 
-You could also make external links open in a new tab or set a class::
+Here's another example that makes external links open in a new tab and look like
+an external link:
 
-    from urlparse import urlparse
+.. doctest::
 
-    def set_target(attrs, new=False):
-        p = urlparse(attrs[(None, 'href')])
-        if p.netloc not in ['my-domain.com', 'other-domain.com']:
-            attrs[(None, 'target')] = '_blank'
-            attrs[(None, 'class')] = 'external'
-        else:
-            attrs.pop((None, 'target'), None)
-        return attrs
+   >>> from urlparse import urlparse
+   >>> from bleach.linkifier import Linker
+
+   >>> def set_target(attrs, new=False):
+   ...     p = urlparse(attrs[(None, u'href')])
+   ...     if p.netloc not in ['my-domain.com', 'other-domain.com']:
+   ...         attrs[(None, u'target')] = u'_blank'
+   ...         attrs[(None, u'class')] = u'external'
+   ...     else:
+   ...         attrs.pop((None, u'target'), None)
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[set_target])
+   >>> linker.linkify('abc http://example.com def')
+   u'abc <a class="external" href="http://example.com" target="_blank">http://example.com</a> def'
 
 
 Removing Attributes
@@ -89,25 +101,42 @@ You can easily remove attributes you don't want to allow, even on existing
 links (```` tags) in the text. (See also :ref:`clean() ` for
 sanitizing attributes.)
 
-::
+.. doctest::
 
-    def allowed_attributes(attrs, new=False):
-        """Only allow href, target, rel and title."""
-        allowed = [(None, 'href'), (None, 'target'),
-                   (None, 'rel'), (None, 'title')]
-        return dict((k, v) for k, v in attrs.items() if k in allowed)
+   >>> from bleach.linkifier import Linker
+
+   >>> def allowed_attrs(attrs, new=False):
+   ...     """Only allow href, target, rel and title."""
+   ...     allowed = [
+   ...         (None, u'href'),
+   ...         (None, u'target'),
+   ...         (None, u'rel'),
+   ...         (None, u'title'),
+   ...         u'_text',
+   ...     ]
+   ...     return dict((k, v) for k, v in attrs.items() if k in allowed)
+   ...
+   >>> linker = Linker(callbacks=[allowed_attrs])
+   >>> linker.linkify('<a style="font-weight: super bold;" href="http://example.com">link</a>')
+   u'<a href="http://example.com">link</a>'
 
 
-Or you could remove a specific attribute, if it exists::
+Or you could remove a specific attribute, if it exists:
 
-    def remove_title1(attrs, new=False):
-        attrs.pop((None, 'title'), None)
-        return attrs
+.. doctest::
+
+   >>> from bleach.linkifier import Linker
 
-    def remove_title2(attrs, new=False):
-        if (None, 'title') in attrs:
-            del attrs[(None, 'title')]
-        return attrs
+   >>> def remove_title(attrs, new=False):
+   ...     attrs.pop((None, u'title'), None)
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[remove_title])
+   >>> linker.linkify('<a href="http://example.com">link</a>')
+   u'<a href="http://example.com">link</a>'
+
+   >>> linker.linkify('<a title="bad title" href="http://example.com">link</a>')
+   u'<a href="http://example.com">link</a>'
 
 
 Altering Attributes
@@ -117,31 +146,50 @@ You can alter and overwrite attributes, including the link text, via the
 ``_text`` key, to, for example, pass outgoing links through a warning page, or
 limit the length of text inside an ```` tag.
 
-::
+Example of shortening link text:
 
-    def shorten_url(attrs, new=False):
-        """Shorten overly-long URLs in the text."""
-        if not new:  # Only looking at newly-created links.
-            return attrs
+.. doctest::
+
+   >>> from bleach.linkifier import Linker
+
+   >>> def shorten_url(attrs, new=False):
+   ...     """Shorten overly-long URLs in the text."""
+   ...     # Only adjust newly-created links
+   ...     if not new:
+   ...         return attrs
+   ...     # _text will be the same as the URL for new links
+   ...     text = attrs[u'_text']
+   ...     if len(text) > 25:
+   ...         attrs[u'_text'] = text[0:22] + u'...'
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[shorten_url])
+   >>> linker.linkify('http://example.com/longlonglonglonglongurl')
+   u'<a href="http://example.com/longlonglonglonglongurl">http://example.com/lon...</a>'
 
-        # _text will be the same as the URL for new links.
-        text = attrs['_text']
-        if len(text) > 25:
-            attrs['_text'] = text[0:22] + '...'
-        return attrs
 
-::
+Example of switching all links to go through a bouncer first:
 
-    from urllib2 import quote
-    from urlparse import urlparse
+.. doctest::
 
-    def outgoing_bouncer(attrs, new=False):
-        """Send outgoing links through a bouncer."""
-        p = urlparse((None, attrs['href']))
-        if p.netloc not in ['my-domain.com', 'www.my-domain.com', '']:
-            bouncer = 'http://outgoing.my-domain.com/?destination=%s'
-            attrs[(None, 'href')] = bouncer % quote(attrs['href'])
-        return attrs
+   >>> from six.moves.urllib.parse import quote, urlparse
+   >>> from bleach.linkifier import Linker
+
+   >>> def outgoing_bouncer(attrs, new=False):
+   ...     """Send outgoing links through a bouncer."""
+   ...     href_key = (None, u'href')
+   ...     p = urlparse(attrs.get(href_key, None))
+   ...     if p.netloc not in ['example.com', 'www.example.com', '']:
+   ...         bouncer = 'http://bn.ce/?destination=%s'
+   ...         attrs[href_key] = bouncer % quote(attrs[href_key])
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[outgoing_bouncer])
+   >>> linker.linkify('http://example.com')
+   u'<a href="http://example.com">http://example.com</a>'
+
+   >>> linker.linkify('http://foo.com')
+   u'<a href="http://bn.ce/?destination=http%3A//foo.com">http://foo.com</a>'
 
 
 Preventing Links
@@ -151,33 +199,53 @@ A slightly more complex example is inspired by Crate_, where strings like
 ``models.py`` are often found, and linkified. ``.py`` is the ccTLD for
 Paraguay, so ``example.py`` may be a legitimate URL, but in the case of a site
 dedicated to Python packages, odds are it is not. In this case, Crate_ could
-write the following callback::
+write the following callback:
 
-    def dont_linkify_python(attrs, new=False):
-        if not new:  # This is an existing  tag, leave it be.
-            return attrs
+.. doctest::
 
-        # If the TLD is '.py', make sure it starts with http: or https:
-        href = attrs[(None, 'href')]
-        if href.endswith('.py') and not href.startswith(('http:', 'https:')):
-            # This looks like a Python file, not a URL. Don't make a link.
-            return None
+   >>> from bleach.linkifier import Linker
+
+   >>> def dont_linkify_python(attrs, new=False):
+   ...     # This is an existing link, so leave it be
+   ...     if not new:
+   ...         return attrs
+   ...     # If the TLD is '.py', make sure it starts with http: or https:.
+   ...     # Use _text because that's the original text
+   ...     link_text = attrs[u'_text']
+   ...     if link_text.endswith('.py') and not link_text.startswith(('http:', 'https:')):
+   ...         # This looks like a Python file, not a URL. Don't make a link.
+   ...         return None
+   ...     # Everything checks out, keep going to the next callback.
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[dont_linkify_python])
+   >>> linker.linkify('abc http://example.com def')
+   u'abc <a href="http://example.com">http://example.com</a> def'
 
-        # Everything checks out, keep going to the next callback.
-        return attrs
+   >>> linker.linkify('abc models.py def')
+   u'abc models.py def'
 
 
 Removing Links
 --------------
 
 If you want to remove certain links, even if they are written in the text with
-```` tags, you can still return ``None``::
+```` tags, have the callback return ``None``.
 
-    def remove_mailto(attrs, new=False):
-        """Remove any mailto: links."""
-        if attrs[(None, 'href')].startswith('mailto:'):
-            return None
-        return attrs
+For example, this removes any ``mailto:`` links:
+
+.. doctest::
+
+   >>> from bleach.linkifier import Linker
+
+   >>> def remove_mailto(attrs, new=False):
+   ...     if attrs[(None, u'href')].startswith(u'mailto:'):
+   ...         return None
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[remove_mailto])
+   >>> linker.linkify('<a href="mailto:janet@example.com">mail janet!</a>')
+   u'mail janet!'
 
 
 Skipping links in pre blocks (``skip_pre``)
@@ -194,11 +262,31 @@ any new links within a ``<pre>`` section, pass ``skip_pre=True``.
 Linkifying email addresses (``parse_email``)
 ============================================
 
-By default, ``linkify()`` does not create ``mailto:`` links for email
-addresses, but if you pass ``parse_email=True``, it will. ``mailto:`` links
-will go through exactly the same set of callbacks as all other links, whether
-they are newly created or already in the text, so be careful when writing
-callbacks that may need to behave differently if the protocol is ``mailto:``.
+By default, :py:func:`bleach.linkify` does not create ``mailto:`` links for
+email addresses, but if you pass ``parse_email=True``, it will. ``mailto:``
+links will go through exactly the same set of callbacks as all other links,
+whether they are newly created or already in the text, so be careful when
+writing callbacks that may need to behave differently if the protocol is
+``mailto:``.
+
+
+Using ``bleach.linkifier.Linker``
+=================================
+
+If you're linking a lot of text and passing the same argument values or you want
+more configurability, consider using a :py:class:`bleach.linkifier.Linker`
+instance.
+
+.. doctest::
+
+   >>> from bleach.linkifier import Linker
+
+   >>> linker = Linker(skip_pre=True)
+   >>> linker.linkify('a b c <pre>http://example.com</pre> d e f')
+   u'a b c <pre>http://example.com</pre> d e f'
+
+
+.. autoclass:: bleach.linkifier.Linker
 
 
 Using ``bleach.linkifier.LinkifyFilter``
@@ -235,7 +323,7 @@ And passing parameters to ``LinkifyFilter``:
 
    >>> from functools import partial
 
-   >>> from bleach import Cleaner
+   >>> from bleach.sanitizer import Cleaner
    >>> from bleach.linkifier import LinkifyFilter
 
    >>> cleaner = Cleaner(
diff --git a/tests/test_basics.py b/tests/test_basics.py
index e3f5d2da..bff29c0f 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -3,6 +3,7 @@
 import six
 
 import bleach
+from bleach.sanitizer import Cleaner
 
 
 class TestClean:
@@ -291,8 +292,11 @@ def __iter__(self):
         }
         TAGS = ['img']
         dirty = 'this is cute! '
+
+        cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter])
+
         assert (
-            bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter]) ==
+            cleaner.clean(dirty) ==
             'this is cute! '
         )
 
@@ -302,7 +306,7 @@ def test_basics(self):
         TAGS = ['span', 'br']
         ATTRS = {'span': ['style']}
 
-        cleaner = bleach.Cleaner(tags=TAGS, attributes=ATTRS)
+        cleaner = Cleaner(tags=TAGS, attributes=ATTRS)
 
         assert (
             cleaner.clean('a 
test') == diff --git a/tests/test_links.py b/tests/test_links.py index 8e166543..e602abd4 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -1,3 +1,4 @@ +import re try: from urllib.parse import quote_plus except ImportError: @@ -6,13 +7,7 @@ import pytest from bleach import linkify, DEFAULT_CALLBACKS as DC -from bleach.linkifier import url_re - - -def test_url_re(): - text = 'just what i am looking for...it' - match = url_re.search(text) - assert not match, 'matched {0!s}'.format(text[slice(*match.span())]) +from bleach.linkifier import Linker def test_empty(): @@ -540,8 +535,7 @@ def test_link_emails_and_urls(): def test_links_case_insensitive(): """Protocols and domain names are case insensitive.""" - expect = ('' - 'HTTP://EXAMPLE.COM') + expect = 'HTTP://EXAMPLE.COM' assert linkify('HTTP://EXAMPLE.COM') == expect @@ -599,3 +593,35 @@ def test_hang(): linkify("an@email.com", parse_email=True) == 'an@email.com' ) + + +def test_url_re_arg(): + """Verifies that a specified url_re is used""" + fred_re = re.compile(r"""(fred\.com)""") + + linker = Linker(url_re=fred_re) + assert ( + linker.linkify('a b c fred.com d e f') == + 'a b c fred.com d e f' + ) + + assert ( + linker.linkify('a b c http://example.com d e f') == + 'a b c http://example.com d e f' + ) + + +def test_email_re_arg(): + """Verifies that a specified email_re is used""" + fred_re = re.compile(r"""(fred@example\.com)""") + + linker = Linker(parse_email=True, email_re=fred_re) + assert ( + linker.linkify('a b c fred@example.com d e f') == + 'a b c fred@example.com d e f' + ) + + assert ( + linker.linkify('a b c jim@example.com d e f') == + 'a b c jim@example.com d e f' + ) From 975091d0ba9c9ed4000d0c457eddbd03178ab44e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 6 Mar 2017 14:12:49 -0500 Subject: [PATCH 080/314] Minor fixes --- CHANGES | 6 +++--- bleach/sanitizer.py | 6 +----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/CHANGES b/CHANGES index 
9afe859f..3ff2b789 100644 --- a/CHANGES +++ b/CHANGES @@ -44,7 +44,7 @@ Version 2.0 (in development) def check_protocol(attrs, is_new): if not attrs.get((None, u'href'), u'').startswith(('http:', 'https:')): - # ^^^^^^^^^^^^^^ + # ^^^^^^^^^^^^^^^ return None return attrs @@ -65,8 +65,8 @@ Version 2.0 (in development) favorite linkify settings for easy reuse. * There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib filter that - you can pass as a filter to ``bleach.Cleaner`` allowing you to clean and - linkify in one pass. + you can pass as a filter to ``bleach.sanitizer.Cleaner`` allowing you to clean + and linkify in one pass. * Tons of bug fixes. diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index fcbcd915..06c90665 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -36,10 +36,6 @@ ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] -ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) -# a simple routine that returns the tag name with the namespace prefix -# as returned by etree's Element.tag attribute - class Cleaner(object): """Cleaner for cleaning HTML fragments of malicious content @@ -53,7 +49,7 @@ class Cleaner(object): To use:: - from bleach import Cleaner + from bleach.sanitizer import Cleaner cleaner = Cleaner() From ef442862f6b0ee64e88570705bf4d287f80669c8 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 6 Mar 2017 16:33:23 -0500 Subject: [PATCH 081/314] More test cleanup * move linkify tests to test_linkify.py * remove tests that are related to the previous implementation --- tests/test_basics.py | 48 ++++++++------------------------------------ tests/test_links.py | 35 +++++++++++++++++++------------- 2 files changed, 29 insertions(+), 54 deletions(-) diff --git a/tests/test_basics.py b/tests/test_basics.py index bff29c0f..031ab66d 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -140,8 +140,7 @@ def test_stripping(self): '

multiply nested text

' ) - s = ('

' - '

') + s = '

' assert ( bleach.clean(s, tags=['p', 'a'], strip=True) == '

' @@ -301,6 +300,13 @@ def __iter__(self): ) +def test_clean_idempotent(): + """Make sure that applying the filter twice doesn't change anything.""" + dirty = 'invalid & < extra http://link.com' + + assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty) + + class TestCleaner: def test_basics(self): TAGS = ['span', 'br'] @@ -312,41 +318,3 @@ def test_basics(self): cleaner.clean('a
test') == 'a
test' ) - - -class TestLinkify: - def test_no_href_links(self): - s = 'x' - assert bleach.linkify(s) == s - - def test_rel_already_there(self): - """Make sure rel attribute is updated not replaced""" - linked = ('Click ' - 'here.') - - link_good = 'Click here.' - - assert bleach.linkify(linked) == link_good - assert bleach.linkify(link_good) == link_good - - -def test_idempotent(): - """Make sure that applying the filter twice doesn't change anything.""" - dirty = 'invalid & < extra http://link.com' - - clean = bleach.clean(dirty) - assert bleach.clean(clean) == clean - - linked = bleach.linkify(dirty) - assert ( - bleach.linkify(linked) == - 'invalid & < extra http://link.com' - ) - - -def test_serializer(): - s = '
' - assert bleach.clean(s, tags=['table']) == s - assert bleach.linkify('test
') == 'test
' - assert bleach.clean('

test

', tags=['p']) == '

test

' diff --git a/tests/test_links.py b/tests/test_links.py index e602abd4..28b6ad6d 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -515,12 +515,6 @@ def test_ignore_bad_protocols(): ) -def test_max_recursion_depth(): - """If we hit the max recursion depth, just return the string.""" - test = '' * 2000 + 'foo' + '' * 2000 - assert linkify(test) == test - - def test_link_emails_and_urls(): """parse_email=True shouldn't prevent URLs from getting linkified.""" assert ( @@ -551,14 +545,6 @@ def test_elements_inside_links(): ) -def test_remove_first_childlink(): - callbacks = [lambda *a: None] - assert ( - linkify('

something

', callbacks=callbacks) == - '

something

' - ) - - def test_drop_link_tags(): """Verify that dropping link tags *just* drops the tag and not the content""" html = ( @@ -625,3 +611,24 @@ def test_email_re_arg(): linker.linkify('a b c jim@example.com d e f') == 'a b c jim@example.com d e f' ) + + +def test_linkify_idempotent(): + dirty = 'invalid & < extra http://link.com' + assert linkify(linkify(dirty)) == linkify(dirty) + + +class TestLinkify: + def test_no_href_links(self): + s = 'x' + assert linkify(s) == s + + def test_rel_already_there(self): + """Make sure rel attribute is updated not replaced""" + linked = ('Click ' + 'here.') + + link_good = 'Click here.' + + assert linkify(linked) == link_good + assert linkify(link_good) == link_good From a08454cdfea4bd758deab28e19084ccaa7388e1c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 6 Mar 2017 20:17:10 -0500 Subject: [PATCH 082/314] Rework attributes value and filters This reworks how attributes argument works. Callables now take three arguments: tag, attribute name and attribute value. Callables can be passed in as the attributes argument value or as a value for any of the tags in the dict. This also reworks the implementation so the complexity of the different shapes is shuffled away out of ``allow_token`` which simplifies it a bit. --- CHANGES | 10 +++++ bleach/sanitizer.py | 65 ++++++++++++++++++++++-------- docs/clean.rst | 90 ++++++++++++++++++++++++++---------------- tests/test_basics.py | 59 +++++++++++++++++++++++---- tests/test_security.py | 2 +- 5 files changed, 166 insertions(+), 60 deletions(-) diff --git a/CHANGES b/CHANGES index 3ff2b789..79f56a9e 100644 --- a/CHANGES +++ b/CHANGES @@ -25,6 +25,12 @@ Version 2.0 (in development) Amongst other things, this version will add end tags even if the tag in question is to be escaped. +* ``bleach.clean`` and friends attribute callables now take three arguments: + tag, attribute name and attribute value. Previously they only took attribute + name and attribute value. 
+ + All attribute callables will need to be updated. + * ``bleach.linkify`` was rewritten ``linkify`` was reimplemented as an html5lib Filter. As such, it no longer @@ -52,6 +58,8 @@ Version 2.0 (in development) don't then html5lib will raise an assertion error that the value is not unicode. + All linkify filters will need to be updated. + **Changes** * Supports Python 3.6. @@ -68,6 +76,8 @@ Version 2.0 (in development) you can pass as a filter to ``bleach.sanitizer.Cleaner`` allowing you to clean and linkify in one pass. +* ``bleach.clean`` and friends can now take a callable as an attributes arg value. + * Tons of bug fixes. * Cleaned up tests. diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 06c90665..1223e79b 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -128,7 +128,7 @@ def clean(self, text): source=self.walker(dom), # Bleach-sanitizer-specific things - allowed_attributes_map=self.attributes, + attributes=self.attributes, strip_disallowed_elements=self.strip, strip_html_comments=self.strip_comments, @@ -146,22 +146,58 @@ def clean(self, text): return self.serializer.render(filtered) +def attribute_filter_factory(attributes): + """Generates attribute filter function for the given attributes value + + The attributes value can take one of several shapes. This returns a filter + function appropriate to the attributes value. One nice thing about this is + that there's less if/then shenanigans in the ``allow_token`` method. 
+ + """ + if callable(attributes): + return attributes + + if isinstance(attributes, dict): + def _attr_filter(tag, attr, value): + if tag in attributes: + attr_val = attributes[tag] + if callable(attr_val): + return attr_val(tag, attr, value) + + if attr in attr_val: + return True + + if '*' in attributes: + attr_val = attributes['*'] + if callable(attr_val): + return attr_val(tag, attr, value) + + return attr in attr_val + + return False + + return _attr_filter + + if isinstance(attributes, list): + def _attr_filter(tag, attr, value): + return attr in attributes + + return _attr_filter + + raise ValueError('attributes needs to be a callable, a list or a dict') + + class BleachSanitizerFilter(sanitizer.Filter): """html5lib Filter that sanitizes text This filter can be used anywhere html5lib filters can be used. """ - def __init__(self, source, allowed_attributes_map, + def __init__(self, source, attributes=ALLOWED_ATTRIBUTES, strip_disallowed_elements=False, strip_html_comments=True, **kwargs): - if isinstance(allowed_attributes_map, dict): - self.wildcard_attributes = allowed_attributes_map.get('*', []) - self.allowed_attributes_map = allowed_attributes_map - else: - self.wildcard_attributes = allowed_attributes_map - self.allowed_attributes_map = {} + self.attr_filter = attribute_filter_factory(attributes) self.strip_disallowed_elements = strip_disallowed_elements self.strip_html_comments = strip_html_comments @@ -205,10 +241,6 @@ def sanitize_token(self, token): def allow_token(self, token): """Handles the case where we're allowing the tag""" if 'data' in token: - allowed_attributes = self.allowed_attributes_map.get(token['name'], []) - if not callable(allowed_attributes): - allowed_attributes += self.wildcard_attributes - # Loop through all the attributes and drop the ones that are not # allowed, are unsafe or break other rules. Additionally, fix # attribute values that need fixing. 
@@ -220,11 +252,10 @@ def allow_token(self, token): namespace, name = namespaced_name # Drop attributes that are not explicitly allowed - if callable(allowed_attributes): - if not allowed_attributes(name, val): - continue - - elif name not in allowed_attributes: + # + # NOTE(willkg): We pass in the attribute name--not a namespaced + # name. + if not self.attr_filter(token['name'], name, val): continue # Look at attributes that have uri values diff --git a/docs/clean.rst b/docs/clean.rst index 161e4357..b02c4525 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -55,8 +55,8 @@ The default value is also a conservative dict found in As a list --------- -The ``attributes`` value can be a list, in which case the attributes are allowed -for any tag. +The ``attributes`` value can be a list which specifies the list of attributes +allowed for any tag. For example: @@ -76,10 +76,12 @@ For example: As a dict --------- -The ``attributes`` value can be a dict, in which case the keys are tag names (or -a wildcard: ``*`` for all tags) and the values are lists of allowed attributes. +The ``attributes`` value can be a dict which maps tags to what attributes they can have. -For example: +You can also specify ``*``, which will match any tag. + +For example, this allows "href" and "rel" for "a" tags, "alt" for the "img" tag +and "class" for any tag (including "a" and "img"): .. doctest:: @@ -99,48 +101,66 @@ For example: u'an example' -In this case, ``class`` is allowed on any allowed element (from the ``tags`` -argument), ```` tags are allowed to have ``href`` and ``rel`` attributes, -and so on. - - Using functions --------------- -You can also use callables. If the callable returns ``True``, the attribute is -allowed. Otherwise, it is stripped. For example: +You can also use callables that take the tag, attribute name and attribute value +and returns ``True`` to keep the attribute or ``False`` to drop it. 
+ +You can pass a callable as the attributes argument value and it'll run for +every tag/attr. + +For example: + +.. doctest:: + + >>> import bleach + + >>> def allow_h(tag, name, value): + ... return name[0] == 'h' + + >>> bleach.clean( + ... u'link', + ... tags=['a'], + ... attributes=allow_h, + ... ) + u'link' + + +You can also pass a callable as a value in an attributes dict and it'll run for +attributes for specified tags: .. doctest:: - >>> from urlparse import urlparse - >>> import bleach + >>> from urlparse import urlparse + >>> import bleach - >>> def allow_src(name, value): - ... if name in ('alt', 'height', 'width'): - ... return True - ... if name == 'src': - ... p = urlparse(value) - ... return (not p.netloc) or p.netloc == 'mydomain.com' - ... return False + >>> def allow_src(tag, name, value): + ... if name in ('alt', 'height', 'width'): + ... return True + ... if name == 'src': + ... p = urlparse(value) + ... return (not p.netloc) or p.netloc == 'mydomain.com' + ... return False - >>> bleach.clean( - ... u'an example', - ... tags=['img'], - ... attributes={ - ... 'img': allow_src - ... } - ... ) - u'an example' + >>> bleach.clean( + ... u'an example', + ... tags=['img'], + ... attributes={ + ... 'img': allow_src + ... } + ... ) + u'an example' Allowed styles (``styles``) =========================== -If you allow the ``style`` attribute, you will also need to whitelist styles -users are allowed to set, for example ``color`` and ``background-color``. +If you allow the ``style`` attribute, you will also need to specify the allowed +styles users are allowed to set, for example ``color`` and ``background-color``. -The default value is an empty list, i.e., the ``style`` attribute will be -allowed but no values will be. +The default value is an empty list. In other words, the ``style`` attribute will +be allowed but no style declaration names will be allowed. 
For example, to allow users to set the color and font-weight of text: @@ -205,8 +225,8 @@ Default protocols are in ``bleach.ALLOWED_PROTOCOLS``. Stripping markup (``strip``) ============================ -By default, Bleach *escapes* tags that aren't specified in the tags -whitelist and invalid markup. For example: +By default, Bleach *escapes* tags that aren't specified in the allowed tags list +and invalid markup. For example: .. doctest:: diff --git a/tests/test_basics.py b/tests/test_basics.py index 031ab66d..5b59ebf9 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -164,23 +164,46 @@ def test_lowercase_html(self): clean = 'BAR' assert bleach.clean(dirty, attributes=['class']) == clean - def test_wildcard_attributes(self): + def test_attributes_callable(self): + """Verify attributes can take a callable""" + ATTRS = lambda tag, name, val: name == 'title' + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + + def test_attributes_wildcard(self): + """Verify attributes[*] works""" ATTRS = { '*': ['id'], 'img': ['src'], } - TAG = ['img', 'em'] + TAGS = ['img', 'em'] dirty = ('both can have ' '') assert ( - bleach.clean(dirty, tags=TAG, attributes=ATTRS) == + bleach.clean(dirty, tags=TAGS, attributes=ATTRS) == 'both can have ' ) - def test_callable_attributes(self): - """Verify callable attributes work and get correct arg values""" - def img_test(attr, val): - return attr == 'src' and val.startswith('https') + def test_attributes_wildcard_callable(self): + """Verify attributes[*] callable works""" + ATTRS = { + '*': lambda tag, name, val: name == 'title' + } + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + + def test_attributes_tag_callable(self): + """Verify attributes[tag] callable works""" + def img_test(tag, name, val): + return name == 'src' and val.startswith('https') ATTRS = { 'img': img_test, @@ -198,6 +221,28 @@ def img_test(attr, val): 
u'foo baz' ) + def test_attributes_tag_list(self): + """Verify attributes[tag] list works""" + ATTRS = { + 'a': ['title'] + } + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + + def test_attributes_list(self): + """Verify attributes list works""" + ATTRS = ['title'] + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + def test_svg_attr_val_allows_ref(self): """Unescape values in svg attrs that allow url references""" # Local IRI, so keep it diff --git a/tests/test_security.py b/tests/test_security.py index 2aac0200..da0fe92f 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -75,7 +75,7 @@ def test_invalid_href_attr(): def test_invalid_filter_attr(): IMG = ['img', ] IMG_ATTR = { - 'img': lambda attr, val: attr == 'src' and val == "http://example.com/" + 'img': lambda tag, name, val: name == 'src' and val == "http://example.com/" } assert ( From 2cedde71bfa263ccf0ce76f630468912aa3f212f Mon Sep 17 00:00:00 2001 From: "Alexandr N. Zamaraev" Date: Tue, 21 Feb 2017 23:49:19 +0700 Subject: [PATCH 083/314] Correct dublicates in email_re see #247 --- bleach/linkifier.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index 1396f056..92351be4 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -64,10 +64,10 @@ def build_url_re(tlds=TLDS, protocols=allowed_protocols): EMAIL_RE = re.compile( r"""(? Date: Wed, 22 Feb 2017 12:06:41 +0700 Subject: [PATCH 084/314] Add test incorrect email --- tests/test_links.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_links.py b/tests/test_links.py index 28b6ad6d..99b30b89 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -109,6 +109,13 @@ def ft(attrs, new=False): True, 'mailto james@example.com.au.' 
), + # Incorrect email + ( + '"\\\n"@opa.ru', + True, + '"\\\n"@opa.ru' + ), + ]) def test_email_link(data, parse_email, expected): assert linkify(data, parse_email=parse_email) == expected From 9a617a52d6b5e81bd7ca8407f1e0810fc412cc2a Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 6 Mar 2017 21:50:30 -0500 Subject: [PATCH 085/314] Change skip_pre to the more general skip_tags This changes skip_pre to a more general skip_tags that lets you skip linkifying in a specified list of tags--not just pre. --- CHANGES | 17 +++++++++ README.rst | 2 +- bleach/__init__.py | 18 +++++---- bleach/linkifier.py | 89 ++++++++++++++++++++++++++++++++++++--------- bleach/sanitizer.py | 30 ++++++++++++--- docs/goals.rst | 22 +++++------ docs/linkify.rst | 14 +++---- setup.py | 4 +- tests/test_links.py | 10 ++--- 9 files changed, 149 insertions(+), 57 deletions(-) diff --git a/CHANGES b/CHANGES index 79f56a9e..050f4fc1 100644 --- a/CHANGES +++ b/CHANGES @@ -60,6 +60,23 @@ Version 2.0 (in development) All linkify filters will need to be updated. +* ``bleach.linkify`` and friends had a ``skip_pre`` argument--that's been + replaced with a more general ``skip_tags`` argument. + + Before, you might do:: + + bleach.linkify(some_text, skip_pre=True) + + The equivalent with Bleach 2.0 is:: + + bleach.linkify(some_text, skip_tags=['pre']) + + You can skip other tags, too, like ``style`` or ``script`` or other places + where you don't want linkification happening. + + All uses of linkify that use ``skip_pre`` will need to be updated. + + **Changes** * Supports Python 3.6. diff --git a/README.rst b/README.rst index 403ff9b6..08dd886a 100644 --- a/README.rst +++ b/README.rst @@ -8,7 +8,7 @@ Bleach .. image:: https://badge.fury.io/py/bleach.svg :target: http://badge.fury.io/py/bleach -Bleach is a whitelist-based HTML sanitizing library that escapes or strips +Bleach is a allowed-list-based HTML sanitizing library that escapes or strips markup and attributes. 
Bleach can also linkify text safely, applying filters that Django's ``urlize`` diff --git a/bleach/__init__.py b/bleach/__init__.py index 07b5075c..a231f136 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -47,16 +47,16 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, :arg str text: the text to clean - :arg list tags: whitelist of allowed tags; defaults to + :arg list tags: allowed list of tags; defaults to ``bleach.ALLOWED_TAGS`` - :arg dict attributes: whitelist of allowed attributes; defaults to - ``bleach.ALLOWED_ATTRIBUTES`` + :arg dict attributes: allowed attributes; can be a callable, list or dict; + defaults to ``bleach.ALLOWED_ATTRIBUTES`` - :arg list styles: whitelist of allowed css; defaults to + :arg list styles: allowed list of css styles; defaults to ``bleach.ALLOWED_STYLES`` - :arg list protocols: whitelist of allowed protocols for links; defaults + :arg list protocols: allowed list of protocols for links; defaults to ``bleach.ALLOWED_PROTOCOLS`` :arg bool strip: whether or not to strip disallowed elements @@ -77,7 +77,7 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, return cleaner.clean(text) -def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False): +def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False): """Convert URL-like strings in an HTML fragment to links This function converts strings that look like URLs, domain names and email @@ -106,7 +106,9 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False :arg list callbacks: list of callbacks to run when adjusting tag attributes - :arg bool skip_pre: whether or not to skip linkifying text in a ``pre`` tag + :arg list skip_tags: list of tags that you don't want to linkify the + contents of; for example, you could set this to ``['pre']`` to skip + linkifying contents of ``pre`` tags :arg bool parse_email: whether or not to linkify email addresses @@ -115,7 +117,7 @@ 
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False """ linker = Linker( callbacks=callbacks, - skip_pre=skip_pre, + skip_tags=skip_tags, parse_email=parse_email ) return linker.linkify(text) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index 92351be4..6103e81e 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -74,10 +74,40 @@ def build_url_re(tlds=TLDS, protocols=allowed_protocols): class Linker(object): - def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False, + """Convert URL-like strings in an HTML fragment to links + + This function converts strings that look like URLs, domain names and email + addresses in text that may be an HTML fragment to links, while preserving: + + 1. links already in the string + 2. urls found in attributes + 3. email addresses + + linkify does a best-effort approach and tries to recover from bad + situations due to crazy text. + + """ + def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False, url_re=URL_RE, email_re=EMAIL_RE): + """Creates a Linker instance + + :arg list callbacks: list of callbacks to run when adjusting tag attributes + + :arg list skip_tags: list of tags that you don't want to linkify the + contents of; for example, you could set this to ``['pre']`` to skip + linkifying contents of ``pre`` tags + + :arg bool parse_email: whether or not to linkify email addresses + + :arg re url_re: url matching regex + + :arg email_re: email matching regex + + :returns: linkified text as unicode + + """ self.callbacks = callbacks - self.skip_pre = skip_pre + self.skip_tags = skip_tags self.parse_email = parse_email self.url_re = url_re self.email_re = email_re @@ -105,7 +135,7 @@ def linkify(self, text): filtered = LinkifyFilter( source=self.walker(dom), callbacks=self.callbacks, - skip_pre=self.skip_pre, + skip_tags=self.skip_tags, parse_email=self.parse_email, url_re=self.url_re, email_re=self.email_re, @@ -126,12 +156,31 @@ 
class LinkifyFilter(Filter): This filter can be used anywhere html5lib filters can be used. """ - def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False, + def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False, url_re=URL_RE, email_re=EMAIL_RE): + """Creates a LinkifyFilter instance + + :arg TreeWalker source: stream + + :arg list callbacks: list of callbacks to run when adjusting tag attributes + + :arg list skip_tags: list of tags that you don't want to linkify the + contents of; for example, you could set this to ``['pre']`` to skip + linkifying contents of ``pre`` tags + + :arg bool parse_email: whether or not to linkify email addresses + + :arg re url_re: url matching regex + + :arg email_re: email matching regex + + :returns: linkified text as unicode + + """ super(LinkifyFilter, self).__init__(source) self.callbacks = callbacks or [] - self.skip_pre = skip_pre + self.skip_tags = skip_tags or [] self.parse_email = parse_email self.url_re = url_re @@ -140,9 +189,15 @@ def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False, def apply_callbacks(self, attrs, is_new): """Given an attrs dict and an is_new bool, runs through callbacks - Callbacks can return an adjusted attrs dict or None. In the case of - None, we stop going through callbacks and return that and the link gets - dropped. + Callbacks can return an adjusted attrs dict or ``None``. In the case of + ``None``, we stop going through callbacks and return that and the link + gets dropped. 
+ + :arg dict attrs: map of ``(namespace, name)`` -> ``value`` + + :arg bool is_new: whether or not this link was added by linkify + + :returns: adjusted attrs dict or ``None`` """ for cb in self.callbacks: @@ -399,7 +454,7 @@ def handle_a_tag(self, token_buffer): def __iter__(self): in_a = False - in_pre = False + in_skip_tag = None token_buffer = [] @@ -425,10 +480,10 @@ def __iter__(self): continue elif token['type'] in ['StartTag', 'EmptyTag']: - if token['name'] == 'pre' and self.skip_pre: - # The "pre" tag starts a "special mode" where we don't linkify - # anything. - in_pre = True + if token['name'] in self.skip_tags: + # Skip tags start a "special mode" where we don't linkify + # anything until the end tag. + in_skip_tag = token['name'] elif token['name'] == 'a': # The "a" tag is special--we switch to a slurp mode and @@ -441,13 +496,13 @@ def __iter__(self): # yet continue - elif in_pre and self.skip_pre: + elif in_skip_tag and self.skip_tags: # NOTE(willkg): We put this clause here since in_a and # switching in and out of in_a takes precedence. 
- if token['type'] == 'EndTag' and token['name'] == 'pre': - in_pre = False + if token['type'] == 'EndTag' and token['name'] == in_skip_tag: + in_skip_tag = None - elif not in_a and not in_pre and token['type'] == 'Characters': + elif not in_a and not in_skip_tag and token['type'] == 'Characters': new_stream = iter([token]) if self.parse_email: new_stream = self.handle_email_addresses(new_stream) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 1223e79b..b5c2fe95 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -63,16 +63,16 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, strip_comments=True, filters=None): """Initializes a Cleaner - :arg tags: whitelist of allowed tags; defaults to + :arg list tags: allowed list of tags; defaults to ``bleach.ALLOWED_TAGS`` - :arg attributes: whitelist of allowed attributes; defaults to - ``bleach.ALLOWED_ATTRIBUTES`` + :arg dict attributes: allowed attributes; can be a callable, list or dict; + defaults to ``bleach.ALLOWED_ATTRIBUTES`` - :arg styles: whitelist of allowed css; defaults to + :arg list styles: allowed list of css styles; defaults to ``bleach.ALLOWED_STYLES`` - :arg protocols: whitelist of allowed protocols for links; defaults + :arg list protocols: allowed list of protocols for links; defaults to ``bleach.ALLOWED_PROTOCOLS`` :arg strip: whether or not to strip disallowed elements @@ -196,7 +196,27 @@ class BleachSanitizerFilter(sanitizer.Filter): def __init__(self, source, attributes=ALLOWED_ATTRIBUTES, strip_disallowed_elements=False, strip_html_comments=True, **kwargs): + """Creates a BleachSanitizerFilter instance + :arg Treewalker source: stream + + :arg list tags: allowed list of tags; defaults to + ``bleach.ALLOWED_TAGS`` + + :arg dict attributes: allowed attributes; can be a callable, list or dict; + defaults to ``bleach.ALLOWED_ATTRIBUTES`` + + :arg list styles: allowed list of css styles; defaults to + ``bleach.ALLOWED_STYLES`` + + :arg list protocols: allowed 
list of protocols for links; defaults + to ``bleach.ALLOWED_PROTOCOLS`` + + :arg strip_disallowed_elements: whether or not to strip disallowed elements + + :arg strip_html_comments: whether or not to strip HTML comments + + """ self.attr_filter = attribute_filter_factory(attributes) self.strip_disallowed_elements = strip_disallowed_elements diff --git a/docs/goals.rst b/docs/goals.rst index 632c222c..015bc563 100644 --- a/docs/goals.rst +++ b/docs/goals.rst @@ -13,15 +13,15 @@ Goals ===== -Always take a whitelist-based approach --------------------------------------- +Always take a allowed-list-based approach +----------------------------------------- -Bleach should always take a whitelist-based approach to allowing any kind of -content or markup. Blacklisting is error-prone and not future proof. +Bleach should always take a allowed-list-based approach to markup filtering. +Specifying disallowed lists is error-prone and not future proof. For example, you should have to opt-in to allowing the ``onclick`` attribute, -not blacklist all the other ``on*`` attributes. Future versions of HTML may add -new event handlers, like ``ontouch``, that old blacklists would not prevent. +not opt-out of all the other ``on*`` attributes. Future versions of HTML may add +new event handlers, like ``ontouch``, that old disallow would not prevent. Main goal is to sanitize input of malicious content @@ -39,8 +39,8 @@ Examples might include: These examples, and others, are traditionally prone to security issues like XSS or other script injection, or annoying issues like unclosed tags and invalid -markup. Bleach will take a proactive, whitelist-only approach to allowing HTML -content, and will use the HTML5 parsing algorithm to handle invalid markup. +markup. Bleach will take a proactive, allowed-list-only approach to allowing +HTML content, and will use the HTML5 parsing algorithm to handle invalid markup. See the :ref:`chapter on clean() ` for more info. 
@@ -52,7 +52,7 @@ The secondary goal of Bleach is to provide a mechanism for finding or altering links (```` tags with ``href`` attributes, or things that look like URLs or email addresses) in text. -While Bleach itself will always operate on a whitelist-based security model, +While Bleach itself will always operate on a allowed-list-based security model, the :ref:`linkify() method ` is flexible enough to allow the creation, alteration, and removal of links based on an extremely wide range of use cases. @@ -69,8 +69,8 @@ Sanitize complete HTML documents -------------------------------- Once you're creating whole documents, you have to allow so many tags that a -blacklist approach (e.g. forbidding `` test') == - 'a <script>safe()</script> test' - ) - assert ( - bleach.clean('a test') == - 'a <style>body{}</style> test' - ) - - def test_bad_href(self): - assert ( - bleach.clean('no link') == - 'no link' - ) - - def test_bare_entities(self): - assert ( - bleach.clean('an & entity') == - 'an & entity' - ) - assert ( - bleach.clean('an < entity') == - 'an < entity' - ) - - assert ( - bleach.clean('tag < and entity') == - 'tag < and entity' - ) - - assert ( - bleach.clean('&') == - '&' - ) - - def test_escaped_entities(self): - s = '<em>strong</em>' - assert bleach.clean(s) == s - - def test_weird_strings(self): - s = 'with html tags', strip=True) == - 'a test with html tags' - ) - assert ( - bleach.clean('a test with html tags', - strip=True) == - 'a test with html tags' - ) - - s = '

link text

' - assert ( - bleach.clean(s, tags=['p'], strip=True) == - '

link text

' - ) - s = '

multiply nested text

' - assert ( - bleach.clean(s, tags=['p'], strip=True) == - '

multiply nested text

' - ) - - s = '

' - assert ( - bleach.clean(s, tags=['p', 'a'], strip=True) == - '

' - ) - - def test_allowed_styles(self): - ATTRS = ['style'] - STYLE = ['color'] - blank = '' - s = '' - assert bleach.clean('', attributes=ATTRS) == blank - assert bleach.clean(s, attributes=ATTRS, styles=STYLE) == s - assert ( - bleach.clean('', attributes=ATTRS, styles=STYLE) == - s - ) - - def test_lowercase_html(self): - """We should output lowercase HTML.""" - dirty = 'BAR' - clean = 'BAR' - assert bleach.clean(dirty, attributes=['class']) == clean - - def test_attributes_callable(self): - """Verify attributes can take a callable""" - ATTRS = lambda tag, name, val: name == 'title' - TAGS = ['a'] - - assert ( - bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == - u'example' - ) - - def test_attributes_wildcard(self): - """Verify attributes[*] works""" - ATTRS = { - '*': ['id'], - 'img': ['src'], - } - TAGS = ['img', 'em'] - dirty = ('both can have ' - '') - assert ( - bleach.clean(dirty, tags=TAGS, attributes=ATTRS) == - 'both can have ' - ) - - def test_attributes_wildcard_callable(self): - """Verify attributes[*] callable works""" - ATTRS = { - '*': lambda tag, name, val: name == 'title' - } - TAGS = ['a'] - - assert ( - bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == - u'example' - ) - - def test_attributes_tag_callable(self): - """Verify attributes[tag] callable works""" - def img_test(tag, name, val): - return name == 'src' and val.startswith('https') - - ATTRS = { - 'img': img_test, - } - TAGS = ['img'] - - assert ( - bleach.clean('foo blah baz', tags=TAGS, - attributes=ATTRS) == - u'foo baz' - ) - assert ( - bleach.clean('foo blah baz', tags=TAGS, - attributes=ATTRS) == - u'foo baz' - ) - - def test_attributes_tag_list(self): - """Verify attributes[tag] list works""" - ATTRS = { - 'a': ['title'] - } - TAGS = ['a'] - - assert ( - bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == - u'example' - ) - - def test_attributes_list(self): - """Verify attributes list works""" - ATTRS = ['title'] - TAGS = ['a'] - - assert ( - 
bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == - u'example' - ) - - def test_svg_attr_val_allows_ref(self): - """Unescape values in svg attrs that allow url references""" - # Local IRI, so keep it - text = '' - TAGS = ['svg', 'rect'] - ATTRS = { - 'rect': ['fill'], - } - assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == - '' - ) - - # Non-local IRI, so drop it - text = '' - TAGS = ['svg', 'rect'] - ATTRS = { - 'rect': ['fill'], - } - assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == - '' - ) - - @pytest.mark.parametrize('text, expected', [ - ( - '', - '' - ), - ( - '', - # NOTE(willkg): Bug in html5lib serializer drops the xlink part - '' - ), - ]) - def test_svg_allow_local_href(self, text, expected): - """Keep local hrefs for svg elements""" - TAGS = ['svg', 'pattern'] - ATTRS = { - 'pattern': ['id', 'href'], - } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected - - @pytest.mark.parametrize('text, expected', [ - ( - '', - '' - ), - ( - '', - '' - ), - ]) - def test_svg_allow_local_href_nonlocal(self, text, expected): - """Drop non-local hrefs for svg elements""" - TAGS = ['svg', 'pattern'] - ATTRS = { - 'pattern': ['id', 'href'], - } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected - - @pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') - def test_sarcasm(self): - """Jokes should crash.""" - dirty = 'Yeah right ' - clean = 'Yeah right <sarcasm/>' - assert bleach.clean(dirty) == clean - - def test_user_defined_protocols_valid(self): - valid_href = 'allowed href' - assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href - - def test_user_defined_protocols_invalid(self): - invalid_href = 'invalid href' - cleaned_href = 'invalid href' - assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href - - def test_filters(self): - # Create a Filter that changes all the attr values to "moo" - class MooFilter(Filter): - def __iter__(self): - for token in 
Filter.__iter__(self): - if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: - for attr, value in token['data'].items(): - token['data'][attr] = 'moo' - - yield token - - ATTRS = { - 'img': ['rel', 'src'] - } - TAGS = ['img'] - dirty = 'this is cute! ' - - cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) - - assert ( - cleaner.clean(dirty) == - 'this is cute! ' - ) - - -def test_clean_idempotent(): - """Make sure that applying the filter twice doesn't change anything.""" - dirty = 'invalid & < extra http://link.com' - - assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty) - - -class TestCleaner: - def test_basics(self): - TAGS = ['span', 'br'] - ATTRS = {'span': ['style']} - - cleaner = Cleaner(tags=TAGS, attributes=ATTRS) - - assert ( - cleaner.clean('a
test') == - 'a
test' - ) diff --git a/tests/test_clean.py b/tests/test_clean.py new file mode 100644 index 00000000..a6a37557 --- /dev/null +++ b/tests/test_clean.py @@ -0,0 +1,404 @@ +from html5lib.filters.base import Filter +import pytest +import six + +import bleach +from bleach.sanitizer import Cleaner + + +def test_empty(): + assert bleach.clean('') == '' + + +def test_nbsp(): + if six.PY3: + expected = '\xa0test string\xa0' + else: + expected = six.u('\\xa0test string\\xa0') + + assert bleach.clean(' test string ') == expected + + +def test_comments_only(): + comment = '' + assert bleach.clean(comment) == '' + assert bleach.clean(comment, strip_comments=False) == comment + + open_comment = ''.format(open_comment) + ) + + +def test_with_comments(): + text = 'Just text' + assert bleach.clean(text) == 'Just text' + assert bleach.clean(text, strip_comments=False) == text + + +def test_no_html(): + assert bleach.clean('no html string') == 'no html string' + + +def test_allowed_html(): + assert ( + bleach.clean('an allowed tag') == + 'an allowed tag' + ) + assert ( + bleach.clean('another good tag') == + 'another good tag' + ) + + +def test_bad_html(): + assert ( + bleach.clean('a fixed tag') == + 'a fixed tag' + ) + + +def test_function_arguments(): + TAGS = ['span', 'br'] + ATTRS = {'span': ['style']} + + text = 'a
test' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + 'a
test' + ) + + +def test_named_arguments(): + ATTRS = {'a': ['rel', 'href']} + + text = 'xx.com' + assert bleach.clean(text) == 'xx.com' + assert ( + bleach.clean(text, attributes=ATTRS) == + 'xx.com' + ) + + +def test_disallowed_html(): + assert ( + bleach.clean('a test') == + 'a <script>safe()</script> test' + ) + assert ( + bleach.clean('a test') == + 'a <style>body{}</style> test' + ) + + +def test_bad_href(): + assert ( + bleach.clean('no link') == + 'no link' + ) + + +def test_bare_entities(): + assert ( + bleach.clean('an & entity') == + 'an & entity' + ) + assert ( + bleach.clean('an < entity') == + 'an < entity' + ) + + assert ( + bleach.clean('tag < and entity') == + 'tag < and entity' + ) + + assert ( + bleach.clean('&') == + '&' + ) + + +def test_escaped_entities(): + s = '<em>strong</em>' + assert bleach.clean(s) == s + + +def test_weird_strings(): + s = 'with
html tags' + assert ( + bleach.clean(text, strip=True) == + 'a test with html tags' + ) + + text = 'a test with html tags' + assert ( + bleach.clean(text, strip=True) == + 'a test with html tags' + ) + + text = '

link text

' + assert ( + bleach.clean(text, tags=['p'], strip=True) == + '

link text

' + ) + text = '

multiply nested text

' + assert ( + bleach.clean(text, tags=['p'], strip=True) == + '

multiply nested text

' + ) + + text = '

' + assert ( + bleach.clean(text, tags=['p', 'a'], strip=True) == + '

' + ) + + +def test_allowed_styles(): + ATTRS = ['style'] + STYLE = ['color'] + + assert ( + bleach.clean('', attributes=ATTRS) == + '' + ) + + text = '' + assert bleach.clean(text, attributes=ATTRS, styles=STYLE) == text + + text = '' + assert ( + bleach.clean(text, attributes=ATTRS, styles=STYLE) == + '' + ) + + +def test_lowercase_html(): + """We should output lowercase HTML.""" + assert ( + bleach.clean('BAR', attributes=['class']) == + 'BAR' + ) + + +def test_attributes_callable(): + """Verify attributes can take a callable""" + ATTRS = lambda tag, name, val: name == 'title' + TAGS = ['a'] + + text = u'example' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + u'example' + ) + + +def test_attributes_wildcard(): + """Verify attributes[*] works""" + ATTRS = { + '*': ['id'], + 'img': ['src'], + } + TAGS = ['img', 'em'] + + text = 'both can have ' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + 'both can have ' + ) + + +def test_attributes_wildcard_callable(): + """Verify attributes[*] callable works""" + ATTRS = { + '*': lambda tag, name, val: name == 'title' + } + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + + +def test_attributes_tag_callable(): + """Verify attributes[tag] callable works""" + def img_test(tag, name, val): + return name == 'src' and val.startswith('https') + + ATTRS = { + 'img': img_test, + } + TAGS = ['img'] + + text = 'foo blah baz' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + u'foo baz' + ) + text = 'foo blah baz' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + u'foo baz' + ) + + +def test_attributes_tag_list(): + """Verify attributes[tag] list works""" + ATTRS = { + 'a': ['title'] + } + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + + +def test_attributes_list(): + """Verify attributes list works""" + ATTRS = ['title'] + TAGS = ['a'] + + text = u'example' 
+ assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + u'example' + ) + + +def test_svg_attr_val_allows_ref(): + """Unescape values in svg attrs that allow url references""" + # Local IRI, so keep it + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + + text = '' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + # Non-local IRI, so drop it + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + text = '' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + # NOTE(willkg): Bug in html5lib serializer drops the xlink part + '' + ), +]) +def test_svg_allow_local_href(text, expected): + """Keep local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + '' + ), +]) +def test_svg_allow_local_href_nonlocal(text, expected): + """Drop non-local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + +@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') +def test_sarcasm(): + """Jokes should crash.""" + dirty = 'Yeah right ' + clean = 'Yeah right <sarcasm/>' + assert bleach.clean(dirty) == clean + + +def test_user_defined_protocols_valid(): + valid_href = 'allowed href' + assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href + + +def test_user_defined_protocols_invalid(): + invalid_href = 'invalid href' + cleaned_href = 'invalid href' + assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href + + +def test_filters(): + # Create a Filter that changes all the attr values to "moo" + class MooFilter(Filter): + def __iter__(self): + for token in Filter.__iter__(self): + 
if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: + for attr, value in token['data'].items(): + token['data'][attr] = 'moo' + + yield token + + ATTRS = { + 'img': ['rel', 'src'] + } + TAGS = ['img'] + + cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) + + dirty = 'this is cute! ' + assert ( + cleaner.clean(dirty) == + 'this is cute! ' + ) + + +def test_clean_idempotent(): + """Make sure that applying the filter twice doesn't change anything.""" + dirty = 'invalid & < extra http://link.com' + assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty) + + +class TestCleaner: + def test_basics(self): + TAGS = ['span', 'br'] + ATTRS = {'span': ['style']} + + cleaner = Cleaner(tags=TAGS, attributes=ATTRS) + + assert ( + cleaner.clean('a
test') == + 'a
test' + ) diff --git a/tests/test_links.py b/tests/test_linkify.py similarity index 100% rename from tests/test_links.py rename to tests/test_linkify.py From 205edc0094c3a5ad217d164048d57a22a69fed93 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 20 Sep 2017 09:00:30 -0400 Subject: [PATCH 117/314] Add code of conduct blurb, move some docs around Fixes #313 --- CODE_OF_CONDUCT.rst | 9 +++++++++ README.rst | 41 ++++++++++++++++++++++++++--------------- docs/dev.rst | 6 ++++++ 3 files changed, 41 insertions(+), 15 deletions(-) create mode 100644 CODE_OF_CONDUCT.rst diff --git a/CODE_OF_CONDUCT.rst b/CODE_OF_CONDUCT.rst new file mode 100644 index 00000000..da20d8db --- /dev/null +++ b/CODE_OF_CONDUCT.rst @@ -0,0 +1,9 @@ +Code of conduct +=============== + +This project and repository is governed by Mozilla's code of conduct and +etiquette guidelines. For more details please see the `Mozilla Community +Participation Guidelines +`_ and +`Developer Etiquette Guidelines +`_. diff --git a/README.rst b/README.rst index b728c292..863772e8 100644 --- a/README.rst +++ b/README.rst @@ -51,21 +51,6 @@ please read our wiki page at ``_. -Security -======== - -Bleach is a security-related library. - -We have a responsible security vulnerability reporting process. Please use -that if you're reporting a security issue. - -Security issues are fixed in private. After we land such a fix, we'll do a -release. - -For every release, we mark security issues we've fixed in the ``CHANGES`` in -the **Security issues** section. We include relevant CVE links. - - Installing Bleach ================= @@ -104,6 +89,32 @@ The simplest way to use Bleach is: u'an http://example.com url +Security +======== + +Bleach is a security-related library. + +We have a responsible security vulnerability reporting process. Please use +that if you're reporting a security issue. + +Security issues are fixed in private. After we land such a fix, we'll do a +release. 
+ +For every release, we mark security issues we've fixed in the ``CHANGES`` in +the **Security issues** section. We include relevant CVE links. + + +Code of conduct +=============== + +This project and repository is governed by Mozilla's code of conduct and +etiquette guidelines. For more details please see the `Mozilla Community +Participation Guidelines +`_ and +`Developer Etiquette Guidelines +`_. + + .. _html5lib: https://github.com/html5lib/html5lib-python .. _GitHub: https://github.com/mozilla/bleach .. _ReadTheDocs: https://bleach.readthedocs.io/ diff --git a/docs/dev.rst b/docs/dev.rst index cfa0a8c7..98707048 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -19,6 +19,12 @@ To install Bleach to make changes to it: $ pip install -e . +.. include:: ../CONTRIBUTING.rst + + +.. include:: ../CODE_OF_CONDUCT.rst + + Docs ==== From 4c80d008059257a17af3982c1aba4a3b7879370b Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 20 Sep 2017 09:07:20 -0400 Subject: [PATCH 118/314] Change "Security issues" to "Security fixes" This is clearer regarding the intent of that block. 
--- CHANGES | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 59db3338..5ea7aff9 100644 --- a/CHANGES +++ b/CHANGES @@ -4,7 +4,7 @@ Bleach Changes Version 2.1 (in development) ---------------------------- -**Security issues** +**Security fixes** **Backwards incompatible changes** @@ -40,7 +40,7 @@ Version 2.1 (in development) Version 2.0 (March 8th, 2017) ----------------------------- -**Security issues** +**Security fixes** * None @@ -150,7 +150,7 @@ Version 2.0 (March 8th, 2017) Version 1.5 (November 4th, 2016) -------------------------------- -**Security issues** +**Security fixes** * None @@ -192,7 +192,7 @@ Version 1.5 (November 4th, 2016) Version 1.4.3 (May 23rd, 2016) ------------------------------ -**Security issues** +**Security fixes** * None From 2a9854d9484797beeed1673454980404483774b3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 20 Sep 2017 11:08:11 -0400 Subject: [PATCH 119/314] Fix test_websites to work with Python 3 --- tests_website/data_to_json.py | 2 +- tests_website/open_test_page.py | 2 ++ tests_website/server.py | 16 ++++++++++------ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tests_website/data_to_json.py b/tests_website/data_to_json.py index ffd346f5..debe5a9d 100755 --- a/tests_website/data_to_json.py +++ b/tests_website/data_to_json.py @@ -50,4 +50,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/tests_website/open_test_page.py b/tests_website/open_test_page.py index b812de92..79f4adf2 100755 --- a/tests_website/open_test_page.py +++ b/tests_website/open_test_page.py @@ -2,6 +2,7 @@ import webbrowser + TEST_BROWSERS = set([ # 'mozilla', 'firefox', @@ -29,6 +30,7 @@ ]) REGISTERED_BROWSERS = set(webbrowser._browsers.keys()) + if __name__ == '__main__': for b in TEST_BROWSERS & REGISTERED_BROWSERS: webbrowser.get(b).open_new_tab('http://localhost:8080') diff --git a/tests_website/server.py 
b/tests_website/server.py index 83fcf84a..8a8c6438 100755 --- a/tests_website/server.py +++ b/tests_website/server.py @@ -9,17 +9,19 @@ python server.py """ -import SimpleHTTPServer -import SocketServer -import json +# import SimpleHTTPServer +# import SocketServer + +import six + import bleach PORT = 8080 -class BleachCleanHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): +class BleachCleanHandler(six.moves.SimpleHTTPServer.SimpleHTTPRequestHandler): def do_POST(self): content_len = int(self.headers.getheader('content-length', 0)) body = self.rfile.read(content_len) @@ -36,7 +38,9 @@ def do_POST(self): if __name__ == '__main__': - SocketServer.TCPServer.allow_reuse_address = True # Prevent 'cannot bind to address' errors on restart - httpd = SocketServer.TCPServer(('127.0.0.1', PORT), BleachCleanHandler) + # Prevent 'cannot bind to address' errors on restart + six.moves.socketserver.TCPServer.allow_reuse_address = True + + httpd = six.moves.socketserver.TCPServer(('127.0.0.1', PORT), BleachCleanHandler) print("listening on localhost port %d" % PORT) httpd.serve_forever() From daec5ef18487fa31779165cb104a22b5931b4c3b Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 20 Sep 2017 11:31:06 -0400 Subject: [PATCH 120/314] More Python 3 fixes for tests_websites --- tests_website/server.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tests_website/server.py b/tests_website/server.py index 8a8c6438..edc791a4 100755 --- a/tests_website/server.py +++ b/tests_website/server.py @@ -1,20 +1,17 @@ #!/usr/bin/env python """ -Simple Test/Demo Server for running bleach.clean output -on various desktops. +Simple Test/Demo Server for running bleach.clean output on various +desktops. 
Usage: -python server.py -""" + python server.py -# import SimpleHTTPServer -# import SocketServer +""" import six - import bleach @@ -23,17 +20,26 @@ class BleachCleanHandler(six.moves.SimpleHTTPServer.SimpleHTTPRequestHandler): def do_POST(self): - content_len = int(self.headers.getheader('content-length', 0)) + if six.PY2: + content_len = int(self.headers.getheader('content-length', 0)) + else: + content_len = int(self.headers.get('content-length', 0)) body = self.rfile.read(content_len) print("read %s bytes: %s" % (content_len, body)) + + if six.PY3: + body = body.decode('utf-8') + print('input: %r' % body) cleaned = bleach.clean(body) - print("cleaned %s" % cleaned) self.send_response(200) self.send_header('Content-Length', len(cleaned)) self.send_header('Content-Type', 'text/plain;charset=UTF-8') self.end_headers() + if six.PY3: + cleaned = bytes(cleaned, encoding='utf-8') + print("cleaned: %r" % cleaned) self.wfile.write(cleaned) From 67afdf8ae7d323305ea104c0efb6bcb37547edc2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 27 Jul 2017 13:07:08 -0400 Subject: [PATCH 121/314] Prevent HTMLTokenizer from unescaping entities This overrides the HTMLTokenizer's .consumeEntity() method such that it doesn't convert character entities. This also fixes some other escaping/unescaping oddities so that the output of bleach.clean() is more correct in regards to intended behavior. One thing this breaks is the idempotent property for bleach.clean()--it's no longer idempotent. Since it escapes text more correctly now and that's not an idempotent transform, this is no longer idempotent. For example, bleach.clean() can't differentiate between a user talking about code and saying this: I like my html wrapped in ! and this: I like my html escaped like this <b>! I'm not sure why we thought bleach.clean() could ever be correct and idempotent. Seems like that was an error. 
--- CHANGES | 7 ++++- bleach/sanitizer.py | 66 ++++++++++++++++++++++++++++++++++++++++-- tests/data/13.test.out | 2 +- tests/data/14.test.out | 2 +- tests/data/15.test.out | 2 +- tests/data/16.test.out | 2 +- tests/data/17.test.out | 2 +- tests/data/18.test.out | 2 +- tests/data/19.test.out | 3 +- tests/test_security.py | 15 ++++++++-- 10 files changed, 90 insertions(+), 13 deletions(-) diff --git a/CHANGES b/CHANGES index 5ea7aff9..ae1d52f3 100644 --- a/CHANGES +++ b/CHANGES @@ -17,6 +17,12 @@ Version 2.1 (in development) * clean, linkify: accept only unicode or utf-8-encoded str (#176) +* ``bleach.clean()`` no longer unescapes entities including ones that are missing + a ``;`` at the end which can happen in urls and other places. (#143) + +* ``bleach.clean()`` is no longer idempotent. If you run ``bleach.clean()`` on + text multiple times, it'll escape things again and again. + **Features** **Bug fixes** @@ -36,7 +42,6 @@ Version 2.1 (in development) * add test website and scripts to test ``bleach.clean()`` output in browser; thank you, Greg Guthe! - Version 2.0 (March 8th, 2017) ----------------------------- diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 26cfad2a..f9fb4287 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -4,9 +4,15 @@ from xml.sax.saxutils import unescape import html5lib -from html5lib.constants import namespaces +from html5lib.constants import ( + ReparseException, + namespaces, + prefixes, + tokenTypes, +) from html5lib.filters import sanitizer from html5lib.serializer import HTMLSerializer +from html5lib._tokenizer import HTMLTokenizer from bleach.utils import alphabetize_attributes, force_unicode @@ -44,6 +50,33 @@ ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] +class BleachHTMLTokenizer(HTMLTokenizer): + def consumeEntity(self, allowedChar=None, fromAttribute=False): + # We don't want to consume and convert entities. Instead we put the + # '&' in output. 
+ if fromAttribute: + self.currentToken['data'][-1][1] += '&' + + else: + self.tokenQueue.append({"type": tokenTypes['Characters'], "data": '&'}) + + +class BleachHTMLParser(html5lib.HTMLParser): + def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): + # Override HTMLParser so we can swap out the tokenizer. + self.innerHTMLMode = innerHTML + self.container = container + self.scripting = scripting + self.tokenizer = BleachHTMLTokenizer(stream, parser=self, **kwargs) + self.reset() + + try: + self.mainLoop() + except ReparseException: + self.reset() + self.mainLoop() + + class Cleaner(object): """Cleaner for cleaning HTML fragments of malicious content @@ -104,7 +137,7 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, self.strip_comments = strip_comments self.filters = filters or [] - self.parser = html5lib.HTMLParser(namespaceHTMLElements=False) + self.parser = BleachHTMLParser(namespaceHTMLElements=False) self.walker = html5lib.getTreeWalker('etree') self.serializer = HTMLSerializer( quote_attr_values='always', @@ -338,6 +371,35 @@ def allow_token(self, token): return token + def disallowed_token(self, token): + token_type = token["type"] + if token_type == "EndTag": + token["data"] = "" % token["name"] + + elif token["data"]: + assert token_type in ("StartTag", "EmptyTag") + attrs = [] + for (ns, name), v in token["data"].items(): + attrs.append(' %s="%s"' % ( + name if ns is None else "%s:%s" % (prefixes[ns], name), + # Note: HTMLSerializer escapes attribute values already, so + # if we do it here (like HTMLSerializer does), then we end + # up double-escaping. 
+ v) + ) + token["data"] = "<%s%s>" % (token["name"], ''.join(attrs)) + + else: + token["data"] = "<%s>" % token["name"] + + if token.get("selfClosing"): + token["data"] = token["data"][:-1] + "/>" + + token["type"] = "Characters" + + del token["name"] + return token + def sanitize_css(self, style): """Sanitizes css in style tags""" # disallow urls diff --git a/tests/data/13.test.out b/tests/data/13.test.out index 1c866507..0053081c 100644 --- a/tests/data/13.test.out +++ b/tests/data/13.test.out @@ -1 +1 @@ -<img src="JaVaScRiPt:alert("XSS&lt;WBR">")> \ No newline at end of file +<img src="JaVaScRiPt:alert(&quot;XSS<WBR">&quot;)> diff --git a/tests/data/14.test.out b/tests/data/14.test.out index 8e5ff754..04091589 100644 --- a/tests/data/14.test.out +++ b/tests/data/14.test.out @@ -1 +1 @@ -<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=&#106;&#97;&#118;&#97;&<wbr> \ No newline at end of file +<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;&#99;&#114;&#105;&#112;&<wbr></wbr>#116;&#58;&#97;</imgsrc=&#106;&#97;&#118;&#97;&<wbr> diff --git a/tests/data/15.test.out b/tests/data/15.test.out index 8b90245f..a7dc6e69 100644 --- a/tests/data/15.test.out +++ b/tests/data/15.test.out @@ -1 +1 @@ -le&<wbr></wbr>#114;t('XS<wbr></wbr>;S')> \ No newline at end of file +&#108;&#101;&<wbr></wbr>#114;&#116;&#40;&#39;&#88;&#83<wbr></wbr>;&#83;&#39;&#41> diff --git a/tests/data/16.test.out b/tests/data/16.test.out index 1ecb332b..c8e31d88 100644 --- a/tests/data/16.test.out +++ b/tests/data/16.test.out @@ -1 +1 @@ -<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr></wbr>#0000099ri&<wbr></wbr>#0000112t:&<wbr></wbr>#0000097le&<wbr></wbr>#0000114t(&<wbr></wbr>#0000039XS&<wbr></wbr>#0000083')></imgsrc=&#0000106&#0000097&<wbr> \ No newline at end of file 
+<imgsrc=&#0000106&#0000097&<wbr>#0000118&#0000097&#0000115&<wbr></wbr>#0000099&#0000114&#0000105&<wbr></wbr>#0000112&#0000116&#0000058&<wbr></wbr>#0000097&#0000108&#0000101&<wbr></wbr>#0000114&#0000116&#0000040&<wbr></wbr>#0000039&#0000088&#0000083&<wbr></wbr>#0000083&#0000039&#0000041></imgsrc=&#0000106&#0000097&<wbr> diff --git a/tests/data/17.test.out b/tests/data/17.test.out index ae928a99..8d47f574 100644 --- a/tests/data/17.test.out +++ b/tests/data/17.test.out @@ -1 +1 @@ -<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr></wbr>#x61lert(&<wbr></wbr>#x27XSS')></imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr> \ No newline at end of file +<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63&#x72&#x69&#x70&#x74&#x3A&<wbr></wbr>#x61&#x6C&#x65&#x72&#x74&#x28&<wbr></wbr>#x27&#x58&#x53&#x53&#x27&#x29></imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr> diff --git a/tests/data/18.test.out b/tests/data/18.test.out index 8046c715..e4fe2cf3 100644 --- a/tests/data/18.test.out +++ b/tests/data/18.test.out @@ -1 +1 @@ -<img src="jav ascript:alert(&lt;WBR&gt;'XSS');"> \ No newline at end of file +<img src="jav&#x09;ascript:alert(<WBR>'XSS');"> \ No newline at end of file diff --git a/tests/data/19.test.out b/tests/data/19.test.out index 8eb8794c..4daa11ad 100644 --- a/tests/data/19.test.out +++ b/tests/data/19.test.out @@ -1,2 +1 @@ -<img src="jav -ascript:alert(&lt;WBR&gt;'XSS');"> \ No newline at end of file +<img src="jav&#x0A;ascript:alert(<WBR>'XSS');"> \ No newline at end of file diff --git a/tests/test_security.py b/tests/test_security.py index 0eeb09c6..28e3cf2a 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -8,6 +8,17 @@ from bleach import clean +def test_escaped_entities(): + # html5lib unescapes character entities, so these would become ' and " + # which makes it possible to break out of html attributes. + # + # Verify that bleach.clean() doesn't unescape entities. 
+ assert ( + clean(''"') == + '&#39;&#34;' + ) + + def test_nested_script_tag(): assert ( clean('</script>') == @@ -105,7 +116,7 @@ def test_invalid_tag_char(): def test_unclosed_tag(): assert ( clean('& +-- +>"><script>alert("XSS")</script>& diff --git a/tests/data/10.test b/tests/data/10.test index 268771bc..a6db9f98 100644 --- a/tests/data/10.test +++ b/tests/data/10.test @@ -1 +1,3 @@ +-- +<img src="javascript:alert('XSS');"> diff --git a/tests/data/11.test b/tests/data/11.test index 16a49c70..37cbbfaf 100644 --- a/tests/data/11.test +++ b/tests/data/11.test @@ -1 +1,3 @@ +-- +<img src="javascript:alert('XSS')"> diff --git a/tests/data/12.test b/tests/data/12.test index d4b96e6f..04c7ea8a 100644 --- a/tests/data/12.test +++ b/tests/data/12.test @@ -1 +1,3 @@ +-- +<img src="JaVaScRiPt:alert('XSS')"> diff --git a/tests/data/13.test b/tests/data/13.test index 07279a83..36d4aaee 100644 --- a/tests/data/13.test +++ b/tests/data/13.test @@ -1 +1,3 @@ ")> +-- +<img src="JaVaScRiPt:alert(&quot;XSS<WBR">")> diff --git a/tests/data/14.test b/tests/data/14.test index b704c0b4..f154c73e 100644 --- a/tests/data/14.test +++ b/tests/data/14.test @@ -1 +1,3 @@ #115;crip&#116;:a +-- +<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=&#106;&#97;&#118;&#97;&<wbr> diff --git a/tests/data/15.test b/tests/data/15.test index b6a2de6b..c48c3e41 100644 --- a/tests/data/15.test +++ b/tests/data/15.test @@ -1 +1,3 @@ le&#114;t('XS;S')> +-- +le&<wbr></wbr>#114;t('X&#83<wbr></wbr>;S'&#41> diff --git a/tests/data/16.test b/tests/data/16.test index d66b5921..938240be 100644 --- a/tests/data/16.test +++ b/tests/data/16.test @@ -1 +1,3 @@ #0000118as&#0000099ri&#0000112t:&#0000097le&#0000114t(&#0000039XS&#0000083')> +-- 
+<imgsrc=&#0000106&#0000097&<wbr>#0000118&#0000097&#0000115&<wbr></wbr>#0000099&#0000114&#0000105&<wbr></wbr>#0000112&#0000116&#0000058&<wbr></wbr>#0000097&#0000108&#0000101&<wbr></wbr>#0000114&#0000116&#0000040&<wbr></wbr>#0000039&#0000088&#0000083&<wbr></wbr>#0000083&#0000039&#0000041></imgsrc=&#0000106&#0000097&<wbr> diff --git a/tests/data/17.test b/tests/data/17.test index 6e71b152..166e8845 100644 --- a/tests/data/17.test +++ b/tests/data/17.test @@ -1 +1,3 @@ #x63ript:&#x61lert(&#x27XSS')> +-- +<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63&#x72&#x69&#x70&#x74&#x3A&<wbr></wbr>#x61&#x6C&#x65&#x72&#x74&#x28&<wbr></wbr>#x27&#x58&#x53&#x53&#x27&#x29></imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr> diff --git a/tests/data/18.test b/tests/data/18.test index 1c173723..635461f8 100644 --- a/tests/data/18.test +++ b/tests/data/18.test @@ -1 +1,3 @@ +-- +<img src="jav&#x09;ascript:alert(<WBR>'XSS');"> diff --git a/tests/data/19.test b/tests/data/19.test index e6e79742..1a1ebe41 100644 --- a/tests/data/19.test +++ b/tests/data/19.test @@ -1 +1,3 @@ +-- +<img src="jav&#x0A;ascript:alert(<WBR>'XSS');"> diff --git a/tests/data/2.test b/tests/data/2.test index 21b93db3..aefcbe26 100644 --- a/tests/data/2.test +++ b/tests/data/2.test @@ -1 +1,3 @@ "> +-- +"><style>@import"javascript:alert('XSS')";</style> diff --git a/tests/data/20.test b/tests/data/20.test index 614b544f..ceae0bd8 100644 --- a/tests/data/20.test +++ b/tests/data/20.test @@ -1 +1,3 @@ +-- +<img src="jav&#x0D;ascript:alert(<WBR>'XSS');"> diff --git a/tests/data/3.test b/tests/data/3.test index 8dc3a4ee..67f3591b 100644 --- a/tests/data/3.test +++ b/tests/data/3.test @@ -1 +1,3 @@ >"'> +-- 
+>"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)></img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> diff --git a/tests/data/4.test b/tests/data/4.test index c4cf51cd..10438d81 100644 --- a/tests/data/4.test +++ b/tests/data/4.test @@ -1 +1,3 @@ ipt type="text/javascript">alert("foo");script> +-- +<scr<script>ipt type="text/javascript">alert("foo");script<del></del>></scr<script> diff --git a/tests/data/5.test b/tests/data/5.test index 0b03876b..dd45837a 100644 --- a/tests/data/5.test +++ b/tests/data/5.test @@ -1 +1,3 @@ >%22%27> +-- +>%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22></img%20src%3d%22javascript:alert(%27%20xss%27)%22> diff --git a/tests/data/7.test b/tests/data/7.test index 827f9b9e..73f5cab1 100644 --- a/tests/data/7.test +++ b/tests/data/7.test @@ -1 +1,3 @@ "> +-- +"> diff --git a/tests/data/8.test b/tests/data/8.test index ddf33a96..f5be4f25 100644 --- a/tests/data/8.test +++ b/tests/data/8.test @@ -1 +1,3 @@ >" +-- +>" diff --git a/tests/data/9.test b/tests/data/9.test index 9cf58659..26d27f78 100644 --- a/tests/data/9.test +++ b/tests/data/9.test @@ -1 +1,3 @@ '';!--"=&{()} +-- +'';!--"<xss>=&{()}</xss> diff --git a/tests/test_security.py b/tests/test_security.py index 4c710775..9dd49338 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -197,12 +197,15 @@ def get_tests(): return testcases -@pytest.mark.parametrize('fn, text', get_tests()) -def test_regressions(fn, text): +@pytest.mark.parametrize('fn, test_case', get_tests()) +def test_regressions(fn, test_case): """Regression tests for clean so we can see if there are issues""" - expected = six.text_type(open(fn + '.out', 'r').read()) + test_data, expected = 
test_case.split('\n--\n') # NOTE(willkg): This strips input and expected which makes it easier to # maintain the files. If there comes a time when the input needs whitespace # at the beginning or end, then we'll have to figure out something else. - assert clean(text.strip()) == expected.strip() + test_data = test_data.strip() + expected = expected.strip() + + assert clean(test_data) == expected From 588286152b0c24d2d2c9e68d4761c14f00ce88b6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 3 Mar 2018 10:57:04 -0500 Subject: [PATCH 145/314] Merge all the clean tests into one file and clean up * Moves test_security.py tests into test_clean.py * Removes duplicate tests and unhelpful tests * Adds additional helpful test cases * Reworks some tests to be easier and run to read by parametrizing them * Adds comments and adjusts function names to be more helpful --- tests/test_clean.py | 549 ++++++++++++++++++++++++++++++----------- tests/test_security.py | 211 ---------------- 2 files changed, 405 insertions(+), 355 deletions(-) delete mode 100644 tests/test_security.py diff --git a/tests/test_clean.py b/tests/test_clean.py index c5f78f73..799ae186 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -1,96 +1,210 @@ +import os + from html5lib.filters.base import Filter import pytest -import bleach +from bleach import clean from bleach.sanitizer import Cleaner -def test_empty(): - assert bleach.clean('') == '' +def test_clean_idempotent(): + """Make sure that applying the filter twice doesn't change anything.""" + dirty = 'invalid & < extra http://link.com' + assert clean(clean(dirty)) == clean(dirty) -def test_nbsp(): - assert bleach.clean(' test string ') == ' test string ' +def test_only_text_is_cleaned(): + some_text = 'text' + some_type = int + no_type = None + assert clean(some_text) == some_text -def test_comments_only(): - comment = '' - assert bleach.clean(comment) == '' - assert bleach.clean(comment, strip_comments=False) == comment + with 
pytest.raises(TypeError) as e: + clean(some_type) + assert "argument cannot be of 'type' type" in str(e) - open_comment = ''.format(open_comment) - ) + with pytest.raises(TypeError) as e: + clean(no_type) + assert "NoneType" in str(e) -def test_with_comments(): - text = 'Just text' - assert bleach.clean(text) == 'Just text' - assert bleach.clean(text, strip_comments=False) == text +def test_empty(): + assert clean('') == '' -def test_no_html(): - assert bleach.clean('no html string') == 'no html string' +def test_content_has_no_html(): + assert clean('no html string') == 'no html string' -def test_allowed_html(): - assert ( - bleach.clean('an allowed tag') == +@pytest.mark.parametrize('data, expected', [ + ( + 'an allowed tag', 'an allowed tag' - ) - assert ( - bleach.clean('another good tag') == + ), + + ( + 'another good tag', 'another good tag' ) +]) +def test_content_has_allowed_html(data, expected): + assert clean(data) == expected -def test_bad_html(): +def test_html_is_lowercased(): assert ( - bleach.clean('a fixed tag') == - 'a fixed tag' + clean('foo') == + 'foo' ) -def test_function_arguments(): - TAGS = ['span', 'br'] - ATTRS = {'span': ['style']} +@pytest.mark.parametrize('data, should_strip, expected', [ + # Regular comment + ( + '', + True, + '' + ), - text = 'a
test' - assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == - 'a
test' + # Open comment with no close comment bit + ( + '' + ), + ( + '' + ), + + # Comment with text to the right + ( + 'text', + True, + 'text' + ), + ( + 'text', + True, + 'text' + ), + ( + 'text', + False, + 'text' + ), + ( + 'text', + False, + 'text' + ), + + # Comment with text to the left + ( + 'text', + True, + 'text' + ), + ( + 'text', + True, + 'text' + ), + ( + 'text', + False, + 'text' + ), + ( + 'text', + False, + 'text' ) +]) +def test_comments(data, should_strip, expected): + assert clean(data, strip_comments=should_strip) == expected -def test_named_arguments(): - ATTRS = {'a': ['rel', 'href']} +@pytest.mark.parametrize('data, expected', [ + # Disallowed tag is escaped + ('', '<img src="javascript:alert(\'XSS\');">'), + + # Test with parens + ('a test', 'a <script>safe()</script> test'), + + # Test with braces + ('a test', 'a <style>body{}</style> test'), +]) +def test_disallowed_tags(data, expected): + assert clean(data) == expected - text = 'xx.com' - assert bleach.clean(text) == 'xx.com' + +def test_invalid_char_in_tag(): + # NOTE(willkg): Two possible outcomes because attrs aren't ordered + assert ( + clean('') in + [ + '<script src="http://xx.com/xss.js" xss=""></script>', + '<script xss="" src="http://xx.com/xss.js"></script>' + ] + ) assert ( - bleach.clean(text, attributes=ATTRS) == - 'xx.com' + clean('') == + '<script src="http://xx.com/xss.js"></script>' ) -def test_disallowed_html(): +def test_unclosed_tag(): + assert ( + clean('a fixed tag') == + 'a fixed tag' + ) assert ( - bleach.clean('a test') == - 'a <script>safe()</script> test' + clean('/script>') == + '<<script>script>evil()<</script>/script>' + ) + assert ( + clean('<script>evil()</script>') == + '<<x>script>evil()<</x>/script>' + ) + assert ( + clean('>evil()>') == + '<script<script>>evil()></script<script>' ) @@ -100,13 +214,14 @@ def test_bad_href(): ('tag < and entity', 'tag < and entity'), ]) def test_bare_entities(text, expected): - assert bleach.clean(text) == expected + 
assert clean(text) == expected @pytest.mark.parametrize('text, expected', [ # Test character entities ('&', '&'), (' ', ' '), + ('  test string  ', '  test string  '), ('<em>strong</em>', '<em>strong</em>'), # Test character entity at beginning of string @@ -154,75 +269,160 @@ def test_bare_entities(text, expected): # Test non-numeric entities ('&#', '&#'), - ('&#<', '&#<') + ('&#<', '&#<'), + + # html5lib tokenizer unescapes character entities, so these would become ' + # and " which makes it possible to break out of html attributes. + # + # Verify that clean() doesn't unescape entities. + (''"', ''"'), ]) def test_character_entities(text, expected): - assert bleach.clean(text) == expected + assert clean(text) == expected -def test_weird_strings(): - s = 'with html tags' - assert ( - bleach.clean(text, strip=True) == +@pytest.mark.parametrize('data, kwargs, expected', [ + # All tags are allowed, so it strips nothing + ( + 'a test with html tags', + {'strip': True}, 'a test with html tags' - ) + ), - text = 'a test with html tags' - assert ( - bleach.clean(text, strip=True) == + # img tag is disallowed, so it's stripped + ( + 'a test with html tags', + {'strip': True}, 'a test with html tags' - ) + ), - text = '

link text

' - assert ( - bleach.clean(text, tags=['p'], strip=True) == + # a tag is disallowed, so it's stripped + ( + '

link text

', + {'tags': ['p'], 'strip': True}, '

link text

' - ) - text = '

multiply nested text

' - assert ( - bleach.clean(text, tags=['p'], strip=True) == + ), + + # handle nested disallowed tag + ( + '

multiply nested text

', + {'tags': ['p'], 'strip': True}, '

multiply nested text

' - ) + ), - text = '

' - assert ( - bleach.clean(text, tags=['p', 'a'], strip=True) == + # handle disallowed tag that's deep in the tree + ( + '

', + {'tags': ['a', 'p'], 'strip': True}, '

' - ) + ), +]) +def test_stripping_tags(data, kwargs, expected): + assert clean(data, **kwargs) == expected + + +@pytest.mark.parametrize('data, expected', [ + ( + 'pt>alert(1)ipt>', + 'pt>alert(1)ipt>' + ), + ( + 'pt>pt>alert(1)', + 'pt>pt>alert(1)' + ), +]) +def test_stripping_tags_is_safe(data, expected): + """Test stripping tags shouldn't result in malicious content""" + assert clean(data, strip=True) == expected def test_allowed_styles(): + """Test allowed styles""" ATTRS = ['style'] STYLE = ['color'] assert ( - bleach.clean('', attributes=ATTRS) == + clean('', attributes=ATTRS) == '' ) text = '' - assert bleach.clean(text, attributes=ATTRS, styles=STYLE) == text + assert clean(text, attributes=ATTRS, styles=STYLE) == text text = '' assert ( - bleach.clean(text, attributes=ATTRS, styles=STYLE) == + clean(text, attributes=ATTRS, styles=STYLE) == '' ) -def test_lowercase_html(): - """We should output lowercase HTML.""" +def test_href_with_wrong_tag(): assert ( - bleach.clean('BAR', attributes=['class']) == - 'BAR' + clean('no link') == + 'no link' ) +def test_disallowed_attr(): + IMG = ['img', ] + IMG_ATTR = ['src'] + + assert ( + clean('test') == + 'test' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + + +def test_unquoted_attr_values_are_quoted(): + assert ( + clean('myabbr') == + 'myabbr' + ) + + +def test_unquoted_event_handler_attr_value(): + assert ( + clean('xx.com') == + 'xx.com' + ) + + +def test_invalid_filter_attr(): + IMG = ['img', ] + IMG_ATTR = { + 'img': lambda tag, name, val: name == 'src' and val == "http://example.com/" + } + + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + + +def test_poster_attribute(): + """Poster attributes should not allow javascript.""" + tags = ['video'] + attrs = {'video': ['poster']} + + test = '' + assert clean(test, tags=tags, attributes=attrs) 
== '' + + ok = '' + assert clean(ok, tags=tags, attributes=attrs) == ok + + def test_attributes_callable(): """Verify attributes can take a callable""" ATTRS = lambda tag, name, val: name == 'title' @@ -230,7 +430,7 @@ def test_attributes_callable(): text = u'example' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == u'example' ) @@ -245,7 +445,7 @@ def test_attributes_wildcard(): text = 'both can have ' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == 'both can have ' ) @@ -258,7 +458,7 @@ def test_attributes_wildcard_callable(): TAGS = ['a'] assert ( - bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + clean(u'example', tags=TAGS, attributes=ATTRS) == u'example' ) @@ -275,12 +475,12 @@ def img_test(tag, name, val): text = 'foo blah baz' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == u'foo baz' ) text = 'foo blah baz' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == u'foo baz' ) @@ -293,7 +493,7 @@ def test_attributes_tag_list(): TAGS = ['a'] assert ( - bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + clean(u'example', tags=TAGS, attributes=ATTRS) == u'example' ) @@ -305,11 +505,44 @@ def test_attributes_list(): text = u'example' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == u'example' ) +@pytest.mark.parametrize('data, kwargs, expected', [ + # javascript: is not allowed by default + ( + 'xss', + {}, + 'xss' + ), + + # File protocol is not allowed by default + ( + 'foo', + {}, + 'foo' + ), + + # Specified protocols are allowed + ( + 'allowed href', + {'protocols': ['myprotocol']}, + 'allowed href' + ), + + # Unspecified protocols are not allowed + ( + 'invalid href', + {'protocols': ['myprotocol']}, + 'invalid href' + ) +]) +def 
test_uri_value_allowed_protocols(data, kwargs, expected): + assert clean(data, **kwargs) == expected + + def test_svg_attr_val_allows_ref(): """Unescape values in svg attrs that allow url references""" # Local IRI, so keep it @@ -320,7 +553,7 @@ def test_svg_attr_val_allows_ref(): text = '' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == '' ) @@ -331,7 +564,7 @@ def test_svg_attr_val_allows_ref(): } text = '' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == '' ) @@ -353,7 +586,7 @@ def test_svg_allow_local_href(text, expected): ATTRS = { 'pattern': ['id', 'href'], } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + assert clean(text, tags=TAGS, attributes=ATTRS) == expected @pytest.mark.parametrize('text, expected', [ @@ -372,73 +605,77 @@ def test_svg_allow_local_href_nonlocal(text, expected): ATTRS = { 'pattern': ['id', 'href'], } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + assert clean(text, tags=TAGS, attributes=ATTRS) == expected + + +@pytest.mark.xfail(reason='regression from bleach 1.4') +def test_weird_strings(): + s = '= 0.99999999: changed API') +@pytest.mark.xfail(reason='regression from bleach 1.4') def test_sarcasm(): """Jokes should crash.""" - dirty = 'Yeah right ' - clean = 'Yeah right <sarcasm/>' - assert bleach.clean(dirty) == clean - + assert ( + clean('Yeah right ') == + 'Yeah right <sarcasm/>' + ) -def test_user_defined_protocols_valid(): - valid_href = 'allowed href' - assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href +@pytest.mark.parametrize('data, expected', [ + # Convert bell + ('1\a23', '1?23'), -def test_user_defined_protocols_invalid(): - invalid_href = 'invalid href' - cleaned_href = 'invalid href' - assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href + # Convert backpsace + ('1\b23', '1?23'), + # Convert formfeed + ('1\v23', 
'1?23'), -def test_filters(): - # Create a Filter that changes all the attr values to "moo" - class MooFilter(Filter): - def __iter__(self): - for token in Filter.__iter__(self): - if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: - for attr, value in token['data'].items(): - token['data'][attr] = 'moo' + # Convert vertical tab + ('1\f23', '1?23'), - yield token + # Convert a bunch of characters in a string + ('import y\bose\bm\bi\bt\be\b', 'import y?ose?m?i?t?e?'), +]) +def test_invisible_characters(data, expected): + assert clean(data) == expected - ATTRS = { - 'img': ['rel', 'src'] - } - TAGS = ['img'] - cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) +def get_tests(): + """Retrieves regression tests from data/ directory - dirty = 'this is cute! ' - assert ( - cleaner.clean(dirty) == - 'this is cute! ' - ) + :returns: list of ``(filename, filedata)`` tuples + """ + datadir = os.path.join(os.path.dirname(__file__), 'data') + tests = [ + os.path.join(datadir, fn) for fn in os.listdir(datadir) + if fn.endswith('.test') + ] + # Sort numerically which makes it easier to iterate through them + tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0])) -def test_clean_idempotent(): - """Make sure that applying the filter twice doesn't change anything.""" - dirty = 'invalid & < extra http://link.com' - assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty) + testcases = [ + (fn, open(fn, 'r').read()) for fn in tests + ] + return testcases -def test_only_text_is_cleaned(): - some_text = 'text' - some_type = int - no_type = None - assert bleach.clean(some_text) == some_text +@pytest.mark.parametrize('fn, test_case', get_tests()) +def test_regressions(fn, test_case): + """Regression tests for clean so we can see if there are issues""" + test_data, expected = test_case.split('\n--\n') - with pytest.raises(TypeError) as e: - bleach.clean(some_type) - assert "argument cannot be of 'type' type" in str(e) + # NOTE(willkg): 
This strips input and expected which makes it easier to + # maintain the files. If there comes a time when the input needs whitespace + # at the beginning or end, then we'll have to figure out something else. + test_data = test_data.strip() + expected = expected.strip() - with pytest.raises(TypeError) as e: - bleach.clean(no_type) - assert "NoneType" in str(e) + assert clean(test_data) == expected class TestCleaner: @@ -452,3 +689,27 @@ def test_basics(self): cleaner.clean('a
test') == 'a
test' ) + + def test_filters(self): + # Create a Filter that changes all the attr values to "moo" + class MooFilter(Filter): + def __iter__(self): + for token in Filter.__iter__(self): + if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: + for attr, value in token['data'].items(): + token['data'][attr] = 'moo' + + yield token + + ATTRS = { + 'img': ['rel', 'src'] + } + TAGS = ['img'] + + cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) + + dirty = 'this is cute! ' + assert ( + cleaner.clean(dirty) == + 'this is cute! ' + ) diff --git a/tests/test_security.py b/tests/test_security.py deleted file mode 100644 index 9dd49338..00000000 --- a/tests/test_security.py +++ /dev/null @@ -1,211 +0,0 @@ -"""More advanced security tests""" - -import os - -import pytest -import six - -from bleach import clean - - -def test_escaped_entities(): - # html5lib unescapes character entities, so these would become ' and " - # which makes it possible to break out of html attributes. - # - # Verify that bleach.clean() doesn't unescape entities. 
- assert ( - clean(''"') == - ''"' - ) - - -def test_nested_script_tag(): - assert ( - clean('</script>') == - '<<script>script>evil()<</script>/script>' - ) - assert ( - clean('<script>evil()</script>') == - '<<x>script>evil()<</x>/script>' - ) - - -def test_nested_script_tag_r(): - assert ( - clean('>evil()>') == - '<script<script>>evil()></script<script>' - ) - - -def test_invalid_attr(): - IMG = ['img', ] - IMG_ATTR = ['src'] - - assert ( - clean('test') == - 'test' - ) - assert ( - clean('', tags=IMG, attributes=IMG_ATTR) == - '' - ) - assert ( - clean('', tags=IMG, attributes=IMG_ATTR) == - '' - ) - - -def test_unquoted_attr(): - assert ( - clean('myabbr') == - 'myabbr' - ) - - -def test_unquoted_event_handler(): - assert ( - clean('xx.com') == - 'xx.com' - ) - - -def test_invalid_attr_value(): - assert ( - clean('') == - '<img src="javascript:alert(\'XSS\');">' - ) - - -def test_invalid_href_attr(): - assert ( - clean('xss') == - 'xss' - ) - - -def test_invalid_filter_attr(): - IMG = ['img', ] - IMG_ATTR = { - 'img': lambda tag, name, val: name == 'src' and val == "http://example.com/" - } - - assert ( - clean('', tags=IMG, attributes=IMG_ATTR) == - '' - ) - assert ( - clean('', tags=IMG, attributes=IMG_ATTR) == - '' - ) - - -def test_invalid_tag_char(): - assert ( - clean('') in - [ - '<script src="http://xx.com/xss.js" xss=""></script>', - '<script xss="" src="http://xx.com/xss.js"></script>' - ] - ) - assert ( - clean('') == - '<script src="http://xx.com/xss.js"></script>' - ) - - -def test_unclosed_tag(): - assert ( - clean('ipt>' - assert clean(s, strip=True) == 'pt>alert(1)ipt>' - s = 'pt>pt>alert(1)' - assert clean(s, strip=True) == 'pt>pt>alert(1)' - - -def test_poster_attribute(): - """Poster attributes should not allow javascript.""" - tags = ['video'] - attrs = {'video': ['poster']} - test = '' - assert clean(test, tags=tags, attributes=attrs) == '' - ok = '' - assert clean(ok, tags=tags, attributes=attrs) == ok - - -def test_feed_protocol(): - 
assert clean('foo') == 'foo' - - -@pytest.mark.parametrize('data, expected', [ - # Convert bell - ('1\a23', '1?23'), - - # Convert backpsace - ('1\b23', '1?23'), - - # Convert formfeed - ('1\v23', '1?23'), - - # Convert vertical tab - ('1\f23', '1?23'), - - # Convert a bunch of characters in a string - ('import y\bose\bm\bi\bt\be\b', 'import y?ose?m?i?t?e?'), -]) -def test_invisible_characters(data, expected): - assert clean(data) == expected - - -def get_tests(): - """Retrieves regression tests from data/ directory - - :returns: list of ``(filename, filedata)`` tuples - - """ - datadir = os.path.join(os.path.dirname(__file__), 'data') - tests = [ - os.path.join(datadir, fn) for fn in os.listdir(datadir) - if fn.endswith('.test') - ] - # Sort numerically which makes it easier to iterate through them - tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0])) - - testcases = [ - (fn, open(fn, 'r').read()) for fn in tests - ] - - return testcases - - -@pytest.mark.parametrize('fn, test_case', get_tests()) -def test_regressions(fn, test_case): - """Regression tests for clean so we can see if there are issues""" - test_data, expected = test_case.split('\n--\n') - - # NOTE(willkg): This strips input and expected which makes it easier to - # maintain the files. If there comes a time when the input needs whitespace - # at the beginning or end, then we'll have to figure out something else. 
- test_data = test_data.strip() - expected = expected.strip() - - assert clean(test_data) == expected From 18ecceb5f61896e1a88e8d965b1e61e860ded2a5 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 3 Mar 2018 11:15:49 -0500 Subject: [PATCH 146/314] Correct a regression comment and fix a test I misunderstood --- tests/test_clean.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_clean.py b/tests/test_clean.py index 799ae186..221addba 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -608,13 +608,12 @@ def test_svg_allow_local_href_nonlocal(text, expected): assert clean(text, tags=TAGS, attributes=ATTRS) == expected -@pytest.mark.xfail(reason='regression from bleach 1.4') def test_weird_strings(): s = '""" assert ( From d580f0abba6ae62da22e59be4355ea1d690eb1f5 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 3 Mar 2018 18:15:22 -0500 Subject: [PATCH 147/314] Fix MANIFEST and data_to_json.py related to recent changes I squashed test cases into single files--no more .out files. This carries that change through to MANIFEST.in and our tests_website system. --- MANIFEST.in | 2 +- tests_website/data_to_json.py | 38 +++++++++++++++++++---------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 1ae68e20..14ad79c7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -12,6 +12,6 @@ include docs/Makefile recursive-include docs *.rst -recursive-include tests *.py *.test *.out +recursive-include tests *.py *.test recursive-include tests_website *.html *.py *.rst diff --git a/tests_website/data_to_json.py b/tests_website/data_to_json.py index debe5a9d..5870d64c 100755 --- a/tests_website/data_to_json.py +++ b/tests_website/data_to_json.py @@ -2,12 +2,12 @@ """ Util to write a directory of test cases with input filenames -.test and output filenames .test.out as JSON to -stdout. +.test as JSON to stdout. 
-example: +example:: + + $ python tests/data_to_json.py tests/data > testcases.json -python tests/data_to_json.py tests/data > testcases.json """ import argparse @@ -21,29 +21,33 @@ def main(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument('data_dir', - help='directory containing test cases with input files' - ' named .test and output .test.out') + parser.add_argument( + 'data_dir', + help=( + 'directory containing test cases with names like .test' + ) + ) args = parser.parse_args() filenames = os.listdir(args.data_dir) ins = [os.path.join(args.data_dir, f) for f in filenames if fnmatch.fnmatch(f, '*.test')] - outs = [os.path.join(args.data_dir, f) for f in filenames if fnmatch.fnmatch(f, '*.test.out')] testcases = [] - for infn, outfn in zip(ins, outs): + for infn in ins: case_name = infn.rsplit('.test', 1)[0] - with open(infn, 'r') as fin, open(outfn, 'r') as fout: - payload = fin.read()[:-1] + with open(infn, 'r') as fin: + data, expected = fin.read().split('\n--\n') + data = data.strip() + expected = expected.strip() + testcases.append({ - "title": case_name, - "input_filename": infn, - "output_filename": outfn, - "payload": payload, - "actual": bleach.clean(payload), - "expected": fout.read(), + 'title': case_name, + 'input_filename': infn, + 'payload': data, + 'actual': bleach.clean(data), + 'expected': expected, }) print(json.dumps(testcases, indent=4, sort_keys=True)) From 73dfef1d3b96c2e432660d8d2f2e9d0eaa230e36 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 3 Mar 2018 09:58:37 -0500 Subject: [PATCH 148/314] Fix url sanitizing Fixes a security issue where url sanitizing wouldn't work if there were character entities breaking up the scheme. This allowed javascript urls even when they were not explicitly allowed. 
--- bleach/sanitizer.py | 134 ++++++++++++++++++++++++++++++++++++++------ tests/test_clean.py | 98 ++++++++++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 22 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 81df765b..ac6a55cb 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -4,6 +4,7 @@ import string import six +from six.moves.urllib.parse import urlparse from xml.sax.saxutils import unescape import html5lib @@ -27,8 +28,11 @@ from bleach.utils import alphabetize_attributes, force_unicode +#: Map of entity name to expanded entity +ENTITIES = entities + #: Trie of html entity string -> character representation -ENTITIES_TRIE = Trie(entities) +ENTITIES_TRIE = Trie(ENTITIES) #: List of allowed tags ALLOWED_TAGS = [ @@ -79,13 +83,61 @@ INVISIBLE_REPLACEMENT_CHAR = '?' +def convert_entity(value): + """Convert an entity (minus the & and ; part) into what it represents + + This handles numeric, hex, and text entities. + + :arg value: the string (minus the ``&`` and ``;`` part) to convert + + :returns: unicode character + + """ + if value[0] == '#': + if value[1] in ('x', 'X'): + return six.unichr(int(value[2:], 16)) + return six.unichr(int(value[1:], 10)) + + return ENTITIES[value] + + +def convert_entities(text): + """Converts all found entities in the text + + :arg text: the text to convert entities in + + :returns: unicode text with converted entities + + """ + if '&' not in text: + return text + + new_text = [] + for part in next_possible_entity(text): + if not part: + continue + + if part.startswith('&'): + entity = match_entity(part) + if entity is not None: + new_text.append(convert_entity(entity)) + remainder = part[len(entity) + 2:] + if part: + new_text.append(remainder) + continue + + new_text.append(part) + + return u''.join(new_text) + + class BleachHTMLTokenizer(HTMLTokenizer): def consumeEntity(self, allowedChar=None, fromAttribute=False): # We don't want to consume and convert entities, so this overrides 
the # html5lib tokenizer's consumeEntity so that it's now a no-op. # # However, when that gets called, it's consumed an &, so we put that in - # the steam. + # the stream. if fromAttribute: self.currentToken['data'][-1][1] += '&' @@ -479,15 +531,69 @@ def sanitize_characters(self, token): new_tokens.append({'type': 'Entity', 'name': entity}) # Length of the entity plus 2--one for & at the beginning # and and one for ; at the end - part = part[len(entity) + 2:] - if part: - new_tokens.append({'type': 'Characters', 'data': part}) + remainder = part[len(entity) + 2:] + if remainder: + new_tokens.append({'type': 'Characters', 'data': remainder}) continue new_tokens.append({'type': 'Characters', 'data': part}) return new_tokens + def sanitize_uri_value(self, value, allowed_protocols): + """Checks a uri value to see if it's allowed + + :arg value: the uri value to sanitize + :arg allowed_protocols: list of allowed protocols + + :returns: allowed value or None + + """ + # NOTE(willkg): This transforms the value into one that's easier to + # match and verify, but shouldn't get returned since it's vastly + # different than the original value. 
+ + # Convert all character entities in the value + new_value = convert_entities(value) + + # Nix single quote, whitespace, and non-printable charcters + new_value = re.sub( + "[`\000-\040\177-\240\s]+", + '', + new_value + ) + + # Remove REPLACEMENT characters + new_value = new_value.replace('\ufffd', '') + + # Lowercase it--this breaks the value, but makes it easier to match + # against + new_value = new_value.lower() + + # Drop attributes with uri values that have protocols that aren't + # allowed + parsed = urlparse(new_value) + if parsed.scheme: + # If urlparse found a scheme, check that + if parsed.scheme in allowed_protocols: + return value + + else: + # Allow uris that are just an anchor + if new_value.startswith('#'): + return value + + # Handle protocols that urlparse doesn't recognize like "myprotocol" + if ':' in new_value and new_value.split(':')[0] in allowed_protocols: + return value + + # If there's no protocol/scheme specified, then assume it's "http" + # and see if that's allowed + if 'http' in allowed_protocols: + return value + + return None + def allow_token(self, token): """Handles the case where we're allowing the tag""" if 'data' in token: @@ -508,21 +614,13 @@ def allow_token(self, token): if not self.attr_filter(token['name'], name, val): continue - # Look at attributes that have uri values + # Drop attributes with uri values that use a disallowed protocol + # Sanitize attributes with uri values if namespaced_name in self.attr_val_is_uri: - val_unescaped = re.sub( - "[`\000-\040\177-\240\s]+", - '', - unescape(val)).lower() - - # Remove replacement characters from unescaped characters. 
- val_unescaped = val_unescaped.replace("\ufffd", "") - - # Drop attributes with uri values that have protocols that - # aren't allowed - if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and - (val_unescaped.split(':')[0] not in self.allowed_protocols)): + new_value = self.sanitize_uri_value(val, self.allowed_protocols) + if new_value is None: continue + val = new_value # Drop values in svg attrs with non-local IRIs if namespaced_name in self.svg_attr_val_allows_ref: diff --git a/tests/test_clean.py b/tests/test_clean.py index 221addba..f680e8e1 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -213,7 +213,7 @@ def test_nested_script_tag(): ('an < entity', 'an < entity'), ('tag < and entity', 'tag < and entity'), ]) -def test_bare_entities(text, expected): +def test_bare_entities_get_escaped_correctly(text, expected): assert clean(text) == expected @@ -277,7 +277,7 @@ def test_bare_entities(text, expected): # Verify that clean() doesn't unescape entities. (''"', ''"'), ]) -def test_character_entities(text, expected): +def test_character_entities_handling(text, expected): assert clean(text) == expected @@ -534,10 +534,100 @@ def test_attributes_list(): # Unspecified protocols are not allowed ( - 'invalid href', + 'invalid href', {'protocols': ['myprotocol']}, 'invalid href' - ) + ), + + # Anchors are ok + ( + 'foo', + {'protocols': []}, + 'foo' + ), + + # Allow implicit http if allowed + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + + # Disallow implicit http if disallowed + ( + 'foo', + {'protocols': []}, + 'foo' + ), + ( + 'foo', + {'protocols': []}, + 'foo' + ), + ( + 'foo', + {'protocols': []}, + 'foo' + ), + ( + 'foo', + {'protocols': []}, + 'foo' + ), 
+ ( + 'foo', + {'protocols': []}, + 'foo' + ), + ( + 'foo', + {'protocols': []}, + 'foo' + ), + + # Disallowed protocols with sneaky character entities + ( + 'alert', + {}, + 'alert' + ), + ( + 'alert', + {}, + 'alert' + ), + + # Checking the uri should change it at all + ( + 'foo', + {}, + 'foo' + ), ]) def test_uri_value_allowed_protocols(data, kwargs, expected): assert clean(data, **kwargs) == expected From 61bf0e6db3bdce6294633555e08dd061af465c3c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 5 Mar 2018 16:08:49 -0500 Subject: [PATCH 149/314] Fix errant comment --- bleach/sanitizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index ac6a55cb..56f6d960 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -556,7 +556,7 @@ def sanitize_uri_value(self, value, allowed_protocols): # Convert all character entities in the value new_value = convert_entities(value) - # Nix single quote, whitespace, and non-printable charcters + # Nix backtick, space characters, and control characters new_value = re.sub( "[`\000-\040\177-\240\s]+", '', From 9584f42051c0039cb0f27a617e8ab3e945018cc6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 5 Mar 2018 16:33:03 -0500 Subject: [PATCH 150/314] Prep for 2.1.3 release --- CHANGES | 30 +++++++++++++++++++++++++++++- bleach/__init__.py | 4 ++-- docs/dev.rst | 2 +- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 47bf3906..25789814 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,34 @@ -Bleach Changes +Bleach changes ============== +Version 2.1.3 (March 5th, 2018) +------------------------------- + +**Security fixes** + +* Attributes that have URI values weren't properly sanitized if the + values contained character entities. Using character entities, it + was possible to construct a URI value with a scheme that was not + allowed that would slide through unsanitized. 
+ + This security issue was introduced in Bleach 2.1. Anyone using + Bleach 2.1 is highly encouraged to upgrade. + + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +* Fixed some other edge cases for attribute URI value sanitizing and + improved testing of this code. + + Version 2.1.2 (December 7th, 2017) ---------------------------------- diff --git a/bleach/__init__.py b/bleach/__init__.py index 8ed2c516..b81b0bbe 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -33,9 +33,9 @@ # yyyymmdd -__releasedate__ = '' +__releasedate__ = '20180305' # x.y.z or x.y.z.dev0 -- semver -__version__ = '2.1.3.dev0' +__version__ = '2.1.3' VERSION = parse_version(__version__) diff --git a/docs/dev.rst b/docs/dev.rst index d27a62ed..b0302524 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -74,7 +74,7 @@ Release process 3. Run the doctests:: $ cd docs/ - $ make doctests + $ make doctest 4. Verify everything works From 3e9b9ec55bbec5906800c3838d0840b4741f74d9 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 6 Mar 2018 09:26:05 -0500 Subject: [PATCH 151/314] Add tests for sanitizing urls in css properties --- bleach/sanitizer.py | 4 ++-- tests/test_css.py | 52 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 56f6d960..09cae199 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -684,10 +684,10 @@ def disallowed_token(self, token): def sanitize_css(self, style): """Sanitizes css in style tags""" - # disallow urls + # Drop any url values style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) - # gauntlet + # The gauntlet of sanitization # Validate the css in the style tag and if it's not valid, then drop # the whole thing. 
diff --git a/tests/test_css.py b/tests/test_css.py index d8880d78..ad81f594 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -66,7 +66,6 @@ ), ]) def test_allowed_css(data, styles, expected): - p_single = '

bar

' p_double = "

bar

" @@ -89,6 +88,57 @@ def test_valid_css(): ) +@pytest.mark.parametrize('data, expected', [ + # No url--unchanged + ( + '

foo

', + '

foo

' + ), + + # Verify urls with no quotes, single quotes, and double quotes are all dropped + ( + '

foo

', + '

foo

' + ), + ( + '

foo

', + '

foo

' + ), + ( + '

foo

', + '

foo

' + ), + + # Verify urls with spacing + ( + '

foo

', + '

foo

' + ), + ( + '

foo

', + '

foo

' + ), + ( + '

foo

', + '

foo

' + ), + ( + '

foo

', + '

foo

' + ), + + # Verify urls with character entities--this isn't valid, so the entire + # property is dropped + ( + '

foo

', + '

foo

' + ), + +]) +def test_urls(data, expected): + assert clean(data, styles=['background']) == expected + + def test_style_hang(): """The sanitizer should not hang on any inline styles""" style = [ From 28e7c3292bded1e91d194117e7d4d93ce855d698 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 16 Mar 2018 11:34:38 -0400 Subject: [PATCH 152/314] Handle ambiguous ampersands correctly This fixes the ambiguous ampersand case in character entity handling in attribute values. Fixes #359 --- bleach/sanitizer.py | 24 ++++++++++++++++-------- tests/test_clean.py | 31 +++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 09cae199..12225efd 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -90,7 +90,8 @@ def convert_entity(value): :arg value: the string (minus the ``&`` and ``;`` part) to convert - :returns: unicode character + :returns: unicode character or None if it's an ambiguous ampersand that + doesn't match a character entity """ if value[0] == '#': @@ -98,7 +99,7 @@ def convert_entity(value): return six.unichr(int(value[2:], 16)) return six.unichr(int(value[1:], 10)) - return ENTITIES[value] + return ENTITIES.get(value, None) def convert_entities(text): @@ -120,11 +121,16 @@ def convert_entities(text): if part.startswith('&'): entity = match_entity(part) if entity is not None: - new_text.append(convert_entity(entity)) - remainder = part[len(entity) + 2:] - if part: - new_text.append(remainder) - continue + converted = convert_entity(entity) + + # If it's not an ambiguous ampersand, then replace with the + # unicode character. Otherwise, we leave the entity in. 
+ if converted is not None: + new_text.append(converted) + remainder = part[len(entity) + 2:] + if part: + new_text.append(remainder) + continue new_text.append(part) @@ -731,7 +737,9 @@ def escape_base_amp(self, stoken): if part.startswith('&'): entity = match_entity(part) - if entity is not None: + # Only leave entities in that are not ambiguous. If they're + # ambiguous, then we escape the ampersand. + if entity is not None and convert_entity(entity) is not None: yield '&' + entity + ';' # Length of the entity plus 2--one for & at the beginning diff --git a/tests/test_clean.py b/tests/test_clean.py index f680e8e1..1f3cbfc8 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -4,7 +4,7 @@ import pytest from bleach import clean -from bleach.sanitizer import Cleaner +from bleach.sanitizer import convert_entities, Cleaner def test_clean_idempotent(): @@ -246,7 +246,7 @@ def test_bare_entities_get_escaped_correctly(text, expected): 'http://example.com?active=true&current=true' ), - # Test entities in HTML attributes + # Test character entities in attribute values are left alone ( 'foo', 'foo' @@ -255,11 +255,20 @@ def test_bare_entities_get_escaped_correctly(text, expected): 'foo', 'foo' ), + + # Ambiguous ampersands get escaped in attributes + ( + 'foo', + 'foo' + ), ( 'foo', 'foo' ), + # Ambiguous ampersands in text are not escaped + ('&xx;', '&xx;'), + # Test numeric entities (''', '''), ('"', '"'), @@ -732,6 +741,24 @@ def test_invisible_characters(data, expected): assert clean(data) == expected +@pytest.mark.parametrize('data, expected', [ + # Strings without character entities pass through as is + ('', ''), + ('abc', 'abc'), + + # Handles character entities--both named and numeric + (' ', u'\xa0'), + (' ', ' '), + (' ', ' '), + + # Handles ambiguous ampersand + ('&xx;', '&xx;'), +]) +def test_convert_entities(data, expected): + print(repr(convert_entities(data))) + assert convert_entities(data) == expected + + def get_tests(): """Retrieves regression 
tests from data/ directory From 9818ffb81a362f4d141835a291225c2e65706ae2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sun, 18 Mar 2018 09:02:59 -0400 Subject: [PATCH 153/314] Add regression test with character entity in url --- tests/data/6.test | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tests/data/6.test diff --git a/tests/data/6.test b/tests/data/6.test new file mode 100644 index 00000000..7755c813 --- /dev/null +++ b/tests/data/6.test @@ -0,0 +1,3 @@ +hi +-- +hi From a65f5c8ea664abbd54b4c711ebd0ca26c3509b7e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 19 Mar 2018 14:39:15 -0400 Subject: [PATCH 154/314] Update CHANGES --- CHANGES | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/CHANGES b/CHANGES index 25789814..5a9d5f84 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,26 @@ Bleach changes ============== +Version 2.1.4 (In development) +------------------------------ + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +* Handle ambiguous ampersands in correctly. (#359) + + Version 2.1.3 (March 5th, 2018) ------------------------------- @@ -14,6 +34,7 @@ Version 2.1.3 (March 5th, 2018) This security issue was introduced in Bleach 2.1. Anyone using Bleach 2.1 is highly encouraged to upgrade. + https://bugzilla.mozilla.org/show_bug.cgi?id=1442745 **Backwards incompatible changes** From 3f2270e42582d8f2d7392a54edff997b8675c797 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 19 Mar 2018 17:46:31 -0400 Subject: [PATCH 155/314] Handle nonexistent namespaces better Issue 352 has a string that manages to tokenize an html attribute with a namespace, but no name. Then the namespace doesn't exist in prefixes and that throws a KeyError. This alleviates that a bit such that if there's a namespace, but no name, it swaps the two values. Further, if prefixes doesn't have the namespace, then it ignores the namespace. 
Fixes #352 --- bleach/sanitizer.py | 14 +++++++++++++- tests/test_clean.py | 13 +++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 12225efd..faf8fd7a 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -668,8 +668,20 @@ def disallowed_token(self, token): assert token_type in ("StartTag", "EmptyTag") attrs = [] for (ns, name), v in token["data"].items(): + # If we end up with a namespace, but no name, switch them so we + # have a valid name to use. + if ns and not name: + ns, name = name, ns + + # Figure out namespaced name if the namespace is appropriate + # and exists; if the ns isn't in prefixes, then drop it. + if ns is None or ns not in prefixes: + namespaced_name = name + else: + namespaced_name = '%s:%s' % (prefixes[ns], name) + attrs.append(' %s="%s"' % ( - name if ns is None else "%s:%s" % (prefixes[ns], name), + namespaced_name, # NOTE(willkg): HTMLSerializer escapes attribute values # already, so if we do it here (like HTMLSerializer does), # then we end up double-escaping. diff --git a/tests/test_clean.py b/tests/test_clean.py index 1f3cbfc8..9547d631 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -759,6 +759,19 @@ def test_convert_entities(data, expected): assert convert_entities(data) == expected +def test_nonexistent_namespace(): + """Verify if the namespace doesn't exist, it doesn't fail with a KeyError + + The tokenizer creates "c" as a namespace and that doesn't exist in the map + of namespaces, so then it fails with a KeyError. I don't understand why the + tokenizer makes "c" into a namespace in this string. + + Issue #352. 
+ + """ + assert clean('') == '<d c=""></d>' + + def get_tests(): """Retrieves regression tests from data/ directory From 46fa500e2b3275af09e888feb495d1fcd541fb00 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 21 Mar 2018 21:16:19 -0400 Subject: [PATCH 156/314] Convert entities in CSS values before sanitizing The CSS is in an HTML attribute value, so we need to convert character entities in it which makes it proper CSS before we can sanitize it. Fixes #363 --- bleach/sanitizer.py | 5 ++++- tests/test_css.py | 22 +++++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index faf8fd7a..7e5d0361 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -702,7 +702,10 @@ def disallowed_token(self, token): def sanitize_css(self, style): """Sanitizes css in style tags""" - # Drop any url values + # Convert entities in the style so that it can be parsed as CSS + style = convert_entities(style) + + # Drop any url values before we do anything else style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) # The gauntlet of sanitization diff --git a/tests/test_css.py b/tests/test_css.py index ad81f594..12f27f3c 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -127,11 +127,10 @@ def test_valid_css(): '

foo

' ), - # Verify urls with character entities--this isn't valid, so the entire - # property is dropped + # Verify urls with character entities ( '

foo

', - '

foo

' + '

foo

' ), ]) @@ -201,3 +200,20 @@ def test_style_hang(): ) assert clean(html, styles=styles) == expected + + +@pytest.mark.parametrize('data, styles, expected', [ + ( + '

text

', + ['font-family', 'white-space'], + '

text

' + ), + ( + '

text

', + ['font-family', 'white-space'], + '

text

' + ), +]) +def test_css_parsing_with_entities(data, styles, expected): + """The sanitizer should be ok with character entities""" + assert clean(data, tags=['p'], attributes={'p': ['style']}, styles=styles) == expected From f1f04f6580e24bd1b977b8be0a1bc1e5d5f944da Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 7 Jun 2018 13:59:46 -0400 Subject: [PATCH 157/314] Nix pinning in dev requirements --- requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5cfec7f1..758459aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ -e . -# Requirements to run the test suite: -pytest==3.0.6 +# Requirements to run the test suite +pytest pytest-wholenodeid -flake8==3.3.0 -tox==2.4.1 +flake8 +tox # Requirements for building docs -Sphinx==1.5.2 +Sphinx # Requirements for updating package -twine==1.8.1 +twine From 8f6c2ea0b1155716ced070d87dc2c9d4f664ddcb Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 7 Jun 2018 14:08:11 -0400 Subject: [PATCH 158/314] Change requirements.txt to requirements-dev.txt This change makes it clearer what the file is for. --- MANIFEST.in | 2 +- docs/dev.rst | 2 +- requirements.txt => requirements-dev.txt | 0 tox.ini | 6 +++--- 4 files changed, 5 insertions(+), 5 deletions(-) rename requirements.txt => requirements-dev.txt (100%) diff --git a/MANIFEST.in b/MANIFEST.in index 14ad79c7..5a0f3385 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,7 +2,7 @@ include CHANGES include CONTRIBUTORS include CONTRIBUTING.rst include CODE_OF_CONDUCT.rst -include requirements.txt +include requirements-dev.txt include tox.ini include LICENSE include README.rst diff --git a/docs/dev.rst b/docs/dev.rst index b0302524..abeaf913 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -50,7 +50,7 @@ Release process 1. Checkout master tip. -2. Check to make sure ``setup.py`` and ``requirements.txt`` are +2. 
Check to make sure ``setup.py`` and ``requirements-dev.txt`` are correct and match requirements-wise. 3. Update version numbers in ``bleach/__init__.py``. diff --git a/requirements.txt b/requirements-dev.txt similarity index 100% rename from requirements.txt rename to requirements-dev.txt diff --git a/tox.ini b/tox.ini index d44521c9..c58bd532 100644 --- a/tox.ini +++ b/tox.ini @@ -18,7 +18,7 @@ basepython = py35: python3.5 py36: python3.6 deps = - -rrequirements.txt + -rrequirements-dev.txt html5lib99999999: html5lib==0.99999999 html5lib999999999: html5lib==0.999999999 html5lib10b9: html5lib==1.0b9 @@ -59,7 +59,7 @@ commands = [testenv:lint] basepython = python deps = - -rrequirements.txt + -rrequirements-dev.txt commands = flake8 bleach/ @@ -67,6 +67,6 @@ commands = basepython = python changedir = docs deps = - -rrequirements.txt + -rrequirements-dev.txt commands = sphinx-build -b html -d {envtmpdir}/doctrees . {envtmpdir}/html From 63076f4420498571027bb853703f06b3bfd469ff Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 7 Jun 2018 14:33:19 -0400 Subject: [PATCH 159/314] Fix lint and docs tox environments --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index c58bd532..a5538a7b 100644 --- a/tox.ini +++ b/tox.ini @@ -57,14 +57,14 @@ commands = python setup.py build [testenv:lint] -basepython = python +basepython = python3.6 deps = -rrequirements-dev.txt commands = flake8 bleach/ [testenv:docs] -basepython = python +basepython = python3.6 changedir = docs deps = -rrequirements-dev.txt From 9959a1a57c1574806e24ea29209af882c5bdbd95 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 7 Jun 2018 14:35:53 -0400 Subject: [PATCH 160/314] Update CHANGES re: Python 3.3 support --- CHANGES | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 5a9d5f84..5a01cd8a 100644 --- a/CHANGES +++ b/CHANGES @@ -10,7 +10,7 @@ None **Backwards incompatible changes** -None +* Dropped 
support for Python 3.3. (#328) **Features** From b8aae5660693f4d30d76a0b8e7525af1adcbc3cc Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 7 Jun 2018 15:13:27 -0400 Subject: [PATCH 161/314] Fix requirements file name in travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b6eea407..dfecccf7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,7 @@ env: install: # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5 - pip install -U pip setuptools>=18.5 - - pip install -r requirements.txt + - pip install -r requirements-dev.txt # stomp on html5lib install with the specified one - pip install html5lib==$HTML5LIB script: From 9319ec77a06c582bd5e7726c0b3c69139ad67732 Mon Sep 17 00:00:00 2001 From: Antoine Leclair Date: Fri, 29 Jun 2018 13:12:31 -0400 Subject: [PATCH 162/314] Fix error when parsing invalid URI --- CONTRIBUTORS | 1 + bleach/sanitizer.py | 11 ++++++++--- tests/test_clean.py | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 94276246..5783ab17 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -25,6 +25,7 @@ Contributors: - Alireza Savand - Andreas Malecki - Andy Freeland +- Antoine Leclair - Anton Kovalyov - Chris Beaven - Dan Gayle diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 7e5d0361..31f12400 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -576,9 +576,14 @@ def sanitize_uri_value(self, value, allowed_protocols): # against new_value = new_value.lower() - # Drop attributes with uri values that have protocols that aren't - # allowed - parsed = urlparse(new_value) + try: + # Drop attributes with uri values that have protocols that aren't + # allowed + parsed = urlparse(new_value) + except ValueError: + # URI is impossible to parse, therefore it's not allowed + return None + if parsed.scheme: # If urlparse found a scheme, check that if parsed.scheme in allowed_protocols: diff --git 
a/tests/test_clean.py b/tests/test_clean.py index 9547d631..951d5b2a 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -58,6 +58,9 @@ def test_html_is_lowercased(): 'foo' ) +def test_invalid_uri_does_not_raise_error(): + assert clean('text') == 'text' + @pytest.mark.parametrize('data, should_strip, expected', [ # Regular comment From 8f88b41810ef82f5a1204e45ad8d6c9329b0c0b1 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 19 Jul 2018 20:56:38 -0400 Subject: [PATCH 163/314] Sync travis and tox environments This makes sure travis and tox are testing Bleach with the same configurations. --- .travis.yml | 22 +++++++++++++--------- tox.ini | 2 ++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index dfecccf7..cd05d9aa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,17 +1,21 @@ +# Note: If you update this, make sure to update tox.ini, too. sudo: false language: python cache: directories: - "~/.cache/pip" python: -- "2.7" -- "3.4" -- "3.5" -- "3.6" -- "pypy" + - "2.7" + - "3.4" + - "3.5" + - "3.6" + - "pypy" env: -- HTML5LIB=0.99999999 # 8 -- HTML5LIB=0.999999999 # 9 + - HTML5LIB=0.99999999 # 8 + - HTML5LIB=0.999999999 # 9 + - HTML5LIB=1.0b9 + - HTML5LIB=1.0b10 + - HTML5LIB=1.0.1 install: # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5 - pip install -U pip setuptools>=18.5 @@ -19,8 +23,8 @@ install: # stomp on html5lib install with the specified one - pip install html5lib==$HTML5LIB script: -- py.test -- flake8 bleach/ + - py.test + - flake8 bleach/ deploy: provider: pypi user: jezdez diff --git a/tox.ini b/tox.ini index a5538a7b..d5539644 100644 --- a/tox.ini +++ b/tox.ini @@ -3,6 +3,8 @@ # test suite on all supported python versions. To use it, "pip install tox" # and then run "tox" from this directory. +# Note: If you update this, make sure to update .travis.yml, too. 
+ [tox] envlist = py{27,34,35,36}-html5lib{99999999,999999999,10b9,10b10,101} From 9960da4ddd777627fc39d8c1f4a36923102af06d Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 16 Aug 2018 16:30:33 -0400 Subject: [PATCH 164/314] Update for v2.1.4 release --- CHANGES | 4 ++-- bleach/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 5a01cd8a..fd17745b 100644 --- a/CHANGES +++ b/CHANGES @@ -1,8 +1,8 @@ Bleach changes ============== -Version 2.1.4 (In development) ------------------------------- +Version 2.1.4 (August 16th, 2018) +--------------------------------- **Security fixes** diff --git a/bleach/__init__.py b/bleach/__init__.py index b81b0bbe..d0d84029 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -33,9 +33,9 @@ # yyyymmdd -__releasedate__ = '20180305' +__releasedate__ = '20180816' # x.y.z or x.y.z.dev0 -- semver -__version__ = '2.1.3' +__version__ = '2.1.4' VERSION = parse_version(__version__) From ff6e5c53d8888570f06d905cf31f2132b3b946a6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 16 Aug 2018 16:47:58 -0400 Subject: [PATCH 165/314] Update for 2.1.5 development --- CHANGES | 20 ++++++++++++++++++++ bleach/__init__.py | 4 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/CHANGES b/CHANGES index fd17745b..ddd3e0a2 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,26 @@ Bleach changes ============== +Version 2.1.5 (in development) +------------------------------ + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +None + + Version 2.1.4 (August 16th, 2018) --------------------------------- diff --git a/bleach/__init__.py b/bleach/__init__.py index d0d84029..367fbf42 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -33,9 +33,9 @@ # yyyymmdd -__releasedate__ = '20180816' +__releasedate__ = '' # x.y.z or x.y.z.dev0 -- semver -__version__ = '2.1.4' +__version__ = '2.1.5.dev0' VERSION = 
parse_version(__version__) From a507a4ed7e37cd594b8af5b4722bd6b058e9c2c2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 27 Aug 2018 09:08:47 -0400 Subject: [PATCH 166/314] Drop easy_install instructions Fixes #373 --- README.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.rst b/README.rst index 5f151dc7..6622ee46 100644 --- a/README.rst +++ b/README.rst @@ -58,10 +58,6 @@ Bleach is available on PyPI_, so you can install it with ``pip``:: $ pip install bleach -Or with ``easy_install``:: - - $ easy_install bleach - Upgrading Bleach ================ From 7970857c78bec0060f527277a91a8ca72aaabe8d Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sun, 26 Aug 2018 15:21:35 -0400 Subject: [PATCH 167/314] vendor html5lib 1.0.1 This vendors html5lib 1.0.1 and in doing that, drops the requirement to install html5lib. Fixes #386 --- .gitignore | 6 +- CHANGES | 3 +- MANIFEST.in | 3 +- bleach/__init__.py | 15 - bleach/_vendor/README.rst | 21 + bleach/_vendor/__init__.py | 0 .../html5lib-1.0.1.dist-info/DESCRIPTION.rst | 489 +++ .../html5lib-1.0.1.dist-info/INSTALLER | 1 + .../html5lib-1.0.1.dist-info/LICENSE.txt | 20 + .../_vendor/html5lib-1.0.1.dist-info/METADATA | 530 +++ .../_vendor/html5lib-1.0.1.dist-info/RECORD | 42 + bleach/_vendor/html5lib-1.0.1.dist-info/WHEEL | 6 + .../html5lib-1.0.1.dist-info/metadata.json | 1 + .../html5lib-1.0.1.dist-info/top_level.txt | 1 + bleach/_vendor/html5lib/__init__.py | 35 + bleach/_vendor/html5lib/_ihatexml.py | 288 ++ bleach/_vendor/html5lib/_inputstream.py | 923 ++++++ bleach/_vendor/html5lib/_tokenizer.py | 1721 ++++++++++ bleach/_vendor/html5lib/_trie/__init__.py | 14 + bleach/_vendor/html5lib/_trie/_base.py | 37 + bleach/_vendor/html5lib/_trie/datrie.py | 44 + bleach/_vendor/html5lib/_trie/py.py | 67 + bleach/_vendor/html5lib/_utils.py | 124 + bleach/_vendor/html5lib/constants.py | 2947 +++++++++++++++++ bleach/_vendor/html5lib/filters/__init__.py | 0 .../filters/alphabeticalattributes.py | 29 + 
bleach/_vendor/html5lib/filters/base.py | 12 + .../html5lib/filters/inject_meta_charset.py | 73 + bleach/_vendor/html5lib/filters/lint.py | 93 + .../_vendor/html5lib/filters/optionaltags.py | 207 ++ bleach/_vendor/html5lib/filters/sanitizer.py | 896 +++++ bleach/_vendor/html5lib/filters/whitespace.py | 38 + bleach/_vendor/html5lib/html5parser.py | 2791 ++++++++++++++++ bleach/_vendor/html5lib/serializer.py | 409 +++ .../_vendor/html5lib/treeadapters/__init__.py | 30 + .../_vendor/html5lib/treeadapters/genshi.py | 54 + bleach/_vendor/html5lib/treeadapters/sax.py | 50 + .../_vendor/html5lib/treebuilders/__init__.py | 88 + bleach/_vendor/html5lib/treebuilders/base.py | 417 +++ bleach/_vendor/html5lib/treebuilders/dom.py | 236 ++ bleach/_vendor/html5lib/treebuilders/etree.py | 340 ++ .../html5lib/treebuilders/etree_lxml.py | 366 ++ .../_vendor/html5lib/treewalkers/__init__.py | 154 + bleach/_vendor/html5lib/treewalkers/base.py | 252 ++ bleach/_vendor/html5lib/treewalkers/dom.py | 43 + bleach/_vendor/html5lib/treewalkers/etree.py | 130 + .../html5lib/treewalkers/etree_lxml.py | 213 ++ bleach/_vendor/html5lib/treewalkers/genshi.py | 69 + bleach/_vendor/pip_install_vendor.sh | 4 + bleach/_vendor/vendor.txt | 1 + bleach/linkifier.py | 8 +- bleach/sanitizer.py | 20 +- setup.cfg | 4 + setup.py | 5 +- tests/test_clean.py | 2 +- tests/test_linkify.py | 5 +- tox.ini | 9 +- 57 files changed, 14336 insertions(+), 50 deletions(-) create mode 100644 bleach/_vendor/README.rst create mode 100644 bleach/_vendor/__init__.py create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/INSTALLER create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/LICENSE.txt create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/METADATA create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/RECORD create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/WHEEL create mode 100644 
bleach/_vendor/html5lib-1.0.1.dist-info/metadata.json create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/top_level.txt create mode 100644 bleach/_vendor/html5lib/__init__.py create mode 100644 bleach/_vendor/html5lib/_ihatexml.py create mode 100644 bleach/_vendor/html5lib/_inputstream.py create mode 100644 bleach/_vendor/html5lib/_tokenizer.py create mode 100644 bleach/_vendor/html5lib/_trie/__init__.py create mode 100644 bleach/_vendor/html5lib/_trie/_base.py create mode 100644 bleach/_vendor/html5lib/_trie/datrie.py create mode 100644 bleach/_vendor/html5lib/_trie/py.py create mode 100644 bleach/_vendor/html5lib/_utils.py create mode 100644 bleach/_vendor/html5lib/constants.py create mode 100644 bleach/_vendor/html5lib/filters/__init__.py create mode 100644 bleach/_vendor/html5lib/filters/alphabeticalattributes.py create mode 100644 bleach/_vendor/html5lib/filters/base.py create mode 100644 bleach/_vendor/html5lib/filters/inject_meta_charset.py create mode 100644 bleach/_vendor/html5lib/filters/lint.py create mode 100644 bleach/_vendor/html5lib/filters/optionaltags.py create mode 100644 bleach/_vendor/html5lib/filters/sanitizer.py create mode 100644 bleach/_vendor/html5lib/filters/whitespace.py create mode 100644 bleach/_vendor/html5lib/html5parser.py create mode 100644 bleach/_vendor/html5lib/serializer.py create mode 100644 bleach/_vendor/html5lib/treeadapters/__init__.py create mode 100644 bleach/_vendor/html5lib/treeadapters/genshi.py create mode 100644 bleach/_vendor/html5lib/treeadapters/sax.py create mode 100644 bleach/_vendor/html5lib/treebuilders/__init__.py create mode 100644 bleach/_vendor/html5lib/treebuilders/base.py create mode 100644 bleach/_vendor/html5lib/treebuilders/dom.py create mode 100644 bleach/_vendor/html5lib/treebuilders/etree.py create mode 100644 bleach/_vendor/html5lib/treebuilders/etree_lxml.py create mode 100644 bleach/_vendor/html5lib/treewalkers/__init__.py create mode 100644 bleach/_vendor/html5lib/treewalkers/base.py 
create mode 100644 bleach/_vendor/html5lib/treewalkers/dom.py create mode 100644 bleach/_vendor/html5lib/treewalkers/etree.py create mode 100644 bleach/_vendor/html5lib/treewalkers/etree_lxml.py create mode 100644 bleach/_vendor/html5lib/treewalkers/genshi.py create mode 100755 bleach/_vendor/pip_install_vendor.sh create mode 100644 bleach/_vendor/vendor.txt diff --git a/.gitignore b/.gitignore index 26bbdf8e..c4abbd13 100644 --- a/.gitignore +++ b/.gitignore @@ -4,10 +4,14 @@ pip-log.txt .coverage dist *.egg-info -.noseids build .tox docs/_build/ .cache/ .eggs/ .*env*/ +.pytest_cache/ +.python-version +*~ +*.swp +__pycache__ diff --git a/CHANGES b/CHANGES index ddd3e0a2..423c1ecb 100644 --- a/CHANGES +++ b/CHANGES @@ -14,7 +14,8 @@ None **Features** -None +* No longer depends on html5lib. html5lib==1.0.1 was vendored into Bleach. + (#386) **Bug fixes** diff --git a/MANIFEST.in b/MANIFEST.in index 5a0f3385..2a85593e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -10,8 +10,7 @@ include README.rst include docs/conf.py include docs/Makefile +recursive-include bleach *.py *.json *.rst *.sh *.txt INSTALLER METADATA RECORD WHEEL recursive-include docs *.rst - recursive-include tests *.py *.test - recursive-include tests_website *.html *.py *.rst diff --git a/bleach/__init__.py b/bleach/__init__.py index 367fbf42..f953fc51 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals -import warnings from pkg_resources import parse_version from bleach.linkifier import ( @@ -18,20 +17,6 @@ ) -import html5lib -try: - _html5lib_version = html5lib.__version__.split('.') - if len(_html5lib_version) < 2: - _html5lib_version = _html5lib_version + ['0'] -except Exception: - _h5ml5lib_version = ['unknown', 'unknown'] - - -# Bleach 3.0.0 won't support html5lib-python < 1.0.0. 
-if _html5lib_version < ['1', '0'] or 'b' in _html5lib_version[1]: - warnings.warn('Support for html5lib-python < 1.0.0 is deprecated.', DeprecationWarning) - - # yyyymmdd __releasedate__ = '' # x.y.z or x.y.z.dev0 -- semver diff --git a/bleach/_vendor/README.rst b/bleach/_vendor/README.rst new file mode 100644 index 00000000..41c1d13e --- /dev/null +++ b/bleach/_vendor/README.rst @@ -0,0 +1,21 @@ +======================= +Vendored library policy +======================= + +To simplify Bleach development, we're now vendoring certain libraries that +we use. + +Vendored libraries must follow these rules: + +1. Vendored libraries must be pure Python--no compiling. +2. Source code for the libary is included in this directory. +3. License must be included in this repo and in the Bleach distribution. +4. Requirements of the library become requirements of Bleach. +5. No modifications to the library may be made. + +Way to vendor a library or update a version: + +1. Update ``vendor.txt`` with the library and version. +2. Remove old files and directories. +3. Run ``pip_install_vendor.sh`` and check everything it produced in including + the ``.dist-info`` directory and contents. diff --git a/bleach/_vendor/__init__.py b/bleach/_vendor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst b/bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst new file mode 100644 index 00000000..c05f8c00 --- /dev/null +++ b/bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst @@ -0,0 +1,489 @@ +html5lib +======== + +.. image:: https://travis-ci.org/html5lib/html5lib-python.png?branch=master + :target: https://travis-ci.org/html5lib/html5lib-python + +html5lib is a pure-python library for parsing HTML. It is designed to +conform to the WHATWG HTML specification, as is implemented by all major +web browsers. + + +Usage +----- + +Simple usage follows this pattern: + +.. 
code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + document = html5lib.parse(f) + +or: + +.. code-block:: python + + import html5lib + document = html5lib.parse("

Hello World!") + +By default, the ``document`` will be an ``xml.etree`` element instance. +Whenever possible, html5lib chooses the accelerated ``ElementTree`` +implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x). + +Two other tree types are supported: ``xml.dom.minidom`` and +``lxml.etree``. To use an alternative format, specify the name of +a treebuilder: + +.. code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + lxml_etree_document = html5lib.parse(f, treebuilder="lxml") + +When using with ``urllib2`` (Python 2), the charset from HTTP should be +pass into html5lib as follows: + +.. code-block:: python + + from contextlib import closing + from urllib2 import urlopen + import html5lib + + with closing(urlopen("http://example.com/")) as f: + document = html5lib.parse(f, transport_encoding=f.info().getparam("charset")) + +When using with ``urllib.request`` (Python 3), the charset from HTTP +should be pass into html5lib as follows: + +.. code-block:: python + + from urllib.request import urlopen + import html5lib + + with urlopen("http://example.com/") as f: + document = html5lib.parse(f, transport_encoding=f.info().get_content_charset()) + +To have more control over the parser, create a parser object explicitly. +For instance, to make the parser raise exceptions on parse errors, use: + +.. code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + parser = html5lib.HTMLParser(strict=True) + document = parser.parse(f) + +When you're instantiating parser objects explicitly, pass a treebuilder +class as the ``tree`` keyword argument to use an alternative document +format: + +.. code-block:: python + + import html5lib + parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom")) + minidom_document = parser.parse("

Hello World!") + +More documentation is available at https://html5lib.readthedocs.io/. + + +Installation +------------ + +html5lib works on CPython 2.7+, CPython 3.3+ and PyPy. To install it, +use: + +.. code-block:: bash + + $ pip install html5lib + + +Optional Dependencies +--------------------- + +The following third-party libraries may be used for additional +functionality: + +- ``datrie`` can be used under CPython to improve parsing performance + (though in almost all cases the improvement is marginal); + +- ``lxml`` is supported as a tree format (for both building and + walking) under CPython (but *not* PyPy where it is known to cause + segfaults); + +- ``genshi`` has a treewalker (but not builder); and + +- ``chardet`` can be used as a fallback when character encoding cannot + be determined. + + +Bugs +---- + +Please report any bugs on the `issue tracker +`_. + + +Tests +----- + +Unit tests require the ``pytest`` and ``mock`` libraries and can be +run using the ``py.test`` command in the root directory. + +Test data are contained in a separate `html5lib-tests +`_ repository and included +as a submodule, thus for git checkouts they must be initialized:: + + $ git submodule init + $ git submodule update + +If you have all compatible Python implementations available on your +system, you can run tests on all of them using the ``tox`` utility, +which can be found on PyPI. + + +Questions? +---------- + +There's a mailing list available for support on Google Groups, +`html5lib-discuss `_, +though you may get a quicker response asking on IRC in `#whatwg on +irc.freenode.net `_. + +Change Log +---------- + +1.0.1 +~~~~~ + +Released on December 7, 2017 + +Breaking changes: + +* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!) +* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!) + +Features: + +* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most, + Will Kahn-Greene!) +* Add iframe seamless boolean attribute. 
(Thank you, Ritwik Gupta!) +* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!) +* Support Python 3.6. (#333) (Thank you, Jon Dufresne!) +* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!) +* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon + Dufresne, John Vandenberg, Geoffrey Sneddon, Will Kahn-Greene!) +* Semver-compliant version number. + +Bug fixes: + +* Add support for setuptools < 18.5 to support environment markers. (Thank you, + John Vandenberg!) +* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!) +* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank + you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!) +* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will + Kahn-Greene!) +* Include license file in generated wheel package. (#350) (Thank you, Jon + Dufresne!) +* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!) +* Allow uppercase hex chararcters in CSS colour check. (#377) (Thank you, + Komal Dembla, Hugo!) + + +1.0 +~~~ + +Released and unreleased on December 7, 2017. Badly packaged release. + + +0.999999999/1.0b10 +~~~~~~~~~~~~~~~~~~ + +Released on July 15, 2016 + +* Fix attribute order going to the tree builder to be document order + instead of reverse document order(!). + + +0.99999999/1.0b9 +~~~~~~~~~~~~~~~~ + +Released on July 14, 2016 + +* **Added ordereddict as a mandatory dependency on Python 2.6.** + +* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all`` + extras that will do the right thing based on the specific + interpreter implementation. + +* Now requires the ``mock`` package for the testsuite. + +* Cease supporting DATrie under PyPy. + +* **Remove PullDOM support, as this hasn't ever been properly + tested, doesn't entirely work, and as far as I can tell is + completely unused by anyone.** + +* Move testsuite to ``py.test``. 
+ +* **Fix #124: move to webencodings for decoding the input byte stream; + this makes html5lib compliant with the Encoding Standard, and + introduces a required dependency on webencodings.** + +* **Cease supporting Python 3.2 (in both CPython and PyPy forms).** + +* **Fix comments containing double-dash with lxml 3.5 and above.** + +* **Use scripting disabled by default (as we don't implement + scripting).** + +* **Fix #11, avoiding the XSS bug potentially caused by serializer + allowing attribute values to be escaped out of in old browser versions, + changing the quote_attr_values option on serializer to take one of + three values, "always" (the old True value), "legacy" (the new option, + and the new default), and "spec" (the old False value, and the old + default).** + +* **Fix #72 by rewriting the sanitizer to apply only to treewalkers + (instead of the tokenizer); as such, this will require amending all + callers of it to use it via the treewalker API.** + +* **Drop support of charade, now that chardet is supported once more.** + +* **Replace the charset keyword argument on parse and related methods + with a set of keyword arguments: override_encoding, transport_encoding, + same_origin_parent_encoding, likely_encoding, and default_encoding.** + +* **Move filters._base, treebuilder._base, and treewalkers._base to .base + to clarify their status as public.** + +* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the + sanitizer.htmlsanitizer module and move that to sanitizer. This means + anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no + code changes.** + +* **Rename treewalkers.lxmletree to .etree_lxml and + treewalkers.genshistream to .genshi to have a consistent API.** + +* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer, + utils) to be underscore prefixed to clarify their status as private. 
+ + +0.9999999/1.0b8 +~~~~~~~~~~~~~~~ + +Released on September 10, 2015 + +* Fix #195: fix the sanitizer to drop broken URLs (it threw an + exception between 0.9999 and 0.999999). + + +0.999999/1.0b7 +~~~~~~~~~~~~~~ + +Released on July 7, 2015 + +* Fix #189: fix the sanitizer to allow relative URLs again (as it did + prior to 0.9999/1.0b5). + + +0.99999/1.0b6 +~~~~~~~~~~~~~ + +Released on April 30, 2015 + +* Fix #188: fix the sanitizer to not throw an exception when sanitizing + bogus data URLs. + + +0.9999/1.0b5 +~~~~~~~~~~~~ + +Released on April 29, 2015 + +* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how + this sounds, this has no known security implications. No known version + of IE (5.5 to current), Firefox (3 to current), Safari (6 to current), + Chrome (1 to current), or Opera (12 to current) will run any script + provided in these attributes. + +* Pass error message to the ParseError exception in strict parsing mode. + +* Allow data URIs in the sanitizer, with a whitelist of content-types. + +* Add support for Python implementations that don't support lone + surrogates (read: Jython). Fixes #2. + +* Remove localization of error messages. This functionality was totally + unused (and untested that everything was localizable), so we may as + well follow numerous browsers in not supporting translating technical + strings. + +* Expose treewalkers.pprint as a public API. + +* Add a documentEncoding property to HTML5Parser, fix #121. + + +0.999 +~~~~~ + +Released on December 23, 2013 + +* Fix #127: add work-around for CPython issue #20007: .read(0) on + http.client.HTTPResponse drops the rest of the content. + +* Fix #115: lxml treewalker can now deal with fragments containing, at + their root level, text nodes with non-ASCII characters on Python 2. 
+ + +0.99 +~~~~ + +Released on September 10, 2013 + +* No library changes from 1.0b3; released as 0.99 as pip has changed + behaviour from 1.4 to avoid installing pre-release versions per + PEP 440. + + +1.0b3 +~~~~~ + +Released on July 24, 2013 + +* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any + implementation using it should be moved to + ``NonRecursiveTreeWalker``, as everything bundled with html5lib has + for years. + +* Fix #67 so that ``BufferedStream`` to correctly returns a bytes + object, thereby fixing any case where html5lib is passed a + non-seekable RawIOBase-like object. + + +1.0b2 +~~~~~ + +Released on June 27, 2013 + +* Removed reordering of attributes within the serializer. There is now + an ``alphabetical_attributes`` option which preserves the previous + behaviour through a new filter. This allows attribute order to be + preserved through html5lib if the tree builder preserves order. + +* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by + ``treeadapters.sax.to_sax`` which is generic and supports any + treewalker; it also resolves all known bugs with ``dom2sax``. + +* Fix treewalker assertions on hitting bytes strings on + Python 2. Previous to 1.0b1, treewalkers coped with mixed + bytes/unicode data on Python 2; this reintroduces this prior + behaviour on Python 2. Behaviour is unchanged on Python 3. + + +1.0b1 +~~~~~ + +Released on May 17, 2013 + +* Implementation updated to implement the `HTML specification + `_ as of 5th May + 2013 (`SVN `_ revision r7867). + +* Python 3.2+ supported in a single codebase using the ``six`` library. + +* Removed support for Python 2.5 and older. + +* Removed the deprecated Beautiful Soup 3 treebuilder. + ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that + since it doesn't support namespaces, foreign content like SVG and + MathML is parsed incorrectly. + +* Removed ``simpletree`` from the package. 
The default tree builder is + now ``etree`` (using the ``xml.etree.cElementTree`` implementation if + available, and ``xml.etree.ElementTree`` otherwise). + +* Removed the ``XHTMLSerializer`` as it never actually guaranteed its + output was well-formed XML, and hence provided little of use. + +* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no + longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will + return the default DOM treebuilder, which uses ``xml.dom.minidom``. + +* Optional heuristic character encoding detection now based on + ``charade`` for Python 2.6 - 3.3 compatibility. + +* Optional ``Genshi`` treewalker support fixed. + +* Many bugfixes, including: + + * #33: null in attribute value breaks XML AttValue; + + * #4: nested, indirect descendant,