From d0b3af80c723a0407e6dad8e6b2070829aef8f07 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 18 May 2016 18:03:23 +0100 Subject: [PATCH 001/314] Limit html5lib version before sanitizer API changes get released. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ddd95719..1c716ff4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ ordereddict six -html5lib>=0.999 +html5lib>=0.999,<0.99999999 # Requirements to run the test suite: nose From 6ef04917d14b47005ee0ebdae68fec3c144d4577 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 May 2016 09:26:48 -0400 Subject: [PATCH 002/314] Update CHANGES re: html5lib limitation --- CHANGES | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGES b/CHANGES index 4588b80a..8784e83e 100644 --- a/CHANGES +++ b/CHANGES @@ -17,6 +17,15 @@ Version 1.5? (in progress) allowed protocols. Thank you, Andreas Malecki! #149 +Version 1.4.3 (May 23rd, 2016) +------------------------------ + +**Changes** + +- Limit to html5lib >=0.999<0.99999999 because of impending change to + sanitizer api. 
#195 + + Version 1.4.2 (September 11, 2015) ---------------------------------- From 532463ef194be11f2b73af78a65b0016b68000c1 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 May 2016 09:27:52 -0400 Subject: [PATCH 003/314] Add html5lib restriction to setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6562aa97..8c37dcf1 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ install_requires = [ 'six', - 'html5lib>=0.999', + 'html5lib>=0.999,<0.99999999', ] try: From b652ef4f72f888fab5ee2416293c0f8c18cbfacd Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 May 2016 09:28:56 -0400 Subject: [PATCH 004/314] Update for 1.4.3 release --- CONTRIBUTORS | 1 + bleach/__init__.py | 2 +- docs/conf.py | 2 +- setup.py | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 3eb6c7f8..d93749c5 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -24,6 +24,7 @@ Contributors: - Chris Beaven - Erik Rose - Gaurav Dadhania +- Geoffrey Sneddon - Jaime Irurzun - Jeff Balogh - Lee, Cheon-il diff --git a/bleach/__init__.py b/bleach/__init__.py index aec2d340..0a574a3e 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -13,7 +13,7 @@ from .sanitizer import BleachSanitizer -VERSION = (1, 4, 2) +VERSION = (1, 4, 3) __version__ = '.'.join([str(n) for n in VERSION]) __all__ = ['clean', 'linkify'] diff --git a/docs/conf.py b/docs/conf.py index c1e953fd..3ea7bd10 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,7 @@ # The short X.Y version. version = '1.4' # The full version, including alpha/beta/rc tags. -release = '1.4.1' +release = '1.4.3' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/setup.py b/setup.py index 8c37dcf1..9283a803 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def get_long_desc(): setup( name='bleach', - version='1.4.2', + version='1.4.3', description='An easy whitelist-based HTML-sanitizing tool.', long_description=get_long_desc(), maintainer='Jannis Leidel, Will Kahn-Greene', From e24095c8f3c2f00fea7dda9c43d5dd6ee200bf2f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 May 2016 09:40:21 -0400 Subject: [PATCH 005/314] Add testing for python 3.5 --- .travis.yml | 1 + MANIFEST.in | 1 + requirements.txt | 3 +++ setup.py | 1 + tox.ini | 2 +- 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 193f70a0..a0e93b16 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ python: - "3.2" - "3.3" - "3.4" + - "3.5" - "pypy" install: - "pip install -r requirements.txt" diff --git a/MANIFEST.in b/MANIFEST.in index 9d5d250d..870f669c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ +include CHANGES include LICENSE include README.rst diff --git a/requirements.txt b/requirements.txt index 1c716ff4..b6d538e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,6 @@ tox # Requirements for building docs Sphinx + +# Requirements for updating package +twine diff --git a/setup.py b/setup.py index 9283a803..da408f0a 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ def get_long_desc(): 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', 'Topic :: Software Development :: Libraries :: Python Modules', ] ) diff --git a/tox.ini b/tox.ini index 5d4fe518..704c3b51 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py32, py33, py34, pypy +envlist = py26, py27, py32, py33, py34, py35, pypy [testenv] commands = nosetests {posargs:-v} From 55d5ffbeb72cab230af17ea51e2577285ae59037 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 May 2016 09:40:46 -0400 Subject: [PATCH 006/314] Add dev docs This walks through the release process which will make it easier to remember how to do a release in the future. --- docs/dev.rst | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 67 insertions(+) create mode 100644 docs/dev.rst diff --git a/docs/dev.rst b/docs/dev.rst new file mode 100644 index 00000000..7338953a --- /dev/null +++ b/docs/dev.rst @@ -0,0 +1,66 @@ +================== +Bleach development +================== + +Docs +==== + +Docs are in ``docs/``. We use Sphinx. Docs are pushed to readthedocs +via a GitHub webhook. + + +Testing +======= + +Run:: + + $ tox + +That'll run bleach tests in all the supported Python environments. Note +that you need the necessary Python binaries for them all to be tested. + +Tests are run in Travis CI via a GitHub webhook. + + +Release process +=============== + +1. Checkout master tip. + +2. Check to make sure ``setup.py`` and ``requirements.txt``. + +3. Update version numbers in: + + * ``setup.py`` + * ``bleach/__init__.py`` + * ``docs/confg.py`` + + Set the version to something like ``0.4``. + +4. Update ``CONTRIBUTORS``, ``CHANGES`` and ``MANIFEST.in``. + +5. Verify correctness. + + 1. Run tests with tox + 2. Build the docs + 3. Verify everything works + +6. Push everything to GitHub. This will cause Travis to run the tests. + +7. After Travis is happy, tag the release:: + + $ git tag -a v0.4 + + Copy the details from ``CHANGES`` into the tag comment. + +8. Push the new tag:: + + $ git push --tags official master + +9. Update PyPI:: + + $ rm -rf dist + $ python setup.py sdist bdist_wheel + $ twine upload sdist/* + +10. 
Blog posts, twitter, update topic in ``#bleach``, etc. diff --git a/docs/index.rst b/docs/index.rst index 1d8c94b9..217dc159 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,6 +10,7 @@ Contents clean linkify goals + dev changes From 26aac508617a2ad298479f2cd4cffe1b788d92f0 Mon Sep 17 00:00:00 2001 From: Tim Dumol Date: Fri, 27 May 2016 20:45:54 +0800 Subject: [PATCH 007/314] Unify version information to rely on a single source. --- bleach/__init__.py | 5 +---- bleach/version.py | 6 ++++++ docs/conf.py | 6 ++++-- setup.py | 21 ++++++++++++++++++++- 4 files changed, 31 insertions(+), 7 deletions(-) create mode 100644 bleach/version.py diff --git a/bleach/__init__.py b/bleach/__init__.py index 0a574a3e..3092cb7f 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -11,10 +11,7 @@ from . import callbacks as linkify_callbacks from .encoding import force_unicode from .sanitizer import BleachSanitizer - - -VERSION = (1, 4, 3) -__version__ = '.'.join([str(n) for n in VERSION]) +from .version import __version__, VERSION # flake8: noqa __all__ = ['clean', 'linkify'] diff --git a/bleach/version.py b/bleach/version.py new file mode 100644 index 00000000..134e4857 --- /dev/null +++ b/bleach/version.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +VERSION = (1, 4, 3) +__version__ = '.'.join([str(n) for n in VERSION]) diff --git a/docs/conf.py b/docs/conf.py index 3ea7bd10..88fe431c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,6 +13,8 @@ import sys, os +from bleach import __version__, VERSION + # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. @@ -49,9 +51,9 @@ # built documents. # # The short X.Y version. -version = '1.4' +version = '.'.join([str(n) for n in VERSION[:2]]) # The full version, including alpha/beta/rc tags. 
-release = '1.4.3' +release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index da408f0a..d84e407f 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,7 @@ +import re + from setuptools import setup, find_packages +from distutils.util import convert_path install_requires = [ 'six', @@ -20,10 +23,26 @@ def get_long_desc(): desc += open('CHANGES').read() return desc +# foolproof way of single-sourcing version as per +# http://stackoverflow.com/a/24517154/112943 +# Here we use re.search instead of exec to avoid any +# possibility of side effects in version.py +version_path = convert_path('bleach/version.py') +with open(version_path) as version_file: + for line in version_file: + if line.startswith('VERSION = '): + match = re.search(r'[(](\d+), (\d+), (\d+)[)]$', line) + __version__ = '{0!s}.{1!s}.{2!s}'.format( + match.group(1), + match.group(2), + match.group(3) + ) + break + setup( name='bleach', - version='1.4.3', + version=__version__, description='An easy whitelist-based HTML-sanitizing tool.', long_description=get_long_desc(), maintainer='Jannis Leidel, Will Kahn-Greene', From b6da6ca9719dbb6aba0180686a593f5cc6a4c5af Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 14 Jun 2016 15:26:44 +0100 Subject: [PATCH 008/314] Move version acquiring to a function --- setup.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index d84e407f..39d9696e 100644 --- a/setup.py +++ b/setup.py @@ -23,26 +23,23 @@ def get_long_desc(): desc += open('CHANGES').read() return desc -# foolproof way of single-sourcing version as per -# http://stackoverflow.com/a/24517154/112943 -# Here we use re.search instead of exec to avoid any -# possibility of side effects in version.py -version_path = convert_path('bleach/version.py') -with open(version_path) as version_file: - for line in version_file: - if 
line.startswith('VERSION = '): - match = re.search(r'[(](\d+), (\d+), (\d+)[)]$', line) - __version__ = '{0!s}.{1!s}.{2!s}'.format( - match.group(1), - match.group(2), - match.group(3) - ) - break + +def get_version(): + version_path = convert_path('bleach/version.py') + with open(version_path) as version_file: + for line in version_file: + if line.startswith('VERSION = '): + match = re.search(r'[(](\d+), (\d+), (\d+)[)]$', line) + return '{0!s}.{1!s}.{2!s}'.format( + match.group(1), + match.group(2), + match.group(3) + ) setup( name='bleach', - version=__version__, + version=get_version(), description='An easy whitelist-based HTML-sanitizing tool.', long_description=get_long_desc(), maintainer='Jannis Leidel, Will Kahn-Greene', From 2235b8fcadc8abef3a2845bb0ce67206982f3489 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 14 Jun 2016 15:26:54 +0100 Subject: [PATCH 009/314] Add Tim to CONTRIBUTORS list --- CONTRIBUTORS | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index d93749c5..a587e807 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -39,6 +39,7 @@ Contributors: - Ricky Rosario - Ryan Niemeyer - Sébastien Fievet +- Tim Dumol - Timothy Fitz - Vitaly Volkov - mdxs From 592972bd12962bed9ca1f5af551199828a678be1 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 14 Jun 2016 16:10:51 +0100 Subject: [PATCH 010/314] Fix linkify edge case with ending ) It's common to put urls in parentheses. Sometimes the url_re will pick up end parentheses that are part of the parenthetical, but shouldn't be part of the url. This fix alleviates that somewhat by checking to see if the url has a ( and if not, stripping any ) from the end. This assumes that urls won't end in ) without also having a ( in them. This is based on a fix from Istvan Albert. 
Fixes #190 --- bleach/__init__.py | 8 ++++++++ bleach/tests/test_links.py | 15 ++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 3092cb7f..c55bc6b7 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -344,6 +344,14 @@ def link_repl(match): if url.startswith('('): _wrapping = strip_wrapping_parentheses(url) url, open_brackets, close_brackets = _wrapping + if url.endswith(')') and '(' not in url: + # This is a clumsy handling for the case where we have something + # like (foo http://example.com) and the ) gets picked up by the + # url_re but we don't want it part of the link. + new_url = url.rstrip(')') + close_brackets += len(url) - len(new_url) + url = new_url + end = '' m = re.search(punct_re, url) if m: diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py index 62da8d19..97226c1c 100644 --- a/bleach/tests/test_links.py +++ b/bleach/tests/test_links.py @@ -360,11 +360,16 @@ def test_wrapping_parentheses(): tests = ( ('(example.com)', ('(', 'example.com', 'example.com', ')')), ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')), - ('(example.com/foo)', ('(', 'example.com/foo', - 'example.com/foo', ')')), - ('(((example.com/))))', ('(((', 'example.com/)', - 'example.com/)', ')))')), - ('example.com/))', ('', 'example.com/))', 'example.com/))', '')), + ('(example.com/foo)', + ('(', 'example.com/foo', 'example.com/foo', ')')), + ('(((example.com/))))', + ('(((', 'example.com/', 'example.com/', '))))')), + ('example.com/))', + ('', 'example.com/', 'example.com/', '))')), + ('(foo http://example.com/)', + ('(foo ', 'example.com/', 'http://example.com/', ')')), + ('(foo http://example.com)', + ('(foo ', 'example.com', 'http://example.com', ')')), ('http://en.wikipedia.org/wiki/Test_(assessment)', ('', 'en.wikipedia.org/wiki/Test_(assessment)', 'http://en.wikipedia.org/wiki/Test_(assessment)', '')), From f635db1718db681afa712da98e0ca76e3a70d298 Mon Sep 17 00:00:00 
2001 From: Dan Gayle Date: Thu, 3 Dec 2015 15:55:32 -0800 Subject: [PATCH 011/314] Fix logging when there's no handler Added NullHandler to logging, to prevent "No handlers could be found for logger "bleach"" warnings in applications that haven't set up logging properly. Fixes #182 --- bleach/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 3092cb7f..5c53d3a4 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -2,6 +2,12 @@ from __future__ import unicode_literals import logging +try: # Python 2.7+ + from logging import NullHandler +except ImportError: + class NullHandler(logging.Handler): + def emit(self, record): + pass import re import html5lib @@ -15,7 +21,7 @@ __all__ = ['clean', 'linkify'] -log = logging.getLogger('bleach') +log = logging.getLogger(__name__).addHandler(NullHandler()) ALLOWED_TAGS = [ 'a', From 86af2305f2f4e04ca9f62a00cae7cfba782bc20e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 14 Jun 2016 16:34:25 +0100 Subject: [PATCH 012/314] Fix log so it's not None Fixes #182 --- bleach/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 5c53d3a4..53217bac 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -21,7 +21,8 @@ def emit(self, record): __all__ = ['clean', 'linkify'] -log = logging.getLogger(__name__).addHandler(NullHandler()) +log = logging.getLogger(__name__) +log.addHandler(NullHandler()) ALLOWED_TAGS = [ 'a', From 9bd8f721f2eabc5a014f33e7053e03bbba98c736 Mon Sep 17 00:00:00 2001 From: Istvan Albert Date: Tue, 14 Jun 2016 17:10:39 +0100 Subject: [PATCH 013/314] Children of
 tags should not be linkified when
 skip_pre=True

Fixes #150
---
 bleach/__init__.py         | 2 +-
 bleach/tests/test_links.py | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/bleach/__init__.py b/bleach/__init__.py
index 3092cb7f..ac163d12 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -315,7 +315,7 @@ def linkify_nodes(tree, parse_text=True):
                 if node.tag == ETREE_TAG('pre') and skip_pre:
                     linkify_nodes(node, False)
                 elif not (node in _seen):
-                    linkify_nodes(node, True)
+                    linkify_nodes(node, parse_text)
 
             current_child += 1
 
diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py
index 62da8d19..2958f5e6 100644
--- a/bleach/tests/test_links.py
+++ b/bleach/tests/test_links.py
@@ -314,6 +314,13 @@ def test_skip_pre():
     eq_(nofollowed, linkify(already_linked))
     eq_(nofollowed, linkify(already_linked, skip_pre=True))
 
+    eq_(
+        linkify('
http://example.com
http://example.com', + skip_pre=True), + ('
http://example.com
' + 'http://example.com') + ) + def test_libgl(): """libgl.so.1 should not be linkified.""" From c28b9e37ed659a588e49bf7bf1881ec4e6d7bc25 Mon Sep 17 00:00:00 2001 From: Jannis Leidel Date: Tue, 21 Jun 2016 16:48:21 +0200 Subject: [PATCH 014/314] Auto-release to PyPI on tag. Fix #209. This also makes wheel files be universal (Python 2 & 3). --- .travis.yml | 35 ++++++++++++++++++++++++----------- setup.cfg | 3 +++ 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index a0e93b16..9a1f0b43 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,28 @@ sudo: false language: python +cache: + directories: + - "~/.cache/pip" python: - - "2.6" - - "2.7" - - "3.2" - - "3.3" - - "3.4" - - "3.5" - - "pypy" -install: - - "pip install -r requirements.txt" +- "2.6" +- "2.7" +- "3.2" +- "3.3" +- "3.4" +- "3.5" +- "pypy" +install: +- pip install -r requirements.txt script: - - nosetests - - flake8 bleach/ +- nosetests +- flake8 bleach/ +deploy: + provider: pypi + user: jezdez + distributions: sdist bdist_wheel + password: + secure: TTLpnNBAmRBPe4qITwtM6MRXw3CvGpflnkG6V97oKYL1RJhDXmxIxxImkGyVoT2IR4Oy/jqEikWUCCC3aDoqDnIkkDVriTPmo5PGnS2WgvEmYdcaTIp+RXdKwKhpCVX8ITEuye0iCXYu28vDaySGjnxjlYAP4S0PGPUzh/tn4DY= + on: + tags: true + repo: mozilla/bleach + python: "2.7" diff --git a/setup.cfg b/setup.cfg index 81cd366c..38f6166d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,5 @@ [flake8] ignore = E731,W503 + +[wheel] +universal=1 From 7aebc95eb95224f9303762adfbbce70689ad4b81 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 16 Jun 2016 12:08:55 +0100 Subject: [PATCH 015/314] Change requirements to use py.test --- requirements.txt | 22 ++++++++++++---------- setup.py | 4 ---- tox.ini | 2 +- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/requirements.txt b/requirements.txt index b6d538e3..a45f7810 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,16 @@ -ordereddict -six -html5lib>=0.999,<0.99999999 +-e . 
+ordereddict==1.1 +six==1.10.0 +html5lib>=0.999,<0.99999999 + # Requirements to run the test suite: -nose -flake8 -tox - +pytest==2.9.2 +flake8==2.6.0 +tox==2.3.1 + # Requirements for building docs -Sphinx - +Sphinx==1.4.4 + # Requirements for updating package -twine +twine==1.6.5 diff --git a/setup.py b/setup.py index 39d9696e..26686efd 100644 --- a/setup.py +++ b/setup.py @@ -50,10 +50,6 @@ def get_version(): package_data={'': ['README.rst']}, zip_safe=False, install_requires=install_requires, - tests_require=[ - 'nose>=1.3', - ], - test_suite='nose.collector', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Web Environment', diff --git a/tox.ini b/tox.ini index 704c3b51..da138989 100644 --- a/tox.ini +++ b/tox.ini @@ -7,7 +7,7 @@ envlist = py26, py27, py32, py33, py34, py35, pypy [testenv] -commands = nosetests {posargs:-v} +commands = py.test {posargs:-v} deps = six html5lib==0.999 From 01f8cf62d9e70948d412d9dc48b0216d2edec216 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 16 Jun 2016 12:10:25 +0100 Subject: [PATCH 016/314] Move tests out of the code root --- {bleach/tests => tests}/__init__.py | 0 {bleach/tests => tests}/test_basics.py | 0 {bleach/tests => tests}/test_css.py | 0 {bleach/tests => tests}/test_links.py | 0 {bleach/tests => tests}/test_security.py | 0 {bleach/tests => tests}/test_unicode.py | 0 {bleach/tests => tests}/tools.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename {bleach/tests => tests}/__init__.py (100%) rename {bleach/tests => tests}/test_basics.py (100%) rename {bleach/tests => tests}/test_css.py (100%) rename {bleach/tests => tests}/test_links.py (100%) rename {bleach/tests => tests}/test_security.py (100%) rename {bleach/tests => tests}/test_unicode.py (100%) rename {bleach/tests => tests}/tools.py (100%) diff --git a/bleach/tests/__init__.py b/tests/__init__.py similarity index 100% rename from bleach/tests/__init__.py rename to tests/__init__.py diff --git 
a/bleach/tests/test_basics.py b/tests/test_basics.py similarity index 100% rename from bleach/tests/test_basics.py rename to tests/test_basics.py diff --git a/bleach/tests/test_css.py b/tests/test_css.py similarity index 100% rename from bleach/tests/test_css.py rename to tests/test_css.py diff --git a/bleach/tests/test_links.py b/tests/test_links.py similarity index 100% rename from bleach/tests/test_links.py rename to tests/test_links.py diff --git a/bleach/tests/test_security.py b/tests/test_security.py similarity index 100% rename from bleach/tests/test_security.py rename to tests/test_security.py diff --git a/bleach/tests/test_unicode.py b/tests/test_unicode.py similarity index 100% rename from bleach/tests/test_unicode.py rename to tests/test_unicode.py diff --git a/bleach/tests/tools.py b/tests/tools.py similarity index 100% rename from bleach/tests/tools.py rename to tests/tools.py From 04f49ff1f93ec6344479a226df5b02cf5bf5a715 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 11:03:32 -0400 Subject: [PATCH 017/314] Update dev requirements to latest --- requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index a45f7810..79aa6e61 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,12 +5,12 @@ six==1.10.0 html5lib>=0.999,<0.99999999 # Requirements to run the test suite: -pytest==2.9.2 -flake8==2.6.0 -tox==2.3.1 +pytest==3.0.3 +flake8==3.0.4 +tox==2.4.1 # Requirements for building docs -Sphinx==1.4.4 +Sphinx==1.4.8 # Requirements for updating package -twine==1.6.5 +twine==1.8.1 From 6c070a8a132f9de3b57d9293fa0aebdfa5ce3af7 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 11:10:34 -0400 Subject: [PATCH 018/314] Rewrite test_basics.py to work with py.test --- tests/__init__.py | 0 tests/test_basics.py | 165 +++++++++++++++++++++++++++---------------- 2 files changed, 103 insertions(+), 62 deletions(-) delete mode 100644 tests/__init__.py 
diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_basics.py b/tests/test_basics.py index 18fc2a6c..6fa6c22e 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -1,13 +1,11 @@ import six import html5lib -from nose.tools import eq_ import bleach -from bleach.tests.tools import in_ def test_empty(): - eq_('', bleach.clean('')) + assert bleach.clean('') == '' def test_nbsp(): @@ -16,48 +14,58 @@ def test_nbsp(): else: expected = six.u('\\xa0test string\\xa0') - eq_(expected, bleach.clean(' test string ')) + assert bleach.clean(' test string ') == expected def test_comments_only(): comment = '' open_comment = ''.format(open_comment), bleach.clean(open_comment, - strip_comments=False)) + assert bleach.clean(comment) == '' + assert bleach.clean(open_comment) == '' + assert bleach.clean(comment, strip_comments=False) == comment + assert ( + bleach.clean(open_comment, strip_comments=False) == + '{0!s}-->'.format(open_comment) + ) def test_with_comments(): html = 'Just text' - eq_('Just text', bleach.clean(html)) - eq_(html, bleach.clean(html, strip_comments=False)) + assert 'Just text', bleach.clean(html) == 'Just text' + assert bleach.clean(html, strip_comments=False) == html def test_no_html(): - eq_('no html string', bleach.clean('no html string')) + assert bleach.clean('no html string') == 'no html string' def test_allowed_html(): - eq_('an allowed tag', - bleach.clean('an allowed tag')) - eq_('another good tag', - bleach.clean('another good tag')) + assert ( + bleach.clean('an allowed tag') == + 'an allowed tag' + ) + assert ( + bleach.clean('another good tag') == + 'another good tag' + ) def test_bad_html(): - eq_('a fixed tag', - bleach.clean('a fixed tag')) + assert ( + bleach.clean('a fixed tag') == + 'a fixed tag' + ) def test_function_arguments(): TAGS = ['span', 'br'] ATTRS = {'span': ['style']} - eq_('a
test', + assert ( bleach.clean('a
test', - tags=TAGS, attributes=ATTRS)) + tags=TAGS, attributes=ATTRS) == + 'a
test' + ) def test_named_arguments(): @@ -65,73 +73,104 @@ def test_named_arguments(): s = ('xx.com', 'xx.com') - eq_('xx.com', bleach.clean(s[0])) - in_(s, bleach.clean(s[0], attributes=ATTRS)) + assert bleach.clean(s[0]) == 'xx.com' + # FIXME: This might not be needed if attribute order is stable now. + assert bleach.clean(s[0], attributes=ATTRS) in s def test_disallowed_html(): - eq_('a <script>safe()</script> test', - bleach.clean('a test')) - eq_('a <style>body{}</style> test', - bleach.clean('a test')) + assert ( + bleach.clean('a test') == + 'a <script>safe()</script> test' + ) + assert ( + bleach.clean('a test') == + 'a <style>body{}</style> test' + ) def test_bad_href(): - eq_('no link', - bleach.clean('no link')) + assert ( + bleach.clean('no link') == + 'no link' + ) def test_bare_entities(): - eq_('an & entity', bleach.clean('an & entity')) - eq_('an < entity', bleach.clean('an < entity')) - eq_('tag < and entity', - bleach.clean('tag < and entity')) - eq_('&', bleach.clean('&')) + assert ( + bleach.clean('an & entity') == + 'an & entity' + ) + assert ( + bleach.clean('an < entity') == + 'an < entity' + ) + + assert ( + bleach.clean('tag < and entity') == + 'tag < and entity' + ) + + assert ( + bleach.clean('&') == + '&' + ) def test_escaped_entities(): s = '<em>strong</em>' - eq_(s, bleach.clean(s)) + assert bleach.clean(s) == s def test_serializer(): s = '
' - eq_(s, bleach.clean(s, tags=['table'])) - eq_('test
', bleach.linkify('test
')) - eq_('

test

', bleach.clean('

test

', tags=['p'])) + assert bleach.clean(s, tags=['table']) == s + assert bleach.linkify('test
') == 'test
' + assert bleach.clean('

test

', tags=['p']) == '

test

' def test_no_href_links(): s = 'x' - eq_(s, bleach.linkify(s)) + assert bleach.linkify(s) == s def test_weird_strings(): s = 'with
html tags', - bleach.clean('a test with html tags', strip=True)) - eq_('a test with html tags', - bleach.clean('a test with ' - 'html tags', strip=True)) + assert ( + bleach.clean('a test with html tags', strip=True) == + 'a test with html tags' + ) + assert ( + bleach.clean('a test with html tags', strip=True) == + 'a test with html tags' + ) s = '

link text

' - eq_('

link text

', bleach.clean(s, tags=['p'], strip=True)) + assert ( + bleach.clean(s, tags=['p'], strip=True) == + '

link text

' + ) s = '

multiply nested text

' - eq_('

multiply nested text

', bleach.clean(s, tags=['p'], strip=True)) + assert ( + bleach.clean(s, tags=['p'], strip=True) == + '

multiply nested text

' + ) s = ('

' '

') - eq_('

', - bleach.clean(s, tags=['p', 'a'], strip=True)) + assert ( + bleach.clean(s, tags=['p', 'a'], strip=True) == + '

' + ) def test_allowed_styles(): @@ -139,10 +178,12 @@ def test_allowed_styles(): STYLE = ['color'] blank = '' s = '' - eq_(blank, bleach.clean('', attributes=ATTR)) - eq_(s, bleach.clean(s, attributes=ATTR, styles=STYLE)) - eq_(s, bleach.clean('', - attributes=ATTR, styles=STYLE)) + assert bleach.clean('', attributes=ATTR) == blank + assert bleach.clean(s, attributes=ATTR, styles=STYLE) == s + assert ( + bleach.clean('', attributes=ATTR, styles=STYLE) == + s + ) def test_idempotent(): @@ -150,10 +191,10 @@ def test_idempotent(): dirty = 'invalid & < extra http://link.com' clean = bleach.clean(dirty) - eq_(clean, bleach.clean(clean)) + assert bleach.clean(clean) == clean linked = bleach.linkify(dirty) - eq_(linked, bleach.linkify(linked)) + assert bleach.linkify(linked) == linked def test_rel_already_there(): @@ -165,15 +206,15 @@ def test_rel_already_there(): ('Click ' 'here.')) - in_(link_good, bleach.linkify(linked)) - in_(link_good, bleach.linkify(link_good[0])) + assert bleach.linkify(linked) in link_good + assert bleach.linkify(link_good[0]) in link_good def test_lowercase_html(): """We should output lowercase HTML.""" dirty = 'BAR' clean = 'BAR' - eq_(clean, bleach.clean(dirty, attributes=['class'])) + assert bleach.clean(dirty, attributes=['class']) == clean def test_wildcard_attributes(): @@ -186,22 +227,22 @@ def test_wildcard_attributes(): '') clean = ('both can have ', 'both can have ') - in_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR)) + assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean def test_sarcasm(): """Jokes should crash.""" dirty = 'Yeah right ' clean = 'Yeah right <sarcasm/>' - eq_(clean, bleach.clean(dirty)) + assert bleach.clean(dirty) == clean def test_user_defined_protocols_valid(): valid_href = 'allowed href' - eq_(valid_href, bleach.clean(valid_href, protocols=['my_protocol'])) + assert bleach.clean(valid_href, protocols=['my_protocol']) == valid_href def test_user_defined_protocols_invalid(): invalid_href = 
'invalid href' cleaned_href = 'invalid href' - eq_(cleaned_href, bleach.clean(invalid_href, protocols=['my_protocol'])) + assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href From 30637ed15a7912af215c1b4ac141b1e41f5ef2f7 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 11:18:04 -0400 Subject: [PATCH 019/314] Rewrite test_css.py to work with py.test --- tests/test_css.py | 107 +++++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 35 deletions(-) diff --git a/tests/test_css.py b/tests/test_css.py index b40596ff..0b92f40b 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -1,6 +1,6 @@ from functools import partial -from nose.tools import eq_ +import pytest from bleach import clean @@ -8,47 +8,85 @@ clean = partial(clean, tags=['p'], attributes=['style']) -def test_allowed_css(): - tests = ( - ('font-family: Arial; color: red; float: left; ' - 'background-color: red;', 'color: red;', ['color']), - ('border: 1px solid blue; color: red; float: left;', 'color: red;', - ['color']), - ('border: 1px solid blue; color: red; float: left;', - 'color: red; float: left;', ['color', 'float']), - ('color: red; float: left; padding: 1em;', 'color: red; float: left;', - ['color', 'float']), - ('color: red; float: left; padding: 1em;', 'color: red;', ['color']), - ('cursor: -moz-grab;', 'cursor: -moz-grab;', ['cursor']), - ('color: hsl(30,100%,50%);', 'color: hsl(30,100%,50%);', ['color']), - ('color: rgba(255,0,0,0.4);', 'color: rgba(255,0,0,0.4);', ['color']), - ("text-overflow: ',' ellipsis;", "text-overflow: ',' ellipsis;", - ['text-overflow']), - ('text-overflow: "," ellipsis;', 'text-overflow: "," ellipsis;', - ['text-overflow']), - ('font-family: "Arial";', 'font-family: "Arial";', ['font-family']), - ) +@pytest.mark.parametrize('data,styles,expected', [ + ( + 'font-family: Arial; color: red; float: left; background-color: red;', + ['color'], + 'color: red;' + ), + ( + 'border: 1px solid blue; 
color: red; float: left;', + ['color'], + 'color: red;' + ), + ( + 'border: 1px solid blue; color: red; float: left;', + ['color', 'float'], + 'color: red; float: left;' + ), + ( + 'color: red; float: left; padding: 1em;', + ['color', 'float'], + 'color: red; float: left;' + ), + ( + 'color: red; float: left; padding: 1em;', + ['color'], + 'color: red;' + ), + ( + 'cursor: -moz-grab;', + ['cursor'], + 'cursor: -moz-grab;' + ), + ( + 'color: hsl(30,100%,50%);', + ['color'], + 'color: hsl(30,100%,50%);' + ), + ( + 'color: rgba(255,0,0,0.4);', + ['color'], + 'color: rgba(255,0,0,0.4);' + ), + ( + "text-overflow: ',' ellipsis;", + ['text-overflow'], + "text-overflow: ',' ellipsis;" + ), + ( + 'text-overflow: "," ellipsis;', + ['text-overflow'], + 'text-overflow: "," ellipsis;' + ), + ( + 'font-family: "Arial";', + ['font-family'], + 'font-family: "Arial";' + ), +]) +def test_allowed_css(data, styles, expected): p_single = '

bar

' p_double = "

bar

" - def check(i, o, s): - if '"' in i: - eq_(p_double.format(o), clean(p_double.format(i), styles=s)) - else: - eq_(p_single.format(o), clean(p_single.format(i), styles=s)) - - for i, o, s in tests: - yield check, i, o, s + if '"' in data: + assert clean(p_double.format(data), styles=styles) == p_double.format(expected) + else: + assert clean(p_single.format(data), styles=styles) == p_single.format(expected) def test_valid_css(): """The sanitizer should fix missing CSS values.""" styles = ['color', 'float'] - eq_('

foo

', - clean('

foo

', styles=styles)) - eq_('

foo

', - clean('

foo

', styles=styles)) + assert ( + clean('

foo

', styles=styles) == + '

foo

' + ) + assert ( + clean('

foo

', styles=styles) == + '

foo

' + ) def test_style_hang(): @@ -90,5 +128,4 @@ def test_style_hang(): """100%/normal 'Courier New', 'Andale Mono', monospace;">""" """Hello world

""") - result = clean(html, styles=styles) - eq_(expected, result) + assert clean(html, styles=styles) == expected From dbfb5401922c6b252f0ccb8cb3279d11ea51809b Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 13:19:13 -0400 Subject: [PATCH 020/314] Rewrite test_links.py to work with py.test --- tests/test_links.py | 634 ++++++++++++++++++++++++-------------------- 1 file changed, 353 insertions(+), 281 deletions(-) diff --git a/tests/test_links.py b/tests/test_links.py index 20d50ac8..15e40018 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -4,43 +4,49 @@ from urllib import quote_plus from html5lib.tokenizer import HTMLTokenizer -from nose.tools import eq_ +import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC def test_url_re(): - def no_match(s): - match = url_re.search(s) - if match: - assert not match, 'matched {0!s}'.format(s[slice(*match.span())]) - yield no_match, 'just what i am looking for...it' + text = 'just what i am looking for...it' + match = url_re.search(text) + assert not match, 'matched {0!s}'.format(text[slice(*match.span())]) def test_empty(): - eq_('', linkify('')) + assert linkify('') == '' def test_simple_link(): - eq_('a http://example.com' - ' link', - linkify('a http://example.com link')) - eq_('a https://example.com' - ' link', - linkify('a https://example.com link')) - eq_('a example.com link', - linkify('a example.com link')) + assert ( + linkify('a http://example.com link') == + 'a http://example.com link' + ) + assert ( + linkify('a https://example.com link') == + 'a https://example.com link' + ) + assert ( + linkify('a example.com link') == + 'a example.com link' + ) def test_trailing_slash(): - eq_('http://examp.com/', - linkify('http://examp.com/')) - eq_('' - 'http://example.com/foo/', - linkify('http://example.com/foo/')) - eq_('' - 'http://example.com/foo/bar/', - linkify('http://example.com/foo/bar/')) + assert ( + linkify('http://examp.com/') == + 'http://examp.com/' + ) 
+ assert ( + linkify('http://example.com/foo/') == + 'http://example.com/foo/' + ) + assert ( + linkify('http://example.com/foo/bar/') == + 'http://example.com/foo/bar/' + ) def test_mangle_link(): @@ -50,9 +56,10 @@ def filter_url(attrs, new=False): attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted) return attrs - eq_('' - 'http://example.com', - linkify('http://example.com', DC + [filter_url])) + assert ( + linkify('http://example.com', DC + [filter_url]) == + 'http://example.com' + ) def test_mangle_text(): @@ -62,92 +69,114 @@ def ft(attrs, new=False): attrs['_text'] = 'bar' return attrs - eq_('bar bar', - linkify('http://ex.mp foo', [ft])) - - -def test_email_link(): - tests = ( - ('a james@example.com mailto', False, 'a james@example.com mailto'), - ('a james@example.com.au mailto', False, - 'a james@example.com.au mailto'), - ('a james@example.com mailto', - True, 'a james@example.com mailto'), - ('aussie ' - 'james@example.com.au mailto', True, - 'aussie james@example.com.au mailto'), - # This is kind of a pathological case. I guess we do our best here. - ('email to ' - 'james@example.com', - True, - 'email to james@example.com'), - ('
' - 'jinkyun@example.com', - True, - '
jinkyun@example.com'), + assert ( + linkify('http://ex.mp foo', [ft]) == + 'bar bar' ) - def _check(o, p, i): - eq_(o, linkify(i, parse_email=p)) - for (o, p, i) in tests: - yield _check, o, p, i - - -def test_email_link_escaping(): - tests = ( - ('''''' - '''"james"@example.com''', - '"james"@example.com'), - ('''''' - '''"j'ames"@example.com''', - '"j\'ames"@example.com'), - ('''''' - '''"ja>mes"@example.com''', - '"ja>mes"@example.com'), +@pytest.mark.parametrize('data,parse_email,expected', [ + ( + 'a james@example.com mailto', + False, + 'a james@example.com mailto' + ), + ( + 'a james@example.com.au mailto', + False, + 'a james@example.com.au mailto' + ), + ( + 'a james@example.com mailto', + True, + 'a james@example.com mailto' + ), + ( + 'aussie james@example.com.au mailto', + True, + 'aussie james@example.com.au mailto' + ), + # This is kind of a pathological case. I guess we do our best here. + ( + 'email to james@example.com', + True, + 'email to james@example.com' + ), + ( + '
jinkyun@example.com', + True, + '
jinkyun@example.com' ) - - def _check(o, i): - eq_(o, linkify(i, parse_email=True)) - - for (o, i) in tests: - yield _check, o, i - - -def test_prevent_links(): +]) +def test_email_link(data, parse_email, expected): + assert linkify(data, parse_email=parse_email) == expected + + +@pytest.mark.parametrize('data,expected', [ + ( + '"james"@example.com', + '''"james"@example.com''' + ), + ( + '"j\'ames"@example.com', + '''"j'ames"@example.com''' + ), + ( + '"ja>mes"@example.com', + '''"ja>mes"@example.com''' + ), +]) +def test_email_link_escaping(data, expected): + assert linkify(data, parse_email=True) == expected + + +def no_new_links(attrs, new=False): + if new: + return None + return attrs + + +def no_old_links(attrs, new=False): + if not new: + return None + return attrs + + +def noop(attrs, new=False): + return attrs + + +@pytest.mark.parametrize('callback,expected', [ + ( + [noop], + 'a ex.mp example' + ), + ( + [no_new_links, noop], + 'a ex.mp example' + ), + ( + [noop, no_new_links], + 'a ex.mp example' + ), + ( + [no_old_links, noop], + 'a ex.mp example' + ), + ( + [noop, no_old_links], + 'a ex.mp example' + ), + ( + [no_old_links, no_new_links], + 'a ex.mp example' + ) +]) +def test_prevent_links(callback, expected): """Returning None from any callback should remove links or prevent them from being created.""" - - def no_new_links(attrs, new=False): - if new: - return None - return attrs - - def no_old_links(attrs, new=False): - if not new: - return None - return attrs - - def noop(attrs, new=False): - return attrs - - in_text = 'a ex.mp example' - out_text = 'a ex.mp example' - tests = ( - ([noop], ('a ex.mp ' - 'example'), 'noop'), - ([no_new_links, noop], in_text, 'no new, noop'), - ([noop, no_new_links], in_text, 'noop, no new'), - ([no_old_links, noop], out_text, 'no old, noop'), - ([noop, no_old_links], out_text, 'noop, no old'), - ([no_old_links, no_new_links], 'a ex.mp example', 'no links'), - ) - - def _check(cb, o, msg): - eq_(o, linkify(in_text, 
cb), msg) - - for (cb, o, msg) in tests: - yield _check, cb, o, msg + text = 'a ex.mp example' + assert linkify(text, callback) == expected def test_set_attrs(): @@ -157,8 +186,10 @@ def set_attr(attrs, new=False): attrs['rev'] = 'canonical' return attrs - eq_('ex.mp', - linkify('ex.mp', [set_attr])) + assert ( + linkify('ex.mp', [set_attr]) == + 'ex.mp' + ) def test_only_proto_links(): @@ -169,9 +200,10 @@ def only_proto(attrs, new=False): return attrs in_text = 'a ex.mp http://ex.mp bar' - out_text = ('a ex.mp http://ex.mp ' - 'bar') - eq_(out_text, linkify(in_text, [only_proto])) + assert ( + linkify(in_text, [only_proto]) == + 'a ex.mp http://ex.mp bar' + ) def test_stop_email(): @@ -181,121 +213,138 @@ def no_email(attrs, new=False): return None return attrs text = 'do not link james@example.com' - eq_(text, linkify(text, parse_email=True, callbacks=[no_email])) - - -def test_tlds(): - eq_('example.com', - linkify('example.com')) - eq_('example.co', - linkify('example.co')) - eq_('example.co.uk', - linkify('example.co.uk')) - eq_('example.edu', - linkify('example.edu')) - eq_('example.xxx', - linkify('example.xxx')) - eq_('example.yyy', linkify('example.yyy')) - eq_(' brie', linkify(' brie')) - eq_('bit.ly/fun', - linkify('bit.ly/fun')) + + assert linkify(text, parse_email=True, callbacks=[no_email]) == text + + +@pytest.mark.parametrize('data,expected', [ + # tlds + ('example.com', 'example.com'), + ('example.co', 'example.co'), + ('example.co.uk', 'example.co.uk'), + ('example.edu', 'example.edu'), + ('example.xxx', 'example.xxx'), + ('bit.ly/fun', 'bit.ly/fun'), + + # non-tlds + ('example.yyy', 'example.yyy'), + ('brie', 'brie'), +]) +def test_tlds(data, expected): + assert linkify(data) == expected def test_escaping(): - eq_('< unrelated', linkify('< unrelated')) + assert linkify('< unrelated') == '< unrelated' def test_nofollow_off(): - eq_('example.com', - linkify('example.com', [])) + assert linkify('example.com', []) == 'example.com' def 
test_link_in_html(): - eq_('http://yy.com', - linkify('http://yy.com')) - - eq_('http://xx.com' - '', - linkify('http://xx.com')) + assert ( + linkify('http://yy.com') == + 'http://yy.com' + ) + assert ( + linkify('http://xx.com') == + 'http://xx.com' + ) def test_links_https(): - eq_('https://yy.com', - linkify('https://yy.com')) + assert ( + linkify('https://yy.com') == + 'https://yy.com' + ) def test_add_rel_nofollow(): """Verify that rel="nofollow" is added to an existing link""" - eq_('http://yy.com', - linkify('http://yy.com')) + assert ( + linkify('http://yy.com') == + 'http://yy.com' + ) def test_url_with_path(): - eq_('' - 'http://example.com/path/to/file', - linkify('http://example.com/path/to/file')) + assert ( + linkify('http://example.com/path/to/file') == + 'http://example.com/path/to/file' + ) def test_link_ftp(): - eq_('' - 'ftp://ftp.mozilla.org/some/file', - linkify('ftp://ftp.mozilla.org/some/file')) + assert ( + linkify('ftp://ftp.mozilla.org/some/file') == + 'ftp://ftp.mozilla.org/some/file' + ) def test_link_query(): - eq_('' - 'http://xx.com/?test=win', - linkify('http://xx.com/?test=win')) - eq_('' - 'xx.com/?test=win', - linkify('xx.com/?test=win')) - eq_('' - 'xx.com?test=win', - linkify('xx.com?test=win')) + assert ( + linkify('http://xx.com/?test=win') == + 'http://xx.com/?test=win' + ) + assert ( + linkify('xx.com/?test=win') == + 'xx.com/?test=win' + ) + assert ( + linkify('xx.com?test=win') == + 'xx.com?test=win' + ) def test_link_fragment(): - eq_('' - 'http://xx.com/path#frag', - linkify('http://xx.com/path#frag')) + assert ( + linkify('http://xx.com/path#frag') == + 'http://xx.com/path#frag' + ) def test_link_entities(): - eq_('' - 'http://xx.com/?a=1&b=2', - linkify('http://xx.com/?a=1&b=2')) + assert ( + linkify('http://xx.com/?a=1&b=2') == + 'http://xx.com/?a=1&b=2' + ) def test_escaped_html(): """If I pass in escaped HTML, it should probably come out escaped.""" s = '<em>strong</em>' - eq_(s, linkify(s)) + assert linkify(s) == 
s def test_link_http_complete(): - eq_('' - 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f', - linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f')) + assert ( + linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f') == + ( + '' + 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f' + ) + ) def test_non_url(): """document.vulnerable should absolutely not be linkified.""" s = 'document.vulnerable' - eq_(s, linkify(s)) + assert linkify(s) == s def test_javascript_url(): """javascript: urls should never be linkified.""" s = 'javascript:document.vulnerable' - eq_(s, linkify(s)) + assert linkify(s) == s def test_unsafe_url(): """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning.""" - eq_('All your{"xx.yy.com/grover.png"}base are', - linkify('All your{"xx.yy.com/grover.png"}base are')) + assert ( + linkify('All your{"xx.yy.com/grover.png"}base are') == + 'All your{"xx.yy.com/grover.png"}base are' + ) def test_skip_pre(): @@ -306,171 +355,194 @@ def test_skip_pre(): all_linked = ('http://xx.com ' '
http://xx.com'
                   '
') - eq_(linked, linkify(simple, skip_pre=True)) - eq_(all_linked, linkify(simple)) + assert linkify(simple, skip_pre=True) == linked + assert linkify(simple) == all_linked already_linked = '
xx
' nofollowed = '
xx
' - eq_(nofollowed, linkify(already_linked)) - eq_(nofollowed, linkify(already_linked, skip_pre=True)) - - eq_( - linkify('
http://example.com
http://example.com', - skip_pre=True), - ('
http://example.com
' - 'http://example.com') + assert linkify(already_linked) == nofollowed + assert linkify(already_linked, skip_pre=True) == nofollowed + + assert ( + linkify('
http://example.com
http://example.com', skip_pre=True) == + ( + '
http://example.com
' + 'http://example.com' + ) ) def test_libgl(): """libgl.so.1 should not be linkified.""" - eq_('libgl.so.1', linkify('libgl.so.1')) + s = 'libgl.so.1' + assert linkify(s) == s -def test_end_of_sentence(): +@pytest.mark.parametrize('url,periods', [ + ('example.com', '.'), + ('example.com', '...'), + ('ex.com/foo', '.'), + ('ex.com/foo', '....'), +]) +def test_end_of_sentence(url, periods): """example.com. should match.""" out = '{0!s}{1!s}' intxt = '{0!s}{1!s}' - def check(u, p): - eq_(out.format(u, p), - linkify(intxt.format(u, p))) - - tests = ( - ('example.com', '.'), - ('example.com', '...'), - ('ex.com/foo', '.'), - ('ex.com/foo', '....'), - ) - - for u, p in tests: - yield check, u, p + assert linkify(intxt.format(url, periods)) == out.format(url, periods) def test_end_of_clause(): """example.com/foo, shouldn't include the ,""" - eq_('ex.com/foo, bar', - linkify('ex.com/foo, bar')) + assert ( + linkify('ex.com/foo, bar') == + 'ex.com/foo, bar' + ) def test_sarcasm(): """Jokes should crash.""" - dirty = 'Yeah right ' - clean = 'Yeah right <sarcasm/>' - eq_(clean, linkify(dirty)) - - -def test_wrapping_parentheses(): + assert linkify('Yeah right ') == 'Yeah right <sarcasm/>' + + +@pytest.mark.parametrize('data,expected_data', [ + ( + '(example.com)', + ('(', 'example.com', 'example.com', ')') + ), + ( + '(example.com/)', + ('(', 'example.com/', 'example.com/', ')') + ), + ( + '(example.com/foo)', + ('(', 'example.com/foo', 'example.com/foo', ')') + ), + ( + '(((example.com/))))', + ('(((', 'example.com/', 'example.com/', '))))') + ), + ( + 'example.com/))', + ('', 'example.com/', 'example.com/', '))') + ), + ( + '(foo http://example.com/)', + ('(foo ', 'example.com/', 'http://example.com/', ')') + ), + ( + '(foo http://example.com)', + ('(foo ', 'example.com', 'http://example.com', ')') + ), + ( + 'http://en.wikipedia.org/wiki/Test_(assessment)', + ('', 'en.wikipedia.org/wiki/Test_(assessment)', + 'http://en.wikipedia.org/wiki/Test_(assessment)', '') + ), + ( 
+ '(http://en.wikipedia.org/wiki/Test_(assessment))', + ('(', 'en.wikipedia.org/wiki/Test_(assessment)', + 'http://en.wikipedia.org/wiki/Test_(assessment)', ')') + ), + ( + '((http://en.wikipedia.org/wiki/Test_(assessment))', + ('((', 'en.wikipedia.org/wiki/Test_(assessment', + 'http://en.wikipedia.org/wiki/Test_(assessment', '))') + ), + ( + '(http://en.wikipedia.org/wiki/Test_(assessment)))', + ('(', 'en.wikipedia.org/wiki/Test_(assessment))', + 'http://en.wikipedia.org/wiki/Test_(assessment))', ')') + ), + ( + '(http://en.wikipedia.org/wiki/)Test_(assessment', + ('(', 'en.wikipedia.org/wiki/)Test_(assessment', + 'http://en.wikipedia.org/wiki/)Test_(assessment', '') + ) +]) +def test_wrapping_parentheses(data, expected_data): """URLs wrapped in parantheses should not include them.""" out = '{0!s}{2!s}{3!s}' - tests = ( - ('(example.com)', ('(', 'example.com', 'example.com', ')')), - ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')), - ('(example.com/foo)', - ('(', 'example.com/foo', 'example.com/foo', ')')), - ('(((example.com/))))', - ('(((', 'example.com/', 'example.com/', '))))')), - ('example.com/))', - ('', 'example.com/', 'example.com/', '))')), - ('(foo http://example.com/)', - ('(foo ', 'example.com/', 'http://example.com/', ')')), - ('(foo http://example.com)', - ('(foo ', 'example.com', 'http://example.com', ')')), - ('http://en.wikipedia.org/wiki/Test_(assessment)', - ('', 'en.wikipedia.org/wiki/Test_(assessment)', - 'http://en.wikipedia.org/wiki/Test_(assessment)', '')), - ('(http://en.wikipedia.org/wiki/Test_(assessment))', - ('(', 'en.wikipedia.org/wiki/Test_(assessment)', - 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')), - ('((http://en.wikipedia.org/wiki/Test_(assessment))', - ('((', 'en.wikipedia.org/wiki/Test_(assessment', - 'http://en.wikipedia.org/wiki/Test_(assessment', '))')), - ('(http://en.wikipedia.org/wiki/Test_(assessment)))', - ('(', 'en.wikipedia.org/wiki/Test_(assessment))', - 
'http://en.wikipedia.org/wiki/Test_(assessment))', ')')), - ('(http://en.wikipedia.org/wiki/)Test_(assessment', - ('(', 'en.wikipedia.org/wiki/)Test_(assessment', - 'http://en.wikipedia.org/wiki/)Test_(assessment', '')), - ) - - def check(test, expected_output): - eq_(out.format(*expected_output), linkify(test)) - - for test, expected_output in tests: - yield check, test, expected_output + assert linkify(data) == out.format(*expected_data) def test_parentheses_with_removing(): - expect = '(test.py)' - eq_(expect, linkify(expect, callbacks=[lambda *a: None])) - - -def test_ports(): + expected = '(test.py)' + assert linkify(expected, callbacks=[lambda *a: None]) == expected + + +@pytest.mark.parametrize('data,expected_data', [ + ('http://foo.com:8000', ('http://foo.com:8000', '')), + ('http://foo.com:8000/', ('http://foo.com:8000/', '')), + ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')), + ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')), + ('http://foo.com:', ('http://foo.com', ':')), +]) +def test_ports(data, expected_data): """URLs can contain port numbers.""" - tests = ( - ('http://foo.com:8000', ('http://foo.com:8000', '')), - ('http://foo.com:8000/', ('http://foo.com:8000/', '')), - ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')), - ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')), - ('http://foo.com:', ('http://foo.com', ':')), - ) - - def check(test, output): - out = '{0}{1}' - eq_(out.format(*output), - linkify(test)) - - for test, output in tests: - yield check, test, output + out = '{0}{1}' + assert linkify(data) == out.format(*expected_data) def test_tokenizer(): """Linkify doesn't always have to sanitize.""" raw = 'test' - eq_('test<x></x>', linkify(raw)) - eq_(raw, linkify(raw, tokenizer=HTMLTokenizer)) + assert linkify(raw) == 'test<x></x>' + assert linkify(raw, tokenizer=HTMLTokenizer) == raw def test_ignore_bad_protocols(): - eq_('foohttp://bar', - linkify('foohttp://bar')) - eq_('fohttp://exampl.com', - 
linkify('fohttp://exampl.com')) + assert ( + linkify('foohttp://bar') == + 'foohttp://bar' + ) + assert ( + linkify('fohttp://exampl.com') == + 'fohttp://exampl.com' + ) def test_max_recursion_depth(): """If we hit the max recursion depth, just return the string.""" test = '' * 2000 + 'foo' + '' * 2000 - eq_(test, linkify(test)) + assert linkify(test) == test def test_link_emails_and_urls(): """parse_email=True shouldn't prevent URLs from getting linkified.""" - output = ('' - 'http://example.com ' - 'person@example.com') - eq_(output, linkify('http://example.com person@example.com', - parse_email=True)) + assert ( + linkify('http://example.com person@example.com', parse_email=True) == + ( + '' + 'http://example.com ' + 'person@example.com' + ) + ) def test_links_case_insensitive(): """Protocols and domain names are case insensitive.""" expect = ('' 'HTTP://EXAMPLE.COM') - eq_(expect, linkify('HTTP://EXAMPLE.COM')) + assert linkify('HTTP://EXAMPLE.COM') == expect def test_elements_inside_links(): - eq_('hello
', - linkify('hello
')) + assert ( + linkify('hello
') == + 'hello
' + ) - eq_('bold hello
', - linkify('bold hello
')) + assert ( + linkify('bold hello
') == + 'bold hello
' + ) def test_remove_first_childlink(): - expect = '

something

' callbacks = [lambda *a: None] - eq_(expect, - linkify('

something

', callbacks=callbacks)) + assert ( + linkify('

something

', callbacks=callbacks) == + '

something

' + ) From 530fcd283c9eab23a72739ae60e37acf16f23eec Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 13:47:16 -0400 Subject: [PATCH 021/314] Rewrite test_security.py to work with py.test --- tests/test_security.py | 121 +++++++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 47 deletions(-) diff --git a/tests/test_security.py b/tests/test_security.py index 6adab59b..7ebb25cd 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -1,90 +1,118 @@ """More advanced security tests""" -from nose.tools import eq_ - from bleach import clean def test_nested_script_tag(): - eq_('<<script>script>evil()<</script>/script>', - clean('</script>')) - eq_('<<x>script>evil()<</x>/script>', - clean('<script>evil()</script>')) + assert ( + clean('</script>') == + '<<script>script>evil()<</script>/script>' + ) + assert ( + clean('<script>evil()</script>') == + '<<x>script>evil()<</x>/script>' + ) def test_nested_script_tag_r(): - eq_('<script<script>>evil()</script<>>', - clean('>evil()>')) + assert ( + clean('>evil()>') == + '<script<script>>evil()</script<>>' + ) def test_invalid_attr(): IMG = ['img', ] IMG_ATTR = ['src'] - eq_('test', - clean('test')) - eq_('', - clean('', - tags=IMG, attributes=IMG_ATTR)) - eq_('', - clean('', - tags=IMG, attributes=IMG_ATTR)) + assert ( + clean('test') == + 'test' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) def test_unquoted_attr(): - eq_('myabbr', - clean('myabbr')) + assert ( + clean('myabbr') == + 'myabbr' + ) def test_unquoted_event_handler(): - eq_('xx.com', - clean('xx.com')) + assert ( + clean('xx.com') == + 'xx.com' + ) def test_invalid_attr_value(): - eq_('<img src="javascript:alert(\'XSS\');">', - clean('')) + assert ( + clean('') == + '<img src="javascript:alert(\'XSS\');">' + ) def test_invalid_href_attr(): - eq_('xss', - clean('xss')) + assert ( + clean('xss') == + 'xss' + ) def 
test_invalid_filter_attr(): IMG = ['img', ] IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"} - eq_('', - clean('', - tags=IMG, attributes=IMG_ATTR)) - - eq_('', clean('', - tags=IMG, attributes=IMG_ATTR)) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) def test_invalid_tag_char(): - eq_('<script xss="" src="http://xx.com/xss.js"></script>', - clean('')) - eq_('<script src="http://xx.com/xss.js"></script>', - clean('')) + assert ( + clean('') == + '<script xss="" src="http://xx.com/xss.js"></script>' + ) + assert ( + clean('') == + '<script src="http://xx.com/xss.js"></script>' + ) def test_unclosed_tag(): - eq_('<script src="http://xx.com/xss.js&lt;b">', - clean('ipt>' - eq_('pt>alert(1)ipt>', clean(s, strip=True)) + assert clean(s, strip=True) == 'pt>alert(1)ipt>' s = 'pt>pt>alert(1)' - eq_('pt>pt>alert(1)', clean(s, strip=True)) + assert clean(s, strip=True) == 'pt>pt>alert(1)' def test_nasty(): @@ -94,7 +122,7 @@ def test_nasty(): expect = ('<scr<script></script>ipt type="text/javascript"' '>alert("foo");</script>script<del></del>' '>') - eq_(expect, clean(test)) + assert clean(test) == expect def test_poster_attribute(): @@ -102,11 +130,10 @@ def test_poster_attribute(): tags = ['video'] attrs = {'video': ['poster']} test = '' - expect = '' - eq_(expect, clean(test, tags=tags, attributes=attrs)) + assert clean(test, tags=tags, attributes=attrs) == '' ok = '' - eq_(ok, clean(ok, tags=tags, attributes=attrs)) + assert clean(ok, tags=tags, attributes=attrs) == ok def test_feed_protocol(): - eq_('foo', clean('foo')) + assert clean('foo') == 'foo' From 76f54caf0c4b3b53a2708b3a1f45f83c2be346eb Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 13:53:43 -0400 Subject: [PATCH 022/314] Rewrite test_unicode.py to work with py.test --- tests/test_unicode.py | 57 +++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 32 
deletions(-) diff --git a/tests/test_unicode.py b/tests/test_unicode.py index 723df5f2..b8b670e8 100644 --- a/tests/test_unicode.py +++ b/tests/test_unicode.py @@ -1,59 +1,52 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -from nose.tools import eq_ + +import pytest from bleach import clean, linkify -from bleach.tests.tools import in_ def test_japanese_safe_simple(): - eq_('ヘルプとチュートリアル', clean('ヘルプとチュートリアル')) - eq_('ヘルプとチュートリアル', linkify('ヘルプとチュートリアル')) + assert clean('ヘルプとチュートリアル') == 'ヘルプとチュートリアル' + assert linkify('ヘルプとチュートリアル') == 'ヘルプとチュートリアル' def test_japanese_strip(): - eq_('ヘルプとチュートリアル', - clean('ヘルプとチュートリアル')) - eq_('<span>ヘルプとチュートリアル</span>', - clean('ヘルプとチュートリアル')) + assert clean('ヘルプとチュートリアル') == 'ヘルプとチュートリアル' + assert clean('ヘルプとチュートリアル') == '<span>ヘルプとチュートリアル</span>' def test_russian_simple(): - eq_('Домашняя', clean('Домашняя')) - eq_('Домашняя', linkify('Домашняя')) + assert clean('Домашняя') == 'Домашняя' + assert linkify('Домашняя') == 'Домашняя' def test_mixed(): - eq_('Домашняяヘルプとチュートリアル', - clean('Домашняяヘルプとチュートリアル')) + assert clean('Домашняяヘルプとチュートリアル') == 'Домашняяヘルプとチュートリアル' def test_mixed_linkify(): - in_(('Домашняя ' - 'http://example.com ヘルプとチュートリアル', - 'Домашняя ' - 'http://example.com ヘルプとチュートリアル'), - linkify('Домашняя http://example.com ヘルプとチュートリアル')) + assert ( + linkify('Домашняя http://example.com ヘルプとチュートリアル') in + ( + 'Домашняя http://example.com ヘルプとチュートリアル', + 'Домашняя http://example.com ヘルプとチュートリアル' + ) + ) -def test_url_utf8(): +@pytest.mark.parametrize('test,expected', [ + ('http://éxámplé.com/', 'http://éxámplé.com/'), + ('http://éxámplé.com/íàñá/', 'http://éxámplé.com/íàñá/'), + ('http://éxámplé.com/íàñá/?foo=bar', 'http://éxámplé.com/íàñá/?foo=bar'), + ('http://éxámplé.com/íàñá/?fóo=bár', 'http://éxámplé.com/íàñá/?fóo=bár'), +]) +def test_url_utf8(test, expected): """Allow UTF8 characters in URLs themselves.""" outs = ('{0!s}', '{0!s}') out = lambda url: [x.format(url) for x in outs] - tests = ( - 
('http://éxámplé.com/', out('http://éxámplé.com/')), - ('http://éxámplé.com/íàñá/', out('http://éxámplé.com/íàñá/')), - ('http://éxámplé.com/íàñá/?foo=bar', - out('http://éxámplé.com/íàñá/?foo=bar')), - ('http://éxámplé.com/íàñá/?fóo=bár', - out('http://éxámplé.com/íàñá/?fóo=bár')), - ) - - def check(test, expected_output): - in_(expected_output, linkify(test)) - - for test, expected_output in tests: - yield check, test, expected_output + expected = out(expected) + assert linkify(test) in expected From ba7a7825b25a628e2810279b75d9254f6caee199 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 14:06:20 -0400 Subject: [PATCH 023/314] Update travis and tox files to use py.test --- .travis.yml | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9a1f0b43..8b45c177 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ python: install: - pip install -r requirements.txt script: -- nosetests +- py.test - flake8 bleach/ deploy: provider: pypi diff --git a/tox.ini b/tox.ini index da138989..8d409cf4 100644 --- a/tox.ini +++ b/tox.ini @@ -11,4 +11,4 @@ commands = py.test {posargs:-v} deps = six html5lib==0.999 - nose + pytest From 4d749c9e36d95b852d470537a0b03bdb06e7f5fc Mon Sep 17 00:00:00 2001 From: Alexandre Macabies Date: Sun, 19 Jun 2016 00:59:43 +0200 Subject: [PATCH 024/314] Use ASCII digits in port number parsing --- bleach/__init__.py | 2 +- tests/test_links.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index bf67bf33..3f08bfdf 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -71,7 +71,7 @@ def emit(self, record): url_re = re.compile( r"""\(* # Match any opening parentheses. \b(?"]*)? 
# /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) diff --git a/tests/test_links.py b/tests/test_links.py index 15e40018..67fc2d1a 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -473,11 +473,18 @@ def test_parentheses_with_removing(): @pytest.mark.parametrize('data,expected_data', [ + # Test valid ports ('http://foo.com:8000', ('http://foo.com:8000', '')), ('http://foo.com:8000/', ('http://foo.com:8000/', '')), + + # Test non ports ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')), ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')), ('http://foo.com:', ('http://foo.com', ':')), + + # Test non-ascii ports + ('http://foo.com:\u0663\u0669/', ('http://foo.com', ':\u0663\u0669/')), + ('http://foo.com:\U0001d7e0\U0001d7d8/', ('http://foo.com', ':\U0001d7e0\U0001d7d8/')), ]) def test_ports(data, expected_data): """URLs can contain port numbers.""" From 7a6ab9d3fb7f61d71d4215f5ee308c266cbe8454 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 15:23:56 -0400 Subject: [PATCH 025/314] Allow travis to fail with Python 2.6 and 3.2 It's failing now with Python 2.6 because flake8 doesn't work with Python 2.6. It's failing with Python 3.2 because py.test doesn't work with Python 3.2. I'm not quite ready to drop support for both of them, though. So for now, let's allow the failures. 
--- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index 8b45c177..93941ff5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,10 @@ python: - "3.4" - "3.5" - "pypy" +matrix: + allow_failures: + - python: "2.6" + - python: "3.2" install: - pip install -r requirements.txt script: From 3cd9d32371b0e03d33071eb82735dc48e3f91a6e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 15:44:48 -0400 Subject: [PATCH 026/314] Fix test_idempotent to not sometimes fail linkify can return text with html attributes in a different order depending on how the attributes come out of the attrs dict. Because of that, there are several possible outcomes. This fixes the linkify test to accept both of them. Fixes #161 --- tests/test_basics.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_basics.py b/tests/test_basics.py index 6fa6c22e..459b29a0 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -193,8 +193,12 @@ def test_idempotent(): clean = bleach.clean(dirty) assert bleach.clean(clean) == clean + possible_outs = ( + 'invalid & < extra http://link.com', + 'invalid & < extra http://link.com' + ) linked = bleach.linkify(dirty) - assert bleach.linkify(linked) == linked + assert bleach.linkify(linked) == possible_outs def test_rel_already_there(): From 4df81b9b6fdf1e4d38715e6e77b22b7e22a30152 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 15:52:09 -0400 Subject: [PATCH 027/314] Fix the test I just broke --- tests/test_basics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_basics.py b/tests/test_basics.py index 459b29a0..07d4d918 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -198,7 +198,7 @@ def test_idempotent(): 'invalid & < extra http://link.com' ) linked = bleach.linkify(dirty) - assert bleach.linkify(linked) == possible_outs + assert bleach.linkify(linked) in possible_outs def 
test_rel_already_there(): From 0c070f6733501b5e2bb3c36e7946f7229b94b79b Mon Sep 17 00:00:00 2001 From: Lorenz Schori Date: Fri, 9 Sep 2016 10:23:11 +0200 Subject: [PATCH 028/314] Do not add trailing period when email address is at the end of a sentence --- bleach/__init__.py | 2 +- tests/test_links.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 3f08bfdf..3a53870d 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -88,7 +88,7 @@ def emit(self, record): (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)* # dot-atom |^"([\001-\010\013\014\016-\037!#-\[\]-\177] |\\[\001-011\013\014\016-\177])*" # quoted-string - )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.? # domain + )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}) # domain """, re.IGNORECASE | re.MULTILINE | re.VERBOSE) diff --git a/tests/test_links.py b/tests/test_links.py index 67fc2d1a..40260785 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -106,7 +106,13 @@ def ft(attrs, new=False): '
jinkyun@example.com', True, '
jinkyun@example.com' - ) + ), + # Mailto links at the end of a sentence. + ( + 'mailto james@example.com.au.', + True, + 'mailto james@example.com.au.' + ), ]) def test_email_link(data, parse_email, expected): assert linkify(data, parse_email=parse_email) == expected From 751619b43668d8e509b00afc4835fbb31955d64f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 16:57:35 -0400 Subject: [PATCH 029/314] Update CHANGES; add .cache to .gitignore --- .gitignore | 1 + CHANGES | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index c24310fb..78421070 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ dist build .tox docs/_build/ +.cache diff --git a/CHANGES b/CHANGES index 8784e83e..645e7d03 100644 --- a/CHANGES +++ b/CHANGES @@ -15,6 +15,12 @@ Version 1.5? (in progress) - clean: Added ``protocols`` to arguments list to let you override the list of allowed protocols. Thank you, Andreas Malecki! #149 +- linkify: Fix a bug involving periods at the end of an email address. Thank you, + Lorenz Schori! #219 +- linkify: Fix linkification of non-ascii ports. Thank you Alexandre, Macabies! + #207 +- Fixed a test that failed periodically. #161 +- Switched from nose to py.test. Version 1.4.3 (May 23rd, 2016) From e8649dfa7d1a758eba2558240123f0c1cedc735e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 21:05:38 -0400 Subject: [PATCH 030/314] Nix duplicate requirements; add english description of html5lib versions The requirements.txt file specified requirements already in setup.py, so there's no point in having both versions. Therefore I nixed the ones in requirements.txt. The html5lib versions are hard to read, so I added english descriptions of them. --- requirements.txt | 4 ---- setup.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 79aa6e61..a026c46e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,5 @@ -e . 
-ordereddict==1.1 -six==1.10.0 -html5lib>=0.999,<0.99999999 - # Requirements to run the test suite: pytest==3.0.3 flake8==3.0.4 diff --git a/setup.py b/setup.py index 26686efd..40b62a72 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ install_requires = [ 'six', - 'html5lib>=0.999,<0.99999999', + 'html5lib>=0.999,<0.99999999', # 3 9s to 8 9s ] try: From feaad361ce840f080c81b4c88f5d8b60190807c5 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Oct 2016 21:07:21 -0400 Subject: [PATCH 031/314] Fix .travis.yml and tox.ini to test multiple html5lib versions This fixes .travis.yml and tox.ini to test multiple html5lib versions that we allegedly support. --- .travis.yml | 7 +++++++ tox.ini | 21 ++++++++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 93941ff5..a6498a1f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,12 +11,19 @@ python: - "3.4" - "3.5" - "pypy" +env: +- HTML5LIB=0.999 # 3 +- HTML5LIB=0.9999 # 4 +- HTML5LIB=0.99999 # 5 +- HTML5LIB=0.999999 # 6 +- HTML5LIB=0.9999999 # 7 matrix: allow_failures: - python: "2.6" - python: "3.2" install: - pip install -r requirements.txt +- pip install html5lib==$HTML5LIB script: - py.test - flake8 bleach/ diff --git a/tox.ini b/tox.ini index 8d409cf4..3c0ba1f8 100644 --- a/tox.ini +++ b/tox.ini @@ -4,11 +4,22 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py32, py33, py34, py35, pypy +envlist = py{26,27,32,33,34,35}-html5lib{999,9999,99999,999999,9999999},pypy-html5lib9999999 [testenv] -commands = py.test {posargs:-v} +basepython = + py26: python2.6 + py27: python2.7 + py32: python3.2 + py33: python3.3 + py34: python3.4 + py35: python3.5 deps = - six - html5lib==0.999 - pytest + -rrequirements.txt + html5lib999: html5lib==0.999 + html5lib9999: html5lib==0.9999 + html5lib99999: html5lib==0.99999 + html5lib999999: html5lib==0.999999 + html5lib9999999: html5lib==0.9999999 +commands = + py.test {posargs:-v} From 8f879871a60cb6cd5e3b610fbc01c884633c1ecf Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 1 Nov 2016 09:37:27 -0400 Subject: [PATCH 032/314] Remove support for html5lib 0.9999 and 0.99999 html5lib 0.9999 and 0.99999 have a bug about relative urls and the bleach tests fail with those two versions. Given that, this removes support for both of those. Additionally, I tweaked the CHANGES file a bit. --- .travis.yml | 2 -- CHANGES | 13 ++++++++----- setup.py | 4 +++- tox.ini | 4 +--- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index a6498a1f..59912666 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,8 +13,6 @@ python: - "pypy" env: - HTML5LIB=0.999 # 3 -- HTML5LIB=0.9999 # 4 -- HTML5LIB=0.99999 # 5 - HTML5LIB=0.999999 # 6 - HTML5LIB=0.9999999 # 7 matrix: diff --git a/CHANGES b/CHANGES index 645e7d03..000054d6 100644 --- a/CHANGES +++ b/CHANGES @@ -20,7 +20,10 @@ Version 1.5? (in progress) - linkify: Fix linkification of non-ascii ports. Thank you Alexandre, Macabies! #207 - Fixed a test that failed periodically. #161 -- Switched from nose to py.test. +- Switched from nose to py.test. #204 +- Add test matrix for all supported Python and html5lib versions. #230 +- Limit to html5lib ``>=0.999,!=0.9999,!=0.99999,<0.99999999`` because 0.9999 + and 0.99999 are busted. 
Version 1.4.3 (May 23rd, 2016) @@ -28,7 +31,7 @@ Version 1.4.3 (May 23rd, 2016) **Changes** -- Limit to html5lib >=0.999<0.99999999 because of impending change to +- Limit to html5lib ``>=0.999,<0.99999999`` because of impending change to sanitizer api. #195 @@ -37,7 +40,7 @@ Version 1.4.2 (September 11, 2015) **Changes** -- linkify: Fix hang in linkify with parse_email=True. #124 +- linkify: Fix hang in linkify with ``parse_email=True``. #124 - linkify: Fix crash in linkify when removing a link that is a first-child. #136 - Updated TLDs. - linkify: Don't remove exterior brackets when linkifying. #146 @@ -58,7 +61,7 @@ Version 1.4 (January 12, 2014) **Changes** - linkify: Update linkify to use etree type Treewalker instead of simpletree. -- Updated html5lib to version >= 0.999. +- Updated html5lib to version ``>=0.999``. - Update all code to be compatible with Python 3 and 2 using six. - Switch to Apache License. @@ -77,7 +80,7 @@ Version 1.2.2 (May 18, 2013) Version 1.2.1 (February 19, 2013) --------------------------------- -- clean() no longer considers "feed:" an acceptable protocol due to +- clean() no longer considers ``feed:`` an acceptable protocol due to inconsistencies in browser behavior. diff --git a/setup.py b/setup.py index 40b62a72..872970e0 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,9 @@ install_requires = [ 'six', - 'html5lib>=0.999,<0.99999999', # 3 9s to 8 9s + # 3 9s up to but not including 8 9s, but not 4 9s or 5 9s because they're + # busted + 'html5lib>=0.999,!=0.9999,!=0.99999,<0.99999999', ] try: diff --git a/tox.ini b/tox.ini index 3c0ba1f8..73c8511c 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py{26,27,32,33,34,35}-html5lib{999,9999,99999,999999,9999999},pypy-html5lib9999999 +envlist = py{26,27,32,33,34,35}-html5lib{999,999999,9999999},pypy-html5lib9999999 [testenv] basepython = @@ -17,8 +17,6 @@ basepython = deps = -rrequirements.txt html5lib999: html5lib==0.999 - html5lib9999: html5lib==0.9999 - html5lib99999: html5lib==0.99999 html5lib999999: html5lib==0.999999 html5lib9999999: html5lib==0.9999999 commands = From 93ac61f6969fb1c0290febaed2fc7e161dc02d7e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 2 Nov 2016 09:15:14 -0400 Subject: [PATCH 033/314] Add support for "python setup.py test" --- .gitignore | 3 ++- CHANGES | 1 + setup.cfg | 3 +++ setup.py | 12 ++++++++++++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 78421070..f5adb549 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ dist build .tox docs/_build/ -.cache +.cache/ +.eggs/ diff --git a/CHANGES b/CHANGES index 000054d6..ee4f2806 100644 --- a/CHANGES +++ b/CHANGES @@ -24,6 +24,7 @@ Version 1.5? (in progress) - Add test matrix for all supported Python and html5lib versions. #230 - Limit to html5lib ``>=0.999,!=0.9999,!=0.99999,<0.99999999`` because 0.9999 and 0.99999 are busted. +- Add support for ``python setup.py test``. 
#97 Version 1.4.3 (May 23rd, 2016) diff --git a/setup.cfg b/setup.cfg index 38f6166d..f3a416e4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,6 @@ +[aliases] +test=pytest + [flake8] ignore = E731,W503 diff --git a/setup.py b/setup.py index 872970e0..2a28da45 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,18 @@ import re +import sys from setuptools import setup, find_packages from distutils.util import convert_path +setup_requires = [] +if 'test' in sys.argv: + # Only add pytest-runner to setup_requires if running tests + setup_requires.append('pytest-runner>=2.0,<3dev') + +tests_require = [ + 'pytest==3.0.3', +] + install_requires = [ 'six', # 3 9s up to but not including 8 9s, but not 4 9s or 5 9s because they're @@ -52,6 +62,8 @@ def get_version(): package_data={'': ['README.rst']}, zip_safe=False, install_requires=install_requires, + setup_requires=setup_requires, + tests_require=tests_require, classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Web Environment', From 4e868219083a0bd6799e5a47b521d20847640661 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 2 Nov 2016 10:16:52 -0400 Subject: [PATCH 034/314] Clarify docs regarding appropriate use cases and goals This clarifies some of the guiding principles behind what bleach is for and not for so it's clearer to users whether their needs will be met. 
--- bleach/__init__.py | 25 +++++++++++++----- docs/goals.rst | 63 +++++++++++++++++++++++++++++++++------------- 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 3a53870d..097c0d93 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -104,7 +104,14 @@ def emit(self, record): def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, strip_comments=True): - """Clean an HTML fragment and return it + """Clean an HTML fragment of malicious content and return it + + This function is a security-focused function whose sole purpose is to + remove malicious content from a string such that it can be displayed as + content in a web page. + + This function is not designed to use to transform content to be used in + non-web-page contexts. :arg text: the text to clean :arg tags: whitelist of allowed tags; defaults to @@ -139,12 +146,18 @@ class s(BleachSanitizer): def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False, tokenizer=HTMLSanitizer): - """Convert URL-like strings in an HTML fragment to links. + """Convert URL-like strings in an HTML fragment to links + + ``linkify()`` converts strings that look like URLs or domain names in a + blob of text that may be an HTML fragment to links, while preserving: + + 1. links already in the string + 2. urls found in attributes + 3. email addresses + + ``linkify()`` does a best-effort approach and tries to recover from bad + situations due to crazy text. - linkify() converts strings that look like URLs or domain names in a - blob of text that may be an HTML fragment to links, while preserving - (a) links already in the string, (b) urls found in attributes, and - (c) email addresses. 
""" text = force_unicode(text) diff --git a/docs/goals.rst b/docs/goals.rst index d62d54b5..74d0b171 100644 --- a/docs/goals.rst +++ b/docs/goals.rst @@ -6,13 +6,15 @@ This document lists the goals and non-goals of Bleach. My hope is that by focusing on these goals and explicitly listing the non-goals, the project will evolve in a stronger direction. +.. contents:: + Goals ===== -Whitelisting ------------- +Always take a whitelist-based approach +-------------------------------------- Bleach should always take a whitelist-based approach to allowing any kind of content or markup. Blacklisting is error-prone and not future proof. @@ -22,8 +24,8 @@ not blacklist all the other ``on*`` attributes. Future versions of HTML may add new event handlers, like ``ontouch``, that old blacklists would not prevent. -Sanitizing Input ----------------- +Main goal is to sanitize input of malicious content +--------------------------------------------------- The primary goal of Bleach is to sanitize user input that is allowed to contain *some* HTML as markup and is to be included in the content of a larger page. @@ -43,8 +45,8 @@ content, and will use the HTML5 parsing algorithm to handle invalid markup. See the :ref:`chapter on clean() ` for more info. -Safely Creating Links ---------------------- +Safely create cinks +------------------- The secondary goal of Bleach is to provide a mechanism for finding or altering links (```` tags with ``href`` attributes, or things that look like URLs or @@ -62,18 +64,43 @@ Non-Goals Bleach is designed to work with fragments of HTML by untrusted users. Some non-goal use cases include: -* **Sanitizing complete HTML documents.** Once you're creating whole documents, - you have to allow so many tags that a blacklist approach (e.g. 
forbidding - ``& diff --git a/tests/data/1.test.out b/tests/data/1.test.out new file mode 100644 index 00000000..d89228ad --- /dev/null +++ b/tests/data/1.test.out @@ -0,0 +1 @@ +>"><script>alert("XSS")</script>& \ No newline at end of file diff --git a/tests/data/10.test b/tests/data/10.test new file mode 100644 index 00000000..268771bc --- /dev/null +++ b/tests/data/10.test @@ -0,0 +1 @@ + diff --git a/tests/data/10.test.out b/tests/data/10.test.out new file mode 100644 index 00000000..29998a1f --- /dev/null +++ b/tests/data/10.test.out @@ -0,0 +1 @@ +<img src="javascript:alert('XSS');"> \ No newline at end of file diff --git a/tests/data/11.test b/tests/data/11.test new file mode 100644 index 00000000..16a49c70 --- /dev/null +++ b/tests/data/11.test @@ -0,0 +1 @@ + diff --git a/tests/data/11.test.out b/tests/data/11.test.out new file mode 100644 index 00000000..52a02dc6 --- /dev/null +++ b/tests/data/11.test.out @@ -0,0 +1 @@ +<img src="javascript:alert('XSS')"> \ No newline at end of file diff --git a/tests/data/12.test b/tests/data/12.test new file mode 100644 index 00000000..d4b96e6f --- /dev/null +++ b/tests/data/12.test @@ -0,0 +1 @@ + diff --git a/tests/data/12.test.out b/tests/data/12.test.out new file mode 100644 index 00000000..fb0807ae --- /dev/null +++ b/tests/data/12.test.out @@ -0,0 +1 @@ +<img src="JaVaScRiPt:alert('XSS')"> \ No newline at end of file diff --git a/tests/data/13.test b/tests/data/13.test new file mode 100644 index 00000000..07279a83 --- /dev/null +++ b/tests/data/13.test @@ -0,0 +1 @@ +")> diff --git a/tests/data/13.test.out b/tests/data/13.test.out new file mode 100644 index 00000000..1c866507 --- /dev/null +++ b/tests/data/13.test.out @@ -0,0 +1 @@ +<img src="JaVaScRiPt:alert("XSS&lt;WBR">")> \ No newline at end of file diff --git a/tests/data/14.test b/tests/data/14.test new file mode 100644 index 00000000..b704c0b4 --- /dev/null +++ b/tests/data/14.test @@ -0,0 +1 @@ +#115;crip&#116;:a diff --git a/tests/data/14.test.out 
b/tests/data/14.test.out new file mode 100644 index 00000000..16445739 --- /dev/null +++ b/tests/data/14.test.out @@ -0,0 +1 @@ +<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr>#116;:a \ No newline at end of file diff --git a/tests/data/15.test b/tests/data/15.test new file mode 100644 index 00000000..b6a2de6b --- /dev/null +++ b/tests/data/15.test @@ -0,0 +1 @@ +le&#114;t('XS;S')> diff --git a/tests/data/15.test.out b/tests/data/15.test.out new file mode 100644 index 00000000..334f916b --- /dev/null +++ b/tests/data/15.test.out @@ -0,0 +1 @@ +le&<wbr>#114;t('XS<wbr>;S')> \ No newline at end of file diff --git a/tests/data/16.test b/tests/data/16.test new file mode 100644 index 00000000..d66b5921 --- /dev/null +++ b/tests/data/16.test @@ -0,0 +1 @@ +#0000118as&#0000099ri&#0000112t:&#0000097le&#0000114t(&#0000039XS&#0000083')> diff --git a/tests/data/16.test.out b/tests/data/16.test.out new file mode 100644 index 00000000..9c6ca965 --- /dev/null +++ b/tests/data/16.test.out @@ -0,0 +1 @@ +<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr>#0000099ri&<wbr>#0000112t:&<wbr>#0000097le&<wbr>#0000114t(&<wbr>#0000039XS&<wbr>#0000083')> \ No newline at end of file diff --git a/tests/data/17.test b/tests/data/17.test new file mode 100644 index 00000000..6e71b152 --- /dev/null +++ b/tests/data/17.test @@ -0,0 +1 @@ +#x63ript:&#x61lert(&#x27XSS')> diff --git a/tests/data/17.test.out b/tests/data/17.test.out new file mode 100644 index 00000000..dabfaa2d --- /dev/null +++ b/tests/data/17.test.out @@ -0,0 +1 @@ +<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr>#x61lert(&<wbr>#x27XSS')> \ No newline at end of file diff --git a/tests/data/18.test b/tests/data/18.test new file mode 100644 index 00000000..1c173723 --- /dev/null +++ b/tests/data/18.test @@ -0,0 +1 @@ + diff --git a/tests/data/18.test.out b/tests/data/18.test.out new file mode 100644 index 00000000..8046c715 --- /dev/null +++ b/tests/data/18.test.out @@ -0,0 +1 @@ +<img src="jav ascript:alert(&lt;WBR&gt;'XSS');"> 
\ No newline at end of file diff --git a/tests/data/19.test b/tests/data/19.test new file mode 100644 index 00000000..e6e79742 --- /dev/null +++ b/tests/data/19.test @@ -0,0 +1 @@ + diff --git a/tests/data/19.test.out b/tests/data/19.test.out new file mode 100644 index 00000000..8eb8794c --- /dev/null +++ b/tests/data/19.test.out @@ -0,0 +1,2 @@ +<img src="jav +ascript:alert(&lt;WBR&gt;'XSS');"> \ No newline at end of file diff --git a/tests/data/2.test b/tests/data/2.test new file mode 100644 index 00000000..21b93db3 --- /dev/null +++ b/tests/data/2.test @@ -0,0 +1 @@ +"> diff --git a/tests/data/2.test.out b/tests/data/2.test.out new file mode 100644 index 00000000..0b32b6a4 --- /dev/null +++ b/tests/data/2.test.out @@ -0,0 +1 @@ +"><style>@import"javascript:alert('XSS')";</style> \ No newline at end of file diff --git a/tests/data/3.test b/tests/data/3.test new file mode 100644 index 00000000..8dc3a4ee --- /dev/null +++ b/tests/data/3.test @@ -0,0 +1 @@ +>"'> diff --git a/tests/data/3.test.out b/tests/data/3.test.out new file mode 100644 index 00000000..20c3d0d4 --- /dev/null +++ b/tests/data/3.test.out @@ -0,0 +1 @@ +>"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> \ No newline at end of file diff --git a/tests/data/5.test b/tests/data/5.test new file mode 100644 index 00000000..0b03876b --- /dev/null +++ b/tests/data/5.test @@ -0,0 +1 @@ +>%22%27> diff --git a/tests/data/5.test.out b/tests/data/5.test.out new file mode 100644 index 00000000..1782eafb --- /dev/null +++ b/tests/data/5.test.out @@ -0,0 +1 @@ +>%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22> \ No newline at end of file diff --git a/tests/data/7.test b/tests/data/7.test new file mode 100644 index 00000000..827f9b9e --- /dev/null +++ b/tests/data/7.test @@ -0,0 +1 @@ +"> diff --git a/tests/data/7.test.out b/tests/data/7.test.out new file mode 
100644 index 00000000..41fd4322 --- /dev/null +++ b/tests/data/7.test.out @@ -0,0 +1 @@ +"> \ No newline at end of file diff --git a/tests/data/8.test b/tests/data/8.test new file mode 100644 index 00000000..ddf33a96 --- /dev/null +++ b/tests/data/8.test @@ -0,0 +1 @@ +>" diff --git a/tests/data/8.test.out b/tests/data/8.test.out new file mode 100644 index 00000000..bc1ffd44 --- /dev/null +++ b/tests/data/8.test.out @@ -0,0 +1 @@ +>" \ No newline at end of file diff --git a/tests/data/9.test b/tests/data/9.test new file mode 100644 index 00000000..9cf58659 --- /dev/null +++ b/tests/data/9.test @@ -0,0 +1 @@ +'';!--"=&{()} diff --git a/tests/data/9.test.out b/tests/data/9.test.out new file mode 100644 index 00000000..3a4d9b6c --- /dev/null +++ b/tests/data/9.test.out @@ -0,0 +1 @@ +'';!--"<xss>=&{()} \ No newline at end of file diff --git a/tests/test_security.py b/tests/test_security.py index 7ebb25cd..6ffaf449 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -1,5 +1,10 @@ """More advanced security tests""" +import os + +import pytest +import six + from bleach import clean @@ -137,3 +142,36 @@ def test_poster_attribute(): def test_feed_protocol(): assert clean('foo') == 'foo' + + +def get_tests(): + """Retrieves regression tests from data/ directory""" + datadir = os.path.join(os.path.dirname(__file__), 'data') + tests = [ + os.path.join(datadir, fn) for fn in os.listdir(datadir) + if fn.endswith('.test') + ] + # Sort numerically which makes it easier to iterate through them + tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0])) + return tests + + +@pytest.mark.parametrize('fn', get_tests()) +def test_regressions(fn): + """Regression tests for clean so we can see if there are issues""" + s = open(fn, 'r').read() + expected = six.text_type(open(fn + '.out', 'r').read()) + + # NOTE(willkg): This strips input and expected which makes it easier to + # maintain the files. 
If there comes a time when the input needs whitespace + # at the beginning or end, then we'll have to figure out something else. + assert clean(s.strip()) == expected.strip() + + +def test_regression_manually(): + """Regression tests for clean so we can see if there are issues""" + # NOTE(willkg): Have to do this one by hand because of the \r + s = """""" + expected = """<img src="jav\rascript:alert(&lt;WBR&gt;'XSS');">""" + + assert clean(s) == expected From 81e5bc737965fc59590f905107fb09096c4e239c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 Jan 2017 13:00:56 -0500 Subject: [PATCH 043/314] Add tesing for Python 3.6 --- .travis.yml | 1 + CHANGES | 4 ++++ tox.ini | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f88146d1..4e66cf1d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,7 @@ python: - "3.3" - "3.4" - "3.5" +- "3.6" - "pypy" env: - HTML5LIB=0.999 # 3 diff --git a/CHANGES b/CHANGES index c52b5485..ec3bc992 100644 --- a/CHANGES +++ b/CHANGES @@ -9,6 +9,10 @@ Version 2.0 (in development) - Removed support for Python 2.6. #206 - Removed support for Python 3.2. #224 +**Changes** + +- Added testing for Python 3.6. + Version 1.5 (November 4th, 2016) -------------------------------- diff --git a/tox.ini b/tox.ini index 02dde2d3..09ed488f 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py{27,33,34,35}-html5lib{999,999999,9999999},pypy-html5lib9999999 +envlist = py{27,33,34,35,36}-html5lib{999,999999,9999999},pypy-html5lib9999999 [testenv] basepython = @@ -12,6 +12,7 @@ basepython = py33: python3.3 py34: python3.4 py35: python3.5 + py36: python3.6 deps = -rrequirements.txt html5lib999: html5lib==0.999 From c94db9529d47943ddc8ab207108ed0b30f177f87 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 Jan 2017 13:02:03 -0500 Subject: [PATCH 044/314] Add Python 3.6 to classifiers list --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 908928e1..167658ff 100644 --- a/setup.py +++ b/setup.py @@ -68,6 +68,7 @@ def get_version(): 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Topic :: Software Development :: Libraries :: Python Modules', ] ) From ef0c48765c160f5724d915770060a095cbc69dda Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 17 Feb 2017 14:55:55 -0500 Subject: [PATCH 045/314] Update dev requirements --- requirements.txt | 7 ++++--- setup.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index a026c46e..6ec6bd90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,13 @@ -e . 
# Requirements to run the test suite: -pytest==3.0.3 -flake8==3.0.4 +pytest==3.0.6 +pytest-wholenodeid +flake8==3.3.0 tox==2.4.1 # Requirements for building docs -Sphinx==1.4.8 +Sphinx==1.5.2 # Requirements for updating package twine==1.8.1 diff --git a/setup.py b/setup.py index 167658ff..6c627a38 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup_requires.append('pytest-runner>=2.0,<3dev') tests_require = [ - 'pytest==3.0.3', + 'pytest>=3.0.0', ] install_requires = [ From 567eebb53e0716f6d267fa8951c548d63bf78a70 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 23 Jan 2017 17:02:03 -0500 Subject: [PATCH 046/314] Overhaul bleach to use html5lib >= 0.99999999 This is a bleach rewrite to use the new sanitizer API in html5lib 0.99999999. The new API happens as a filter when emitting the tree rather than in the tokenizer. Because of that, the output of .clean() and .linkify() are different than in previous versions of bleach. --- bleach/__init__.py | 53 +++++++----- bleach/sanitizer.py | 189 ++++++++++++++++++++--------------------- setup.py | 5 +- tests/test_basics.py | 8 +- tests/test_css.py | 74 ++++++++++------ tests/test_links.py | 4 +- tests/test_security.py | 24 ++++-- tox.ini | 7 +- 8 files changed, 204 insertions(+), 160 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 09dad637..c54dc72b 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -5,13 +5,14 @@ import re import html5lib -from html5lib.sanitizer import HTMLSanitizer -from html5lib.serializer.htmlserializer import HTMLSerializer +from html5lib.filters import sanitizer +from html5lib.filters.sanitizer import allowed_protocols +from html5lib.serializer import HTMLSerializer -from . 
import callbacks as linkify_callbacks -from .encoding import force_unicode -from .sanitizer import BleachSanitizer -from .version import __version__, VERSION # flake8: noqa +from bleach import callbacks as linkify_callbacks +from bleach.encoding import force_unicode +from bleach.sanitizer import BleachSanitizerFilter +from bleach.version import __version__, VERSION # flake8: noqa __all__ = ['clean', 'linkify'] @@ -60,7 +61,7 @@ # Make sure that .com doesn't get matched by .co first TLDS.reverse() -PROTOCOLS = HTMLSanitizer.acceptable_protocols +PROTOCOLS = allowed_protocols url_re = re.compile( r"""\(* # Match any opening parentheses. @@ -125,21 +126,34 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, text = force_unicode(text) - class s(BleachSanitizer): - allowed_elements = tags - allowed_attributes = attributes - allowed_css_properties = styles - allowed_protocols = protocols - strip_disallowed_elements = strip - strip_html_comments = strip_comments + parser = html5lib.HTMLParser(namespaceHTMLElements=False) + dom = parser.parseFragment(text) - parser = html5lib.HTMLParser(tokenizer=s) + walker = html5lib.getTreeWalker('etree') + filtered = BleachSanitizerFilter( + source=walker(dom), + allowed_attributes_map=attributes, - return _render(parser.parseFragment(text)) + allowed_elements=tags, + allowed_css_properties=styles, + allowed_protocols=protocols, + + allowed_svg_properties=[], + + strip_disallowed_elements=strip, + strip_html_comments=strip_comments + ) + s = HTMLSerializer( + quote_attr_values='always', + alphabetical_attributes=True, + omit_optional_tags=False + ) + return s.render(filtered) def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - parse_email=False, tokenizer=HTMLSanitizer): + # FIXME(willkg): parse_email=False, tokenizer=HTMLSanitizer): + parse_email=False): """Convert URL-like strings in an HTML fragment to links ``linkify()`` converts strings that look like URLs, domain names and email @@ -158,7 +172,8 @@ 
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, if not text: return '' - parser = html5lib.HTMLParser(tokenizer=tokenizer) + # FIXME(willkg): parser = html5lib.HTMLParser(tokenizer=tokenizer) + parser = html5lib.HTMLParser() forest = parser.parseFragment(text) _seen = set([]) @@ -427,7 +442,7 @@ def _render(tree): def _serialize(domtree): walker = html5lib.treewalkers.getTreeWalker('etree') stream = walker(domtree) - serializer = HTMLSerializer(quote_attr_values=True, + serializer = HTMLSerializer(quote_attr_values='always', alphabetical_attributes=True, omit_optional_tags=False) return serializer.render(stream) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index eec6659b..fb502b85 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -2,118 +2,125 @@ import re from xml.sax.saxutils import escape, unescape -from html5lib.constants import tokenTypes -from html5lib.sanitizer import HTMLSanitizerMixin -from html5lib.tokenizer import HTMLTokenizer +from html5lib.constants import namespaces +from html5lib.filters import sanitizer -PROTOS = HTMLSanitizerMixin.acceptable_protocols -PROTOS.remove('feed') +class BleachSanitizerFilter(sanitizer.Filter): + def __init__(self, source, allowed_attributes_map, + strip_disallowed_elements=False, strip_html_comments=True, + **kwargs): + if isinstance(allowed_attributes_map, dict): + self.wildcard_attributes = allowed_attributes_map.get('*', []) + self.allowed_attributes_map = allowed_attributes_map + else: + self.wildcard_attributes = allowed_attributes_map + self.allowed_attributes_map = {} -class BleachSanitizerMixin(HTMLSanitizerMixin): - """Mixin to replace sanitize_token() and sanitize_css().""" + self.strip_disallowed_elements = strip_disallowed_elements + self.strip_html_comments = strip_html_comments - allowed_svg_properties = [] + return super(BleachSanitizerFilter, self).__init__(source, **kwargs) def sanitize_token(self, token): """Sanitize a token either by HTML-encoding or dropping. 
- Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be - a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}. + Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag': + ['attribute', 'pairs'], 'tag': callable}. - Here callable is a function with two arguments of attribute name - and value. It should return true of false. + Here callable is a function with two arguments of attribute name and + value. It should return true of false. Also gives the option to strip tags instead of encoding. """ - if (getattr(self, 'wildcard_attributes', None) is None and - isinstance(self.allowed_attributes, dict)): - self.wildcard_attributes = self.allowed_attributes.get('*', []) - - if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'], - tokenTypes['EmptyTag']): + token_type = token['type'] + if token_type in ['StartTag', 'EndTag', 'EmptyTag']: if token['name'] in self.allowed_elements: - if 'data' in token: - if isinstance(self.allowed_attributes, dict): - allowed_attributes = self.allowed_attributes.get( - token['name'], []) - if not callable(allowed_attributes): - allowed_attributes += self.wildcard_attributes - else: - allowed_attributes = self.allowed_attributes - attrs = dict([(name, val) for name, val in - token['data'][::-1] - if (allowed_attributes(name, val) - if callable(allowed_attributes) - else name in allowed_attributes)]) - for attr in self.attr_val_is_uri: - if attr not in attrs: - continue - val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', - unescape(attrs[attr])).lower() - # Remove replacement characters from unescaped - # characters. 
- val_unescaped = val_unescaped.replace("\ufffd", "") - if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) - and (val_unescaped.split(':')[0] not in - self.allowed_protocols)): - del attrs[attr] - for attr in self.svg_attr_val_allows_ref: - if attr in attrs: - attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', - ' ', - unescape(attrs[attr])) - if (token['name'] in self.svg_allow_local_href and - 'xlink:href' in attrs and - re.search(r'^\s*[^#\s].*', attrs['xlink:href'])): - del attrs['xlink:href'] - if 'style' in attrs: - attrs['style'] = self.sanitize_css(attrs['style']) - token['data'] = [(name, val) for name, val in - attrs.items()] - return token + return self.allow_token(token) + elif self.strip_disallowed_elements: pass + else: - if token['type'] == tokenTypes['EndTag']: - token['data'] = ''.format(token['name']) - elif token['data']: - attr = ' {0!s}="{1!s}"' - attrs = ''.join([attr.format(k, escape(v)) for k, v in - token['data']]) - token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs) - else: - token['data'] = '<{0!s}>'.format(token['name']) - if token['selfClosing']: - token['data'] = token['data'][:-1] + '/>' - token['type'] = tokenTypes['Characters'] - del token["name"] - return token - elif token['type'] == tokenTypes['Comment']: + return self.disallowed_token(token) + + elif token_type == 'Comment': if not self.strip_html_comments: return token + else: return token - def sanitize_css(self, style): - """HTMLSanitizerMixin.sanitize_css replacement. - - HTMLSanitizerMixin.sanitize_css always whitelists background-*, - border-*, margin-*, and padding-*. We only whitelist what's in - the whitelist. 
+ def allow_token(self, token): + if 'data' in token: + allowed_attributes = self.allowed_attributes_map.get(token['name'], []) + if not callable(allowed_attributes): + allowed_attributes += self.wildcard_attributes + + # Drop any attributes that aren't allowed + attrs = {} + for namespaced_name, val in token['data'].items(): + namespace, name = namespaced_name + # FIXME(willkg): "name" used to be something like "xlink:href" + # but it's now (namespace['xlink'], 'href'). we should fix the + # name here so it's what the callable would expect. + if callable(allowed_attributes): + if allowed_attributes(name, val): + attrs[namespaced_name] = val + + elif name in allowed_attributes: + attrs[namespaced_name] = val + + # Go through all the uri-type attributes + for attr in self.attr_val_is_uri: + if attr not in attrs: + continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', + unescape(attrs[attr])).lower() + # Remove replacement characters from unescaped characters. + val_unescaped = val_unescaped.replace("\ufffd", "") + + if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and + (val_unescaped.split(':')[0] not in self.allowed_protocols)): + # It has a protocol, but it's not allowed--so drop it + del attrs[attr] + + # FIXME(willkg): is this right? + for attr in self.svg_attr_val_allows_ref: + if attr in attrs: + attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + + # FIXME(willkg): is this right? 
+ if (token['name'] in self.svg_allow_local_href and + (namespace['xlink'], 'href') in attrs and + re.search(r'^\s*[^#\s].*', attrs[(namespace['xlink'], 'href')])): + del attrs[(namespace['xlink'], 'href')] + + # Sanitize css in style attribute + if (None, u'style') in attrs: + attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')]) + + token['data'] = attrs + return token - """ + def sanitize_css(self, style): + """html5lib sanitizer filter replacement to fix issues""" # disallow urls style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) # gauntlet - # TODO: Make sure this does what it's meant to - I *think* it wants to - # validate style attribute contents. + + # Validate the css in the style tag and if it's not valid, then drop + # the whole thing. parts = style.split(';') - gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'""" - """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""") + gauntlet = re.compile( + r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""" + ) + for part in parts: if not gauntlet.match(part): return '' @@ -125,23 +132,11 @@ def sanitize_css(self, style): for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style): if not value: continue + if prop.lower() in self.allowed_css_properties: clean.append(prop + ': ' + value + ';') + elif prop.lower() in self.allowed_svg_properties: clean.append(prop + ': ' + value + ';') return ' '.join(clean) - - -class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin): - def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, - lowercaseElementName=True, lowercaseAttrName=True, **kwargs): - HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet, - lowercaseElementName, lowercaseAttrName, - **kwargs) - - def __iter__(self): - for token in HTMLTokenizer.__iter__(self): - token = self.sanitize_token(token) - if token: - yield token diff --git a/setup.py b/setup.py index 6c627a38..39fbb370 100644 --- a/setup.py +++ 
b/setup.py @@ -15,9 +15,8 @@ install_requires = [ 'six', - # 3 9s up to but not including 8 9s, but not 4 9s or 5 9s because they're - # busted - 'html5lib>=0.999,!=0.9999,!=0.99999,<0.99999999', + # >= 8 9s because of breaking API change + 'html5lib>=0.99999999', ] diff --git a/tests/test_basics.py b/tests/test_basics.py index 07d4d918..8e293ca6 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -1,5 +1,6 @@ -import six import html5lib +import pytest +import six import bleach @@ -234,6 +235,7 @@ def test_wildcard_attributes(): assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean +@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.""" dirty = 'Yeah right ' @@ -242,8 +244,8 @@ def test_sarcasm(): def test_user_defined_protocols_valid(): - valid_href = 'allowed href' - assert bleach.clean(valid_href, protocols=['my_protocol']) == valid_href + valid_href = 'allowed href' + assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href def test_user_defined_protocols_invalid(): diff --git a/tests/test_css.py b/tests/test_css.py index 0b92f40b..3d224fac 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -8,7 +8,7 @@ clean = partial(clean, tags=['p'], attributes=['style']) -@pytest.mark.parametrize('data,styles,expected', [ +@pytest.mark.parametrize('data, styles, expected', [ ( 'font-family: Arial; color: red; float: left; background-color: red;', ['color'], @@ -91,24 +91,40 @@ def test_valid_css(): def test_style_hang(): """The sanitizer should not hang on any inline styles""" - # TODO: Neaten this up. 
It's copypasta from MDN/Kuma to repro the bug - style = ("""margin-top: 0px; margin-right: 0px; margin-bottom: 1.286em; """ - """margin-left: 0px; padding-top: 15px; padding-right: 15px; """ - """padding-bottom: 15px; padding-left: 15px; border-top-width: """ - """1px; border-right-width: 1px; border-bottom-width: 1px; """ - """border-left-width: 1px; border-top-style: dotted; """ - """border-right-style: dotted; border-bottom-style: dotted; """ - """border-left-style: dotted; border-top-color: rgb(203, 200, """ - """185); border-right-color: rgb(203, 200, 185); """ - """border-bottom-color: rgb(203, 200, 185); border-left-color: """ - """rgb(203, 200, 185); background-image: initial; """ - """background-attachment: initial; background-origin: initial; """ - """background-clip: initial; background-color: """ - """rgb(246, 246, 242); overflow-x: auto; overflow-y: auto; """ - """font: normal normal normal 100%/normal 'Courier New', """ - """'Andale Mono', monospace; background-position: initial """ - """initial; background-repeat: initial initial;""") - html = '

Hello world

'.format(style) + style = [ + 'margin-top: 0px;', + 'margin-right: 0px;', + 'margin-bottom: 1.286em;', + 'margin-left: 0px;', + 'padding-top: 15px;', + 'padding-right: 15px;', + 'padding-bottom: 15px;', + 'padding-left: 15px;', + 'border-top-width: 1px;', + 'border-right-width: 1px;', + 'border-bottom-width: 1px;', + 'border-left-width: 1px;', + 'border-top-style: dotted;', + 'border-right-style: dotted;', + 'border-bottom-style: dotted;', + 'border-left-style: dotted;', + 'border-top-color: rgb(203, 200, 185);', + 'border-right-color: rgb(203, 200, 185);', + 'border-bottom-color: rgb(203, 200, 185);', + 'border-left-color: rgb(203, 200, 185);', + 'background-image: initial;', + 'background-attachment: initial;', + 'background-origin: initial;', + 'background-clip: initial;', + 'background-color: rgb(246, 246, 242);', + 'overflow-x: auto;', + 'overflow-y: auto;', + # FIXME(willkg): This fails the first regxp gauntlet in sanitize_css. + # 'font: italic small-caps bolder condensed 16px/3 cursive;', + 'background-position: initial initial;', + 'background-repeat: initial initial;' + ] + html = '

Hello world

' % ' '.join(style) styles = [ 'border', 'float', 'overflow', 'min-height', 'vertical-align', 'white-space', @@ -120,12 +136,18 @@ def test_style_hang(): 'font', 'font-size', 'font-weight', 'text-align', 'text-transform', ] - expected = ("""

""" - """Hello world

""") + expected = ( + '

Hello world

' + ) assert clean(html, styles=styles) == expected diff --git a/tests/test_links.py b/tests/test_links.py index ac38ee70..6b7a77eb 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -3,7 +3,7 @@ except ImportError: from urllib import quote_plus -from html5lib.tokenizer import HTMLTokenizer +# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC @@ -406,6 +406,7 @@ def test_end_of_clause(): ) +@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.""" assert linkify('Yeah right ') == 'Yeah right <sarcasm/>' @@ -498,6 +499,7 @@ def test_ports(data, expected_data): assert linkify(data) == out.format(*expected_data) +@pytest.mark.xfail(reason='html5lib >= 0.99999999: no access to tokenizer') def test_tokenizer(): """Linkify doesn't always have to sanitize.""" raw = 'test' diff --git a/tests/test_security.py b/tests/test_security.py index 6ffaf449..4fb30207 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -74,7 +74,9 @@ def test_invalid_href_attr(): def test_invalid_filter_attr(): IMG = ['img', ] - IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"} + IMG_ATTR = { + 'img': lambda n, v: n == 'src' and v == "http://example.com/" + } assert ( clean('', tags=IMG, attributes=IMG_ATTR) == @@ -145,7 +147,11 @@ def test_feed_protocol(): def get_tests(): - """Retrieves regression tests from data/ directory""" + """Retrieves regression tests from data/ directory + + :returns: list of ``(filename, filedata)`` tuples + + """ datadir = os.path.join(os.path.dirname(__file__), 'data') tests = [ os.path.join(datadir, fn) for fn in os.listdir(datadir) @@ -153,19 +159,23 @@ def get_tests(): ] # Sort numerically which makes it easier to iterate through them tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0])) - return tests + + testcases = [ + (fn, open(fn, 'r').read()) for fn in tests + ] 
+ + return testcases -@pytest.mark.parametrize('fn', get_tests()) -def test_regressions(fn): +@pytest.mark.parametrize('fn, text', get_tests()) +def test_regressions(fn, text): """Regression tests for clean so we can see if there are issues""" - s = open(fn, 'r').read() expected = six.text_type(open(fn + '.out', 'r').read()) # NOTE(willkg): This strips input and expected which makes it easier to # maintain the files. If there comes a time when the input needs whitespace # at the beginning or end, then we'll have to figure out something else. - assert clean(s.strip()) == expected.strip() + assert clean(text.strip()) == expected.strip() def test_regression_manually(): diff --git a/tox.ini b/tox.ini index 09ed488f..53c175c9 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py{27,33,34,35,36}-html5lib{999,999999,9999999},pypy-html5lib9999999 +envlist = py{27,33,34,35,36}-html5lib{99999999,999999999},pypy-html5lib99999999 [testenv] basepython = @@ -15,8 +15,7 @@ basepython = py36: python3.6 deps = -rrequirements.txt - html5lib999: html5lib==0.999 - html5lib999999: html5lib==0.999999 - html5lib9999999: html5lib==0.9999999 + html5lib99999999: html5lib==0.99999999 + html5lib999999999: html5lib==0.999999999 commands = py.test {posargs:-v} From 3db588a5ed8b43da8324547b23ad5670b459ee9e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 11:45:01 -0500 Subject: [PATCH 047/314] Update security regression tests I also moved test_nasty to the regression tests because it's just like those. 
--- tests/data/14.test.out | 2 +- tests/data/15.test.out | 2 +- tests/data/16.test.out | 2 +- tests/data/17.test.out | 2 +- tests/data/3.test.out | 2 +- tests/data/4.test | 1 + tests/data/4.test.out | 1 + tests/data/5.test.out | 2 +- tests/data/9.test.out | 2 +- tests/test_security.py | 35 +++++++++++++++++------------------ 10 files changed, 26 insertions(+), 25 deletions(-) create mode 100644 tests/data/4.test create mode 100644 tests/data/4.test.out diff --git a/tests/data/14.test.out b/tests/data/14.test.out index 16445739..8e5ff754 100644 --- a/tests/data/14.test.out +++ b/tests/data/14.test.out @@ -1 +1 @@ -<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr>#116;:a \ No newline at end of file +<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=&#106;&#97;&#118;&#97;&<wbr> \ No newline at end of file diff --git a/tests/data/15.test.out b/tests/data/15.test.out index 334f916b..8b90245f 100644 --- a/tests/data/15.test.out +++ b/tests/data/15.test.out @@ -1 +1 @@ -le&<wbr>#114;t('XS<wbr>;S')> \ No newline at end of file +le&<wbr></wbr>#114;t('XS<wbr></wbr>;S')> \ No newline at end of file diff --git a/tests/data/16.test.out b/tests/data/16.test.out index 9c6ca965..1ecb332b 100644 --- a/tests/data/16.test.out +++ b/tests/data/16.test.out @@ -1 +1 @@ -<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr>#0000099ri&<wbr>#0000112t:&<wbr>#0000097le&<wbr>#0000114t(&<wbr>#0000039XS&<wbr>#0000083')> \ No newline at end of file +<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr></wbr>#0000099ri&<wbr></wbr>#0000112t:&<wbr></wbr>#0000097le&<wbr></wbr>#0000114t(&<wbr></wbr>#0000039XS&<wbr></wbr>#0000083')></imgsrc=&#0000106&#0000097&<wbr> \ No newline at end of file diff --git a/tests/data/17.test.out b/tests/data/17.test.out index dabfaa2d..ae928a99 100644 --- a/tests/data/17.test.out +++ b/tests/data/17.test.out @@ -1 +1 @@ -<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr>#x61lert(&<wbr>#x27XSS')> \ No newline at end of file 
+<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr></wbr>#x61lert(&<wbr></wbr>#x27XSS')></imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr> \ No newline at end of file diff --git a/tests/data/3.test.out b/tests/data/3.test.out index 20c3d0d4..f0d69629 100644 --- a/tests/data/3.test.out +++ b/tests/data/3.test.out @@ -1 +1 @@ ->"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> \ No newline at end of file +>"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)></img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> \ No newline at end of file diff --git a/tests/data/4.test b/tests/data/4.test new file mode 100644 index 00000000..c4cf51cd --- /dev/null +++ b/tests/data/4.test @@ -0,0 +1 @@ +ipt type="text/javascript">alert("foo");script> diff --git a/tests/data/4.test.out b/tests/data/4.test.out new file mode 100644 index 00000000..88ea86b2 --- /dev/null +++ b/tests/data/4.test.out @@ -0,0 +1 @@ +<scr<script>ipt type="text/javascript">alert("foo");script<del></del>></scr<script> diff --git a/tests/data/5.test.out b/tests/data/5.test.out index 1782eafb..0d88a88a 100644 --- a/tests/data/5.test.out +++ b/tests/data/5.test.out @@ -1 +1 @@ ->%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22> \ No newline at end of file +>%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22></img%20src%3d%22javascript:alert(%27%20xss%27)%22> \ No newline at end of file diff --git a/tests/data/9.test.out b/tests/data/9.test.out index 3a4d9b6c..5c5eb6ba 100644 --- a/tests/data/9.test.out +++ b/tests/data/9.test.out @@ -1 +1 @@ -'';!--"<xss>=&{()} \ No newline at end 
of file +'';!--"<xss>=&{()}</xss> \ No newline at end of file diff --git a/tests/test_security.py b/tests/test_security.py index 4fb30207..356b1292 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -22,7 +22,7 @@ def test_nested_script_tag(): def test_nested_script_tag_r(): assert ( clean('>evil()>') == - '<script<script>>evil()</script<>>' + '<script<script>>evil()></script<script>' ) @@ -90,8 +90,11 @@ def test_invalid_filter_attr(): def test_invalid_tag_char(): assert ( - clean('') == - '<script xss="" src="http://xx.com/xss.js"></script>' + clean('') in + [ + '<script src="http://xx.com/xss.js" xss=""></script>', + '<script xss="" src="http://xx.com/xss.js"></script>' + ] ) assert ( clean('') == @@ -102,15 +105,21 @@ def test_invalid_tag_char(): def test_unclosed_tag(): assert ( clean('ipt type="text/javascript">alert("foo");script>') - expect = ('<scr<script></script>ipt type="text/javascript"' - '>alert("foo");</script>script<del></del>' - '>') - assert clean(test) == expect - - def test_poster_attribute(): """Poster attributes should not allow javascript.""" tags = ['video'] From 02facd34b02cd9d9e16b547e07b07ddd3db40455 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 13:23:00 -0500 Subject: [PATCH 048/314] Minor code cleanup * address FIXMEs * minor cleanup to code and comments --- bleach/__init__.py | 8 ++++---- bleach/sanitizer.py | 14 ++++++-------- tests/test_basics.py | 15 ++++++++++----- tests/test_css.py | 6 +++--- tests/test_links.py | 9 --------- tests/test_security.py | 2 +- 6 files changed, 24 insertions(+), 30 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index c54dc72b..d1a82cde 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -120,9 +120,11 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, :arg strip: whether or not to strip disallowed elements :arg strip_comments: whether or not to strip HTML comments + :returns: cleaned text as unicode + """ if not 
text: - return '' + return u'' text = force_unicode(text) @@ -152,7 +154,6 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - # FIXME(willkg): parse_email=False, tokenizer=HTMLSanitizer): parse_email=False): """Convert URL-like strings in an HTML fragment to links @@ -170,9 +171,8 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, text = force_unicode(text) if not text: - return '' + return u'' - # FIXME(willkg): parser = html5lib.HTMLParser(tokenizer=tokenizer) parser = html5lib.HTMLParser() forest = parser.parseFragment(text) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index fb502b85..0701def2 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -63,9 +63,7 @@ def allow_token(self, token): attrs = {} for namespaced_name, val in token['data'].items(): namespace, name = namespaced_name - # FIXME(willkg): "name" used to be something like "xlink:href" - # but it's now (namespace['xlink'], 'href'). we should fix the - # name here so it's what the callable would expect. + if callable(allowed_attributes): if allowed_attributes(name, val): attrs[namespaced_name] = val @@ -73,12 +71,14 @@ def allow_token(self, token): elif name in allowed_attributes: attrs[namespaced_name] = val - # Go through all the uri-type attributes + # Handle attributes that have uri values for attr in self.attr_val_is_uri: if attr not in attrs: continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower() + # Remove replacement characters from unescaped characters. val_unescaped = val_unescaped.replace("\ufffd", "") @@ -87,17 +87,15 @@ def allow_token(self, token): # It has a protocol, but it's not allowed--so drop it del attrs[attr] - # FIXME(willkg): is this right? for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', unescape(attrs[attr])) - # FIXME(willkg): is this right? 
if (token['name'] in self.svg_allow_local_href and - (namespace['xlink'], 'href') in attrs and - re.search(r'^\s*[^#\s].*', attrs[(namespace['xlink'], 'href')])): + (namespaces['xlink'], 'href') in attrs and + re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): del attrs[(namespace['xlink'], 'href')] # Sanitize css in style attribute diff --git a/tests/test_basics.py b/tests/test_basics.py index 8e293ca6..49148592 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -71,12 +71,17 @@ def test_function_arguments(): def test_named_arguments(): ATTRS = {'a': ['rel', 'href']} - s = ('xx.com', - 'xx.com') - assert bleach.clean(s[0]) == 'xx.com' - # FIXME: This might not be needed if attribute order is stable now. - assert bleach.clean(s[0], attributes=ATTRS) in s + text = 'xx.com' + + assert bleach.clean(text) == 'xx.com' + assert ( + bleach.clean(text, attributes=ATTRS) in + [ + 'xx.com', + 'xx.com' + ] + ) def test_disallowed_html(): diff --git a/tests/test_css.py b/tests/test_css.py index 3d224fac..d8880d78 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -119,8 +119,7 @@ def test_style_hang(): 'background-color: rgb(246, 246, 242);', 'overflow-x: auto;', 'overflow-y: auto;', - # FIXME(willkg): This fails the first regxp gauntlet in sanitize_css. - # 'font: italic small-caps bolder condensed 16px/3 cursive;', + 'font: italic small-caps bolder condensed 16px/3 cursive;', 'background-position: initial initial;', 'background-repeat: initial initial;' ] @@ -146,7 +145,8 @@ def test_style_hang(): 'padding-right: 15px; ' 'padding-bottom: 15px; ' 'padding-left: 15px; ' - 'background-color: rgb(246, 246, 242);' + 'background-color: rgb(246, 246, 242); ' + 'font: italic small-caps bolder condensed 16px/3 cursive;' '">Hello world

' ) diff --git a/tests/test_links.py b/tests/test_links.py index 6b7a77eb..53d60e5c 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -3,7 +3,6 @@ except ImportError: from urllib import quote_plus -# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC @@ -499,14 +498,6 @@ def test_ports(data, expected_data): assert linkify(data) == out.format(*expected_data) -@pytest.mark.xfail(reason='html5lib >= 0.99999999: no access to tokenizer') -def test_tokenizer(): - """Linkify doesn't always have to sanitize.""" - raw = 'test' - assert linkify(raw) == 'test<x></x>' - assert linkify(raw, tokenizer=HTMLTokenizer) == raw - - def test_ignore_bad_protocols(): assert ( linkify('foohttp://bar') == diff --git a/tests/test_security.py b/tests/test_security.py index 356b1292..2aac0200 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -75,7 +75,7 @@ def test_invalid_href_attr(): def test_invalid_filter_attr(): IMG = ['img', ] IMG_ATTR = { - 'img': lambda n, v: n == 'src' and v == "http://example.com/" + 'img': lambda attr, val: attr == 'src' and val == "http://example.com/" } assert ( From 10852231012ae3eece9c0a2af5c6c7c8e2e5212f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 15:37:00 -0500 Subject: [PATCH 049/314] Update CHANGES and docs --- CHANGES | 19 +++++++++++++++---- README.rst | 2 -- docs/clean.rst | 5 ++++- docs/conf.py | 3 +-- docs/dev.rst | 4 ++-- docs/goals.rst | 4 ++-- docs/linkify.rst | 17 +---------------- 7 files changed, 25 insertions(+), 29 deletions(-) diff --git a/CHANGES b/CHANGES index ec3bc992..e8e49a8d 100644 --- a/CHANGES +++ b/CHANGES @@ -8,10 +8,19 @@ Version 2.0 (in development) - Removed support for Python 2.6. #206 - Removed support for Python 3.2. #224 +- Bleach no longer supports html5lib < 0.99999999 (8 9s). 
+ + This version represents a rewrite to use the new sanitizing API since + the old one was dropped in html5lib 0.99999999 (8 9s). + +- linkify no longer accepts a tokenizer argument. +- clean output is different than in previous versions; particularly this version + will add end tags even if the tag will be escaped. **Changes** -- Added testing for Python 3.6. +- Supports Python 3.6. +- Supports html5lib >= 0.99999999 (8 9s). Version 1.5 (November 4th, 2016) @@ -20,9 +29,11 @@ Version 1.5 (November 4th, 2016) **Backwards incompatible changes** - clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and - mailto. Previously it was a long list of protocols something like ed2k, ftp, - http, https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, - feed, urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149 + mailto. + + Previously it was a long list of protocols something like ed2k, ftp, http, + https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, feed, + urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149 **Changes** diff --git a/README.rst b/README.rst index 8f9ce05d..3bd87573 100644 --- a/README.rst +++ b/README.rst @@ -101,5 +101,3 @@ The simplest way to use Bleach is: .. _GitHub: https://github.com/mozilla/bleach .. _ReadTheDocs: https://bleach.readthedocs.io/ .. _PyPI: http://pypi.python.org/pypi/bleach - - diff --git a/docs/clean.rst b/docs/clean.rst index ebd82055..a988a81a 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -162,7 +162,7 @@ For example, this sets allowed protocols to http, https and smb: u'allowed protocol' -This adds smb to the bleach-specified set of allowed protocols: +This adds smb to the Bleach-specified set of allowed protocols: .. doctest:: @@ -187,6 +187,7 @@ whitelist and invalid markup. For example: .. 
doctest:: >>> import bleach + >>> bleach.clean('is not allowed') u'<span>is not allowed</span>' >>> bleach.clean('is not allowed', tags=['b']) @@ -199,6 +200,7 @@ If you would rather Bleach stripped this markup entirely, you can pass .. doctest:: >>> import bleach + >>> bleach.clean('is not allowed', strip=True) u'is not allowed' >>> bleach.clean('is not allowed', tags=['b'], strip=True) @@ -214,6 +216,7 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set .. doctest:: >>> import bleach + >>> html = 'my html' >>> bleach.clean(html) diff --git a/docs/conf.py b/docs/conf.py index 00b9c239..e186c827 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,8 +27,7 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.pngmath', 'sphinx.ext.viewcode', - 'sphinx.ext.doctest'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.doctest'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/dev.rst b/docs/dev.rst index 027a0a76..02f8d44a 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -5,7 +5,7 @@ Bleach development Docs ==== -Docs are in ``docs/``. We use Sphinx. Docs are pushed to readthedocs +Docs are in ``docs/``. We use Sphinx. Docs are pushed to ReadTheDocs via a GitHub webhook. @@ -16,7 +16,7 @@ Run:: $ tox -That'll run bleach tests in all the supported Python environments. Note +That'll run Bleach tests in all the supported Python environments. Note that you need the necessary Python binaries for them all to be tested. Tests are run in Travis CI via a GitHub webhook. 
diff --git a/docs/goals.rst b/docs/goals.rst index 01f63a94..632c222c 100644 --- a/docs/goals.rst +++ b/docs/goals.rst @@ -91,10 +91,10 @@ Make malicious content look pretty or sane ------------------------------------------ Malicious content is designed to be malicious. Making it safe is a design goal -of bleach. Making it pretty or sane-looking is not. +of Bleach. Making it pretty or sane-looking is not. If you want your malicious content to look pretty, you should pass it through -bleach to make it safe and then do your own transform afterwards. +Bleach to make it safe and then do your own transform afterwards. Allow arbitrary styling diff --git a/docs/linkify.rst b/docs/linkify.rst index b7449c34..705000c2 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -9,7 +9,7 @@ control how and when those links are rendered:: def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - parse_email=False, tokenizer=HTMLSanitizer): + parse_email=False): """Convert URL-like strings in an HTML fragment to links. ``linkify()`` works by building a document tree, so it's guaranteed never to do @@ -194,19 +194,4 @@ they are newly created or already in the text, so be careful when writing callbacks that may need to behave differently if the protocol is ``mailto:``. -``tokenizer`` -============= - -``linkify()`` uses the ``html5lib.sanitizer.HTMLSanitizer`` tokenizer by -default. This has the effect of scrubbing some tags and attributes. To use a -more lenient, or totally different, tokenizer, you can specify the tokenizer -class here. (See the implementation of :ref:`clean() ` for an -example of building a custom tokenizer.) - -:: - - from html5lib.tokenizer import HTMLTokenizer - linked_text = linkify(text, tokenizer=HTMLTokenizer) - - .. 
_Crate: https://crate.io/ From 066631af96ee9c16ddcf9d132bd8537a4af17da6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 20:49:21 -0500 Subject: [PATCH 050/314] Update .travis.yml --- .travis.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4e66cf1d..318dfa7d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,9 +11,8 @@ python: - "3.6" - "pypy" env: -- HTML5LIB=0.999 # 3 -- HTML5LIB=0.999999 # 6 -- HTML5LIB=0.9999999 # 7 +- HTML5LIB=0.99999999 # 8 +- HTML5LIB=0.999999999 # 9 install: - pip install -r requirements.txt - pip install html5lib==$HTML5LIB From 30772dd20f16a8ec75f7c1ddb7b76c9ff6fd97d2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 20:57:13 -0500 Subject: [PATCH 051/314] Another .travis.yml fix --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 318dfa7d..a5db65b1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ env: - HTML5LIB=0.99999999 # 8 - HTML5LIB=0.999999999 # 9 install: +- pip install -U pip - pip install -r requirements.txt - pip install html5lib==$HTML5LIB script: From 85cc802584f4290556b14e50384c2ed6bb6959b6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 21:05:59 -0500 Subject: [PATCH 052/314] One more .travis.yml fix --- .travis.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a5db65b1..14015378 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,9 +14,11 @@ env: - HTML5LIB=0.99999999 # 8 - HTML5LIB=0.999999999 # 9 install: -- pip install -U pip -- pip install -r requirements.txt -- pip install html5lib==$HTML5LIB + # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5 + - pip install -U pip setuptools>=18.5 + - pip install -r requirements.txt + # stomp on html5lib install with the specified one + - pip install html5lib==$HTML5LIB script: - py.test - flake8 bleach/ From 
4cca43b5f41a4cc157fa2e6f5dd5185f70cb7a6c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 21:10:40 -0500 Subject: [PATCH 053/314] Fix flake8 issues --- bleach/sanitizer.py | 4 ++-- setup.cfg | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 0701def2..68438e9a 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals import re -from xml.sax.saxutils import escape, unescape +from xml.sax.saxutils import unescape from html5lib.constants import namespaces from html5lib.filters import sanitizer @@ -83,7 +83,7 @@ def allow_token(self, token): val_unescaped = val_unescaped.replace("\ufffd", "") if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and - (val_unescaped.split(':')[0] not in self.allowed_protocols)): + (val_unescaped.split(':')[0] not in self.allowed_protocols)): # It has a protocol, but it's not allowed--so drop it del attrs[attr] diff --git a/setup.cfg b/setup.cfg index f3a416e4..950364a7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,6 +3,7 @@ test=pytest [flake8] ignore = E731,W503 +max-line-length = 100 [wheel] universal=1 From 49f35dd27317019e47febed1c0c507177297a8af Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 12:21:38 -0500 Subject: [PATCH 054/314] Add tests, fix alphabetizing, code cleanup * this adds some missing tests to add more coverage * html5lib 0.99999999 and 0.999999999 have an alphabeticalattributes filter that doesn't work when the attributes set has some items with a namespace and some without in Python 3; this rolls alphabetizing into the Bleach sanitizer * remove some dead code and clean some other code up --- bleach/__init__.py | 17 ++++----- bleach/sanitizer.py | 43 +++++++++++++++++----- tests/test_basics.py | 86 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 17 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 
d1a82cde..ae96a925 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -61,8 +61,6 @@ # Make sure that .com doesn't get matched by .co first TLDS.reverse() -PROTOCOLS = allowed_protocols - url_re = re.compile( r"""\(* # Match any opening parentheses. \b(?"]*)? # /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) - """.format('|'.join(PROTOCOLS), '|'.join(TLDS)), + """.format('|'.join(allowed_protocols), '|'.join(TLDS)), re.IGNORECASE | re.VERBOSE | re.UNICODE) proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) @@ -87,8 +85,6 @@ """, re.IGNORECASE | re.MULTILINE | re.VERBOSE) -NODE_TEXT = 4 # The numeric ID of a text node in simpletree. - ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) # a simple routine that returns the tag name with the namespace prefix # as returned by etree's Element.tag attribute @@ -147,8 +143,13 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, ) s = HTMLSerializer( quote_attr_values='always', - alphabetical_attributes=True, - omit_optional_tags=False + omit_optional_tags=False, + + # Bleach has its own sanitizer, so don't use the html5lib one + sanitize=False, + + # Bleach sanitizer alphabetizes already, so don't use the html5lib one + alphabetical_attributes=False, ) return s.render(filtered) @@ -176,7 +177,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parser = html5lib.HTMLParser() forest = parser.parseFragment(text) - _seen = set([]) + _seen = set() def replace_nodes(tree, new_frag, node, index=0): """Doesn't really replace nodes, but inserts the nodes contained in diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 68438e9a..c12b8d24 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +from collections import OrderedDict import re from xml.sax.saxutils import unescape @@ -6,6 +7,19 @@ from html5lib.filters import sanitizer +def _attr_key(attr): + 
"""Returns appropriate key for sorting attribute names + + Attribute names are a tuple of ``(namespace, name)`` where namespace can be + ``None`` or a string. These can't be compared in Python 3, so we conver the + ``None`` to an empty string. + + """ + key = (attr[0][0] or ''), attr[0][1] + print(key) + return key + + class BleachSanitizerFilter(sanitizer.Filter): def __init__(self, source, allowed_attributes_map, strip_disallowed_elements=False, strip_html_comments=True, @@ -87,22 +101,33 @@ def allow_token(self, token): # It has a protocol, but it's not allowed--so drop it del attrs[attr] + # Drop values in svg attrs with non-local IRIs for attr in self.svg_attr_val_allows_ref: if attr in attrs: - attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', - ' ', - unescape(attrs[attr])) - - if (token['name'] in self.svg_allow_local_href and - (namespaces['xlink'], 'href') in attrs and - re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): - del attrs[(namespace['xlink'], 'href')] + new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + new_val = new_val.strip() + if not new_val: + del attrs[attr] + else: + attrs[attr] = new_val + + # Drop href and xlink:href attr for svg elements with non-local IRIs + if (None, token['name']) in self.svg_allow_local_href: + for href_attr in [(None, 'href'), (namespaces['xlink'], 'href')]: + if href_attr in attrs: + if re.search(r'^\s*[^#\s]', attrs[href_attr]): + del attrs[href_attr] # Sanitize css in style attribute if (None, u'style') in attrs: attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')]) - token['data'] = attrs + # Alphabetize attributes + token['data'] = OrderedDict( + [(key, val) for key, val in sorted(attrs.items(), key=_attr_key)] + ) return token def sanitize_css(self, style): diff --git a/tests/test_basics.py b/tests/test_basics.py index 49148592..bae1506e 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -240,6 +240,92 @@ def 
test_wildcard_attributes(): assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean +def test_callable_attributes(): + """Verify callable attributes work and get correct arg values""" + def img_test(attr, val): + return attr == 'src' and val.startswith('https') + + ATTR = { + 'img': img_test, + } + TAGS = ['img'] + + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + + +def test_svg_attr_val_allows_ref(): + """Unescape values in svg attrs that allow url references""" + # Local IRI, so keep it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + # Non-local IRI, so drop it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + # NOTE(willkg): Bug in html5lib serializer drops the xlink part + '' + ), +]) +def test_svg_allow_local_href(text, expected): + """Keep local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + '' + ), +]) +def test_svg_allow_local_href_nonlocal(text, expected): + """Drop non-local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + + + @pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.""" From 70d96e390c1525d8508436d8582daa49357fdf3e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 12:49:12 -0500 Subject: [PATCH 055/314] Alphabetize before escaping 
disallowed tokens Making this change means the output is stable since attributes will always happen in the same order. Seems like maybe it's not a great idea, but stable seems good. If it turns out this is terrible, someone will complain with a compelling use case and we can undo it. I also went through and removed a bunch of the "the output is either this or that" in the tests. --- bleach/sanitizer.py | 6 ++++++ tests/test_basics.py | 31 +++++++++++++------------------ tests/test_unicode.py | 7 ++----- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index c12b8d24..789d89e6 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -58,6 +58,12 @@ def sanitize_token(self, token): pass else: + if 'data' in token: + # Alphabetize the attributes before calling .disallowed_token() + # so that the resulting string is stable + token['data'] = OrderedDict( + [(key, val) for key, val in sorted(token['data'].items(), key=_attr_key)] + ) return self.disallowed_token(token) elif token_type == 'Comment': diff --git a/tests/test_basics.py b/tests/test_basics.py index bae1506e..c42ccc62 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -76,11 +76,8 @@ def test_named_arguments(): assert bleach.clean(text) == 'xx.com' assert ( - bleach.clean(text, attributes=ATTRS) in - [ - 'xx.com', - 'xx.com' - ] + bleach.clean(text, attributes=ATTRS) == + 'xx.com' ) @@ -199,25 +196,22 @@ def test_idempotent(): clean = bleach.clean(dirty) assert bleach.clean(clean) == clean - possible_outs = ( - 'invalid & < extra http://link.com', + linked = bleach.linkify(dirty) + assert ( + bleach.linkify(linked) == 'invalid & < extra http://link.com' ) - linked = bleach.linkify(dirty) - assert bleach.linkify(linked) in possible_outs def test_rel_already_there(): """Make sure rel attribute is updated not replaced""" linked = ('Click ' 'here.') - link_good = (('Click ' - 'here.'), - ('Click ' - 'here.')) - assert bleach.linkify(linked) 
in link_good - assert bleach.linkify(link_good[0]) in link_good + link_good = 'Click here.' + + assert bleach.linkify(linked) == link_good + assert bleach.linkify(link_good) == link_good def test_lowercase_html(): @@ -235,9 +229,10 @@ def test_wildcard_attributes(): TAG = ['img', 'em'] dirty = ('both can have ' '') - clean = ('both can have ', - 'both can have ') - assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean + assert ( + bleach.clean(dirty, tags=TAG, attributes=ATTR) == + 'both can have ' + ) def test_callable_attributes(): diff --git a/tests/test_unicode.py b/tests/test_unicode.py index b8b670e8..08ab3f4e 100644 --- a/tests/test_unicode.py +++ b/tests/test_unicode.py @@ -27,11 +27,8 @@ def test_mixed(): def test_mixed_linkify(): assert ( - linkify('Домашняя http://example.com ヘルプとチュートリアル') in - ( - 'Домашняя http://example.com ヘルプとチュートリアル', - 'Домашняя http://example.com ヘルプとチュートリアル' - ) + linkify('Домашняя http://example.com ヘルプとチュートリアル') == + 'Домашняя http://example.com ヘルプとチュートリアル' ) From 86dea93c17d96a237acd2543264ba27d83f15d79 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 13:09:20 -0500 Subject: [PATCH 056/314] Cosmetic fix for readability --- bleach/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index ae96a925..a645ac7e 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -130,16 +130,18 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, walker = html5lib.getTreeWalker('etree') filtered = BleachSanitizerFilter( source=walker(dom), + + # Bleach-sanitizer-specific things allowed_attributes_map=attributes, + strip_disallowed_elements=strip, + strip_html_comments=strip_comments, + # html5lib-sanitizer things allowed_elements=tags, allowed_css_properties=styles, allowed_protocols=protocols, - allowed_svg_properties=[], - strip_disallowed_elements=strip, - strip_html_comments=strip_comments ) s = HTMLSerializer( 
quote_attr_values='always', From 14ce11293e9f74009e4c9e55bec3039ac8106540 Mon Sep 17 00:00:00 2001 From: Greg Guthe Date: Wed, 22 Feb 2017 19:37:49 -0500 Subject: [PATCH 057/314] add test website and scripts --- website/.gitignore | 1 + website/README.txt | 16 +++++++++ website/data_to_json.py | 53 ++++++++++++++++++++++++++++ website/index.html | 74 +++++++++++++++++++++++++++++++++++++++ website/open_test_page.py | 34 ++++++++++++++++++ website/server.py | 42 ++++++++++++++++++++++ 6 files changed, 220 insertions(+) create mode 100644 website/.gitignore create mode 100644 website/README.txt create mode 100755 website/data_to_json.py create mode 100644 website/index.html create mode 100755 website/open_test_page.py create mode 100755 website/server.py diff --git a/website/.gitignore b/website/.gitignore new file mode 100644 index 00000000..765417a3 --- /dev/null +++ b/website/.gitignore @@ -0,0 +1 @@ +testcases.json diff --git a/website/README.txt b/website/README.txt new file mode 100644 index 00000000..2223e7ab --- /dev/null +++ b/website/README.txt @@ -0,0 +1,16 @@ +Scripts for a Bleach demo/test website + +Usage: + +from the project root: + +# generate testcases.json +python website/data_to_json.py tests/data > testcases.json + +# run the test server +cd website && python server.py & + +# open the page in browsers python can find +python open_test_page.py + +# inspect bleached html and iframe diff --git a/website/data_to_json.py b/website/data_to_json.py new file mode 100755 index 00000000..ffd346f5 --- /dev/null +++ b/website/data_to_json.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +""" +Util to write a directory of test cases with input filenames +.test and output filenames .test.out as JSON to +stdout. 
+ +example: + +python tests/data_to_json.py tests/data > testcases.json +""" + +import argparse +import fnmatch +import json +import os +import os.path + +import bleach + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('data_dir', + help='directory containing test cases with input files' + ' named .test and output .test.out') + + args = parser.parse_args() + + filenames = os.listdir(args.data_dir) + ins = [os.path.join(args.data_dir, f) for f in filenames if fnmatch.fnmatch(f, '*.test')] + outs = [os.path.join(args.data_dir, f) for f in filenames if fnmatch.fnmatch(f, '*.test.out')] + + testcases = [] + for infn, outfn in zip(ins, outs): + case_name = infn.rsplit('.test', 1)[0] + + with open(infn, 'r') as fin, open(outfn, 'r') as fout: + payload = fin.read()[:-1] + testcases.append({ + "title": case_name, + "input_filename": infn, + "output_filename": outfn, + "payload": payload, + "actual": bleach.clean(payload), + "expected": fout.read(), + }) + + print(json.dumps(testcases, indent=4, sort_keys=True)) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/website/index.html b/website/index.html new file mode 100644 index 00000000..6ff43871 --- /dev/null +++ b/website/index.html @@ -0,0 +1,74 @@ + + + + + Python Bleach 2.0.0 + + +

Python Bleach 2.0.0

+

+ pypi version + Build Status +

+

+ This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. + The textarea below contains sample-payload - you can also add your own. Watch it sanitize in the textarea and iframe below. +

+
+

+

clean on change

+
+

+ +

+ +

+ + + + + diff --git a/website/open_test_page.py b/website/open_test_page.py new file mode 100755 index 00000000..b812de92 --- /dev/null +++ b/website/open_test_page.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +import webbrowser + +TEST_BROWSERS = set([ + # 'mozilla', + 'firefox', + # 'netscape', + # 'galeon', + # 'epiphany', + # 'skipstone', + # 'kfmclient', + # 'konqueror', + # 'kfm', + # 'mosaic', + # 'opera', + # 'grail', + # 'links', + # 'elinks', + # 'lynx', + # 'w3m', + 'windows-default', + # 'macosx', + 'safari', + # 'google-chrome', + 'chrome', + # 'chromium', + # 'chromium-browser', +]) +REGISTERED_BROWSERS = set(webbrowser._browsers.keys()) + +if __name__ == '__main__': + for b in TEST_BROWSERS & REGISTERED_BROWSERS: + webbrowser.get(b).open_new_tab('http://localhost:8080') diff --git a/website/server.py b/website/server.py new file mode 100755 index 00000000..83fcf84a --- /dev/null +++ b/website/server.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +""" +Simple Test/Demo Server for running bleach.clean output +on various desktops. 
+ +Usage: + +python server.py +""" + +import SimpleHTTPServer +import SocketServer +import json + +import bleach + + +PORT = 8080 + +class BleachCleanHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): + + def do_POST(self): + content_len = int(self.headers.getheader('content-length', 0)) + body = self.rfile.read(content_len) + print("read %s bytes: %s" % (content_len, body)) + cleaned = bleach.clean(body) + print("cleaned %s" % cleaned) + + self.send_response(200) + self.send_header('Content-Length', len(cleaned)) + self.send_header('Content-Type', 'text/plain;charset=UTF-8') + self.end_headers() + + self.wfile.write(cleaned) + + +if __name__ == '__main__': + SocketServer.TCPServer.allow_reuse_address = True # Prevent 'cannot bind to address' errors on restart + httpd = SocketServer.TCPServer(('127.0.0.1', PORT), BleachCleanHandler) + print("listening on localhost port %d" % PORT) + httpd.serve_forever() From 41349916a756c74ef1e249e5a93ce7e789b89a59 Mon Sep 17 00:00:00 2001 From: Greg Guthe Date: Fri, 24 Feb 2017 14:25:39 -0500 Subject: [PATCH 058/314] isolate in-browser tests; add option to insert unsafe --- website/index.html | 117 +++++++++++++++++++++++++++++++++------------ 1 file changed, 87 insertions(+), 30 deletions(-) diff --git a/website/index.html b/website/index.html index 6ff43871..6cd1ef10 100644 --- a/website/index.html +++ b/website/index.html @@ -3,6 +3,17 @@ Python Bleach 2.0.0 +

Python Bleach 2.0.0

@@ -12,39 +23,91 @@

Python Bleach 2.0.0

This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. - The textarea below contains sample-payload - you can also add your own. Watch it sanitize in the textarea and iframe below. + Enter a sample payload in the textarea below and watch it sanitize in the textarea and iframe below.


-

-

clean on change

+

+

+

clean when dirty HTML changes


-

- -

- -

- + +
+

Demo

+ +

+ +

+ +

+ +
+ + From 13dadf8e990f7773758bdac662ff0c9e87f345e0 Mon Sep 17 00:00:00 2001 From: Greg Guthe Date: Fri, 24 Feb 2017 14:59:26 -0500 Subject: [PATCH 059/314] unsafe and safe insert testcase vectors into dom --- website/index.html | 88 ++++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/website/index.html b/website/index.html index 6cd1ef10..21ece427 100644 --- a/website/index.html +++ b/website/index.html @@ -13,43 +13,54 @@ .test-case > .ifr { height: 50px; } + .test-case, .demo { + padding-bottom: 15px; + border-bottom: 2px solid gray; + } -

Python Bleach 2.0.0

+

Python Bleach 2.0.0

pypi version Build Status

-

- This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. - Enter a sample payload in the textarea below and watch it sanitize in the textarea and iframe below. -

-
-

-

-

clean when dirty HTML changes

-
-
-

Demo

+
+

Demo

+

+ This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. + Enter a sample payload in the textarea below and watch it sanitize in the textarea and iframe below. +

+ +
+

+

+

clean when dirty HTML changes

- +

- -

- + +

+
@@ -60,23 +71,29 @@

Demo

ifr.contentDocument.close(); }; - var sanitize = function() { - var xhr = new XMLHttpRequest(); - xhr.open('POST', '/sanitize'); - xhr.setRequestHeader("Content-Type", "text/plain;charset=UTF-8"); - xhr.onload = function() { - var sanitized = xhr.responseText; - document.querySelector('label[for=ifr]').textContent = "Clean DOM"; - writeToIframe(document.getElementById('ifr'), sanitized); - clean.value = sanitized; + var sanitize = function(event) { + if (event && event.target && event.target.className.indexOf("clean-and-write") !== -1) { + var grandParent = event.target.parentElement.parentElement; + var xhr = new XMLHttpRequest(); + xhr.open('POST', '/sanitize'); + xhr.setRequestHeader("Content-Type", "text/plain;charset=UTF-8"); + xhr.onload = function() { + var sanitized = xhr.responseText; + grandParent.querySelector("label.dom-label[for=ifr]").textContent = "Clean DOM"; + writeToIframe(grandParent.querySelector('.ifr'), sanitized); + grandParent.querySelector("textarea.clean").value = sanitized; + } + xhr.send(grandParent.querySelector("textarea.dirty").value); } - xhr.send(dirty.value); }; - var unsafeWrite = function() { - clean.value = "N/A"; - document.querySelector('label[for=ifr]').textContent = "Dirty DOM"; - writeToIframe(document.getElementById('ifr'), dirty.value); + var unsafeWrite = function(event) { + if (event.target.className.indexOf("unsafe-write") !== -1) { + var grandParent = event.target.parentElement.parentElement; + grandParent.querySelector("textarea.clean").value = "N/A"; + grandParent.querySelector("label.dom-label[for=ifr]").textContent = "Dirty DOM"; + writeToIframe(grandParent.querySelector('.ifr'), grandParent.querySelector("textarea.dirty").value); + } }; var addTest = function (test, index) { @@ -108,16 +125,17 @@

Demo

xhr.send(null); }; + // TODO: debounce input events? document.getElementById('dirty') .addEventListener('input', function () { var autocleanEl = document.getElementById('autoclean'); if (autocleanEl.checked) { - sanitize(); + sanitize({target: document.getElementsByClassName('clean-and-write')[0]}); } }); - document.getElementById('unsafe-write').addEventListener('click', unsafeWrite, false); - document.getElementById('clean-and-write').addEventListener('click', sanitize, false); + document.addEventListener('click', unsafeWrite, false); + document.addEventListener('click', sanitize, false); document.addEventListener('DOMContentLoaded', function() { loadTests('/testcases.json', function (responseText) { From 19be3f56ecee18c2d43c8bb32bd5b070506449b8 Mon Sep 17 00:00:00 2001 From: Vadim Kotov Date: Sun, 26 Feb 2017 15:50:18 +0400 Subject: [PATCH 060/314] Fixed pypi badge URL Capital letters dont seem to be working with badge.fury.io --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 3bd87573..403ff9b6 100644 --- a/README.rst +++ b/README.rst @@ -5,8 +5,8 @@ Bleach .. image:: https://travis-ci.org/mozilla/bleach.png?branch=master :target: https://travis-ci.org/mozilla/bleach -.. image:: https://badge.fury.io/py/Bleach.svg - :target: http://badge.fury.io/py/Bleach +.. image:: https://badge.fury.io/py/bleach.svg + :target: http://badge.fury.io/py/bleach Bleach is a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. From a2c015bf23d239deed61341694c52839a5fa68bb Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 10:22:40 -0500 Subject: [PATCH 061/314] Refactor allow_token This redoes the innards of allow_token so that it's a single loop across all the attributes in a token rather than a bunch of little passes. This has less looping, so theoretically it's more optimal, but I didn't spend any time testing that theory. 
--- bleach/sanitizer.py | 81 +++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 789d89e6..f5244b49 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -84,51 +84,60 @@ def allow_token(self, token): for namespaced_name, val in token['data'].items(): namespace, name = namespaced_name + # See if we should dump the attribute if callable(allowed_attributes): - if allowed_attributes(name, val): - attrs[namespaced_name] = val + if not allowed_attributes(name, val): + # DROP! + continue - elif name in allowed_attributes: - attrs[namespaced_name] = val - - # Handle attributes that have uri values - for attr in self.attr_val_is_uri: - if attr not in attrs: + elif name not in allowed_attributes: + # DROP! continue - val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', - unescape(attrs[attr])).lower() - - # Remove replacement characters from unescaped characters. - val_unescaped = val_unescaped.replace("\ufffd", "") - - if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and - (val_unescaped.split(':')[0] not in self.allowed_protocols)): - # It has a protocol, but it's not allowed--so drop it - del attrs[attr] - - # Drop values in svg attrs with non-local IRIs - for attr in self.svg_attr_val_allows_ref: - if attr in attrs: + # Look at attributes that have uri values + if namespaced_name in self.attr_val_is_uri: + val_unescaped = re.sub( + "[`\000-\040\177-\240\s]+", + '', + unescape(val)).lower() + + # Remove replacement characters from unescaped characters. + val_unescaped = val_unescaped.replace("\ufffd", "") + + # Drop attributes with uri values that have protocols that + # aren't allowed + if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and + (val_unescaped.split(':')[0] not in self.allowed_protocols)): + # DROP! 
+ continue + + # Drop values in svg attrs with non-local IRIs + if namespaced_name in self.svg_attr_val_allows_ref: new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', - unescape(attrs[attr])) + unescape(val)) new_val = new_val.strip() if not new_val: - del attrs[attr] + # DROP! + continue + else: - attrs[attr] = new_val - - # Drop href and xlink:href attr for svg elements with non-local IRIs - if (None, token['name']) in self.svg_allow_local_href: - for href_attr in [(None, 'href'), (namespaces['xlink'], 'href')]: - if href_attr in attrs: - if re.search(r'^\s*[^#\s]', attrs[href_attr]): - del attrs[href_attr] - - # Sanitize css in style attribute - if (None, u'style') in attrs: - attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')]) + # Replace the val with the unescaped version because + # it's a iri + val = new_val + + # Drop href and xlink:href attr for svg elements with non-local IRIs + if (None, token['name']) in self.svg_allow_local_href: + if namespaced_name in [(None, 'href'), (namespaces['xlink'], 'href')]: + if re.search(r'^\s*[^#\s]', val): + continue + + # If it's a style attribute, sanitize it + if namespaced_name == (None, u'style'): + val = self.sanitize_css(val) + + # At this point, we want to keep the attribute, so add it in + attrs[namespaced_name] = val # Alphabetize attributes token['data'] = OrderedDict( From 62eb131f48745264b1a0b1ccf5495d91349957ec Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 10:27:51 -0500 Subject: [PATCH 062/314] Fix linting error --- bleach/sanitizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index f5244b49..50fc2c55 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -141,7 +141,7 @@ def allow_token(self, token): # Alphabetize attributes token['data'] = OrderedDict( - [(key, val) for key, val in sorted(attrs.items(), key=_attr_key)] + [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)] ) return token 
From 73a205a01fef5d962ff9a9daee6ed041625ae828 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 10:50:42 -0500 Subject: [PATCH 063/314] Clean up code comments to be clearer --- bleach/sanitizer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 50fc2c55..b7162eed 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -79,19 +79,22 @@ def allow_token(self, token): if not callable(allowed_attributes): allowed_attributes += self.wildcard_attributes - # Drop any attributes that aren't allowed + # Loop through all the attributes and drop the ones that are not + # allowed, are unsafe or break other rules. Additionally, fix + # attribute values that need fixing. + # + # At the end of this loop, we have the final set of attributes + # we're keeping. attrs = {} for namespaced_name, val in token['data'].items(): namespace, name = namespaced_name - # See if we should dump the attribute + # Drop attributes that are not explicitly allowed if callable(allowed_attributes): if not allowed_attributes(name, val): - # DROP! continue elif name not in allowed_attributes: - # DROP! continue # Look at attributes that have uri values @@ -108,7 +111,6 @@ def allow_token(self, token): # aren't allowed if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols)): - # DROP! continue # Drop values in svg attrs with non-local IRIs @@ -118,7 +120,6 @@ def allow_token(self, token): unescape(val)) new_val = new_val.strip() if not new_val: - # DROP! 
continue else: @@ -143,6 +144,7 @@ def allow_token(self, token): token['data'] = OrderedDict( [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)] ) + return token def sanitize_css(self, style): From 564b040d2e66a8af0ac97e5d4b28cc4aea4f24cb Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 10:53:55 -0500 Subject: [PATCH 064/314] Add Cleaner Bleach.clean() is often used on batches of content. This allows you to create a Cleaner class that encapsulates all the clean arguments into a single instance and ald reuses html5lib parser, walker and serializer. --- bleach/__init__.py | 154 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 120 insertions(+), 34 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index a645ac7e..296448ee 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -92,6 +92,100 @@ DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] +class Cleaner(object): + """Cleaner for cleaning HTML fragments of malicious content + + This cleaner is a security-focused function whose sole purpose is to remove + malicious content from a string such that it can be displayed as content in + a web page. + + This cleaner is not designed to use to transform content to be used in + non-web-page contexts. 
+ + To use:: + + from bleach import Cleaner + + cleaner = Cleaner() + + for text in all_the_yucky_things: + sanitized = cleaner.clean(text) + + """ + + def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, + styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, + strip_comments=True): + """ + :arg tags: whitelist of allowed tags; defaults to + ``bleach.ALLOWED_TAGS`` + + :arg attributes: whitelist of allowed attributes; defaults to + ``bleach.ALLOWED_ATTRIBUTES`` + + :arg styles: whitelist of allowed css; defaults to + ``bleach.ALLOWED_STYLES`` + + :arg protocols: whitelist of allowed protocols for links; defaults + to ``bleach.ALLOWED_PROTOCOLS`` + + :arg strip: whether or not to strip disallowed elements + + :arg strip_comments: whether or not to strip HTML comments + + """ + self.tags = tags + self.attributes = attributes + self.styles = styles + self.protocols = protocols + self.strip = strip + self.strip_comments = strip_comments + + self.parser = html5lib.HTMLParser(namespaceHTMLElements=False) + self.walker = html5lib.getTreeWalker('etree') + self.serializer = HTMLSerializer( + quote_attr_values='always', + omit_optional_tags=False, + + # Bleach has its own sanitizer, so don't use the html5lib one + sanitize=False, + + # Bleach sanitizer alphabetizes already, so don't use the html5lib one + alphabetical_attributes=False, + ) + + def clean(self, text): + """Cleans text and returns sanitized result as unicode + + :arg str text: text to be cleaned + + :returns: sanitized text as unicode + + """ + if not text: + return u'' + + text = force_unicode(text) + + dom = self.parser.parseFragment(text) + filtered = BleachSanitizerFilter( + source=self.walker(dom), + + # Bleach-sanitizer-specific things + allowed_attributes_map=self.attributes, + strip_disallowed_elements=self.strip, + strip_html_comments=self.strip_comments, + + # html5lib-sanitizer things + allowed_elements=self.tags, + allowed_css_properties=self.styles, + 
allowed_protocols=self.protocols, + allowed_svg_properties=[], + ) + + return self.serializer.render(filtered) + + def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, strip_comments=True): @@ -104,56 +198,48 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, This function is not designed to use to transform content to be used in non-web-page contexts. + Example:: + + import bleach + + better_text = bleach.clean(yucky_text) + + + .. Note:: + + If you're cleaning a lot of text and passing the same argument + values, consider caching a ``Cleaner`` instance. + :arg text: the text to clean + :arg tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS`` + :arg attributes: whitelist of allowed attributes; defaults to ``bleach.ALLOWED_ATTRIBUTES`` + :arg styles: whitelist of allowed css; defaults to ``bleach.ALLOWED_STYLES`` + :arg protocols: whitelist of allowed protocols for links; defaults to ``bleach.ALLOWED_PROTOCOLS`` + :arg strip: whether or not to strip disallowed elements + :arg strip_comments: whether or not to strip HTML comments :returns: cleaned text as unicode """ - if not text: - return u'' - - text = force_unicode(text) - - parser = html5lib.HTMLParser(namespaceHTMLElements=False) - dom = parser.parseFragment(text) - - walker = html5lib.getTreeWalker('etree') - filtered = BleachSanitizerFilter( - source=walker(dom), - - # Bleach-sanitizer-specific things - allowed_attributes_map=attributes, - strip_disallowed_elements=strip, - strip_html_comments=strip_comments, - - # html5lib-sanitizer things - allowed_elements=tags, - allowed_css_properties=styles, - allowed_protocols=protocols, - allowed_svg_properties=[], - - ) - s = HTMLSerializer( - quote_attr_values='always', - omit_optional_tags=False, - - # Bleach has its own sanitizer, so don't use the html5lib one - sanitize=False, - - # Bleach sanitizer alphabetizes already, so don't use the html5lib one 
- alphabetical_attributes=False, + cleaner = Cleaner( + tags=tags, + attributes=attributes, + styles=styles, + protocols=protocols, + strip=strip, + strip_comments=strip_comments, ) - return s.render(filtered) + return cleaner.clean(text) def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, From a97f3eade7157b22b9d76ccb105f100347b0fe2f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 10:58:28 -0500 Subject: [PATCH 065/314] Add docs for Cleaner --- docs/clean.rst | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/clean.rst b/docs/clean.rst index a988a81a..63f0427e 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -5,10 +5,7 @@ ``bleach.clean()`` ================== -``clean()`` is Bleach's HTML sanitization method. - -.. autofunction:: bleach.clean - +:py:func:`bleach.clean`` is Bleach's HTML sanitization method. Given a fragment of HTML, Bleach will parse it according to the HTML5 parsing algorithm and sanitize any disallowed tags or attributes. This algorithm also @@ -19,6 +16,14 @@ takes care of things like unclosed and (some) misnested tags. always return ``unicode``. +If you're cleaning a lot of text, you might want to create a +:py:class:`bleach.Cleaner` instance. + +.. autofunction:: bleach.clean + +.. autoclass:: bleach.Cleaner + + Tag Whitelist ============= From a0b88285cf6bcf8a7050b96ba0632eecf998d6f1 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 11:11:56 -0500 Subject: [PATCH 066/314] Cosmetic: Reorganize tests This is entirely cosmetic reorganization of tests so it's easier to see test coverage for clean vs. linkify. 
--- tests/test_basics.py | 600 +++++++++++++++++++++---------------------- 1 file changed, 288 insertions(+), 312 deletions(-) diff --git a/tests/test_basics.py b/tests/test_basics.py index c42ccc62..790ad559 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -5,141 +5,289 @@ import bleach -def test_empty(): - assert bleach.clean('') == '' - - -def test_nbsp(): - if six.PY3: - expected = '\xa0test string\xa0' - else: - expected = six.u('\\xa0test string\\xa0') - - assert bleach.clean(' test string ') == expected - - -def test_comments_only(): - comment = '' - open_comment = ''.format(open_comment) - ) - - -def test_with_comments(): - html = 'Just text' - assert 'Just text', bleach.clean(html) == 'Just text' - assert bleach.clean(html, strip_comments=False) == html - - -def test_no_html(): - assert bleach.clean('no html string') == 'no html string' - - -def test_allowed_html(): - assert ( - bleach.clean('an allowed tag') == - 'an allowed tag' - ) - assert ( - bleach.clean('another good tag') == - 'another good tag' - ) - - -def test_bad_html(): - assert ( - bleach.clean('a fixed tag') == - 'a fixed tag' - ) - - -def test_function_arguments(): - TAGS = ['span', 'br'] - ATTRS = {'span': ['style']} - - assert ( - bleach.clean('a
test', - tags=TAGS, attributes=ATTRS) == - 'a
test' - ) - - -def test_named_arguments(): - ATTRS = {'a': ['rel', 'href']} - - text = 'xx.com' - - assert bleach.clean(text) == 'xx.com' - assert ( - bleach.clean(text, attributes=ATTRS) == - 'xx.com' - ) - - -def test_disallowed_html(): - assert ( - bleach.clean('a test') == - 'a <script>safe()</script> test' - ) - assert ( - bleach.clean('a test') == - 'a <style>body{}</style> test' - ) - - -def test_bad_href(): - assert ( - bleach.clean('no link') == - 'no link' - ) - - -def test_bare_entities(): - assert ( - bleach.clean('an & entity') == - 'an & entity' - ) - assert ( - bleach.clean('an < entity') == - 'an < entity' - ) - - assert ( - bleach.clean('tag < and entity') == - 'tag < and entity' - ) - - assert ( - bleach.clean('&') == - '&' - ) - - -def test_escaped_entities(): - s = '<em>strong</em>' - assert bleach.clean(s) == s - - -def test_serializer(): - s = '
' - assert bleach.clean(s, tags=['table']) == s - assert bleach.linkify('test
') == 'test
' - assert bleach.clean('

test

', tags=['p']) == '

test

' - - -def test_no_href_links(): - s = 'x' - assert bleach.linkify(s) == s - - -def test_weird_strings(): - s = '' + open_comment = ''.format(open_comment) + ) + + def test_with_comments(self): + html = 'Just text' + assert 'Just text', bleach.clean(html) == 'Just text' + assert bleach.clean(html, strip_comments=False) == html + + def test_no_html(self): + assert bleach.clean('no html string') == 'no html string' + + def test_allowed_html(self): + assert ( + bleach.clean('an allowed tag') == + 'an allowed tag' + ) + assert ( + bleach.clean('another good tag') == + 'another good tag' + ) + + def test_bad_html(self): + assert ( + bleach.clean('a fixed tag') == + 'a fixed tag' + ) + + def test_function_arguments(self): + TAGS = ['span', 'br'] + ATTRS = {'span': ['style']} + + assert ( + bleach.clean('a
test', + tags=TAGS, attributes=ATTRS) == + 'a
test' + ) + + def test_named_arguments(self): + ATTRS = {'a': ['rel', 'href']} + + text = 'xx.com' + + assert bleach.clean(text) == 'xx.com' + assert ( + bleach.clean(text, attributes=ATTRS) == + 'xx.com' + ) + + def test_disallowed_html(self): + assert ( + bleach.clean('a test') == + 'a <script>safe()</script> test' + ) + assert ( + bleach.clean('a test') == + 'a <style>body{}</style> test' + ) + + def test_bad_href(self): + assert ( + bleach.clean('no link') == + 'no link' + ) + + def test_bare_entities(self): + assert ( + bleach.clean('an & entity') == + 'an & entity' + ) + assert ( + bleach.clean('an < entity') == + 'an < entity' + ) + + assert ( + bleach.clean('tag < and entity') == + 'tag < and entity' + ) + + assert ( + bleach.clean('&') == + '&' + ) + + def test_escaped_entities(self): + s = '<em>strong</em>' + assert bleach.clean(s) == s + + def test_weird_strings(self): + s = 'with
html tags', strip=True) == + 'a test with html tags' + ) + assert ( + bleach.clean('a test with html tags', + strip=True) == + 'a test with html tags' + ) + + s = '

link text

' + assert ( + bleach.clean(s, tags=['p'], strip=True) == + '

link text

' + ) + s = '

multiply nested text

' + assert ( + bleach.clean(s, tags=['p'], strip=True) == + '

multiply nested text

' + ) + + s = ('

' + '

') + assert ( + bleach.clean(s, tags=['p', 'a'], strip=True) == + '

' + ) + + def test_allowed_styles(self): + ATTR = ['style'] + STYLE = ['color'] + blank = '' + s = '' + assert bleach.clean('', attributes=ATTR) == blank + assert bleach.clean(s, attributes=ATTR, styles=STYLE) == s + assert ( + bleach.clean('', attributes=ATTR, styles=STYLE) == + s + ) + + def test_lowercase_html(self): + """We should output lowercase HTML.""" + dirty = 'BAR' + clean = 'BAR' + assert bleach.clean(dirty, attributes=['class']) == clean + + def test_wildcard_attributes(self): + ATTR = { + '*': ['id'], + 'img': ['src'], + } + TAG = ['img', 'em'] + dirty = ('both can have ' + '') + assert ( + bleach.clean(dirty, tags=TAG, attributes=ATTR) == + 'both can have ' + ) + + def test_callable_attributes(self): + """Verify callable attributes work and get correct arg values""" + def img_test(attr, val): + return attr == 'src' and val.startswith('https') + + ATTR = { + 'img': img_test, + } + TAGS = ['img'] + + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + + def test_svg_attr_val_allows_ref(self): + """Unescape values in svg attrs that allow url references""" + # Local IRI, so keep it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + # Non-local IRI, so drop it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + @pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + # NOTE(willkg): Bug in html5lib serializer drops the xlink part + '' + ), + ]) + def test_svg_allow_local_href(self, text, expected): + """Keep local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + @pytest.mark.parametrize('text, 
expected', [ + ( + '', + '' + ), + ( + '', + '' + ), + ]) + def test_svg_allow_local_href_nonlocal(self, text, expected): + """Drop non-local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + @pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') + def test_sarcasm(self): + """Jokes should crash.""" + dirty = 'Yeah right ' + clean = 'Yeah right <sarcasm/>' + assert bleach.clean(dirty) == clean + + def test_user_defined_protocols_valid(self): + valid_href = 'allowed href' + assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href + + def test_user_defined_protocols_invalid(self): + invalid_href = 'invalid href' + cleaned_href = 'invalid href' + assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href + + +class TestLinkify: + def test_no_href_links(self): + s = 'x' + assert bleach.linkify(s) == s + + def test_rel_already_there(self): + """Make sure rel attribute is updated not replaced""" + linked = ('Click ' + 'here.') + + link_good = 'Click here.' + + assert bleach.linkify(linked) == link_good + assert bleach.linkify(link_good) == link_good def test_xml_render(): @@ -147,48 +295,6 @@ def test_xml_render(): assert bleach._render(parser.parseFragment('')) == '' -def test_stripping(): - assert ( - bleach.clean('a test with html tags', strip=True) == - 'a test with html tags' - ) - assert ( - bleach.clean('a test with html tags', strip=True) == - 'a test with html tags' - ) - - s = '

link text

' - assert ( - bleach.clean(s, tags=['p'], strip=True) == - '

link text

' - ) - s = '

multiply nested text

' - assert ( - bleach.clean(s, tags=['p'], strip=True) == - '

multiply nested text

' - ) - - s = ('

' - '

') - assert ( - bleach.clean(s, tags=['p', 'a'], strip=True) == - '

' - ) - - -def test_allowed_styles(): - ATTR = ['style'] - STYLE = ['color'] - blank = '' - s = '' - assert bleach.clean('', attributes=ATTR) == blank - assert bleach.clean(s, attributes=ATTR, styles=STYLE) == s - assert ( - bleach.clean('', attributes=ATTR, styles=STYLE) == - s - ) - - def test_idempotent(): """Make sure that applying the filter twice doesn't change anything.""" dirty = 'invalid & < extra http://link.com' @@ -203,138 +309,8 @@ def test_idempotent(): ) -def test_rel_already_there(): - """Make sure rel attribute is updated not replaced""" - linked = ('Click ' - 'here.') - - link_good = 'Click here.' - - assert bleach.linkify(linked) == link_good - assert bleach.linkify(link_good) == link_good - - -def test_lowercase_html(): - """We should output lowercase HTML.""" - dirty = 'BAR' - clean = 'BAR' - assert bleach.clean(dirty, attributes=['class']) == clean - - -def test_wildcard_attributes(): - ATTR = { - '*': ['id'], - 'img': ['src'], - } - TAG = ['img', 'em'] - dirty = ('both can have ' - '') - assert ( - bleach.clean(dirty, tags=TAG, attributes=ATTR) == - 'both can have ' - ) - - -def test_callable_attributes(): - """Verify callable attributes work and get correct arg values""" - def img_test(attr, val): - return attr == 'src' and val.startswith('https') - - ATTR = { - 'img': img_test, - } - TAGS = ['img'] - - assert ( - bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == - u'foo baz' - ) - assert ( - bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == - u'foo baz' - ) - - -def test_svg_attr_val_allows_ref(): - """Unescape values in svg attrs that allow url references""" - # Local IRI, so keep it - text = '' - TAGS = ['svg', 'rect'] - ATTRS = { - 'rect': ['fill'], - } - assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == - '' - ) - - # Non-local IRI, so drop it - text = '' - TAGS = ['svg', 'rect'] - ATTRS = { - 'rect': ['fill'], - } - assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == - '' - ) - - 
-@pytest.mark.parametrize('text, expected', [ - ( - '', - '' - ), - ( - '', - # NOTE(willkg): Bug in html5lib serializer drops the xlink part - '' - ), -]) -def test_svg_allow_local_href(text, expected): - """Keep local hrefs for svg elements""" - TAGS = ['svg', 'pattern'] - ATTRS = { - 'pattern': ['id', 'href'], - } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected - - -@pytest.mark.parametrize('text, expected', [ - ( - '', - '' - ), - ( - '', - '' - ), -]) -def test_svg_allow_local_href_nonlocal(text, expected): - """Drop non-local hrefs for svg elements""" - TAGS = ['svg', 'pattern'] - ATTRS = { - 'pattern': ['id', 'href'], - } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected - - - - -@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') -def test_sarcasm(): - """Jokes should crash.""" - dirty = 'Yeah right ' - clean = 'Yeah right <sarcasm/>' - assert bleach.clean(dirty) == clean - - -def test_user_defined_protocols_valid(): - valid_href = 'allowed href' - assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href - - -def test_user_defined_protocols_invalid(): - invalid_href = 'invalid href' - cleaned_href = 'invalid href' - assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href +def test_serializer(): + s = '
' + assert bleach.clean(s, tags=['table']) == s + assert bleach.linkify('test
') == 'test
' + assert bleach.clean('

test

', tags=['p']) == '

test

' From b46c7ae058c1ad5f2d351f47e6e57a4dfa1591c3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 11:19:31 -0500 Subject: [PATCH 067/314] Remove unneeded module --- tests/tools.py | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 tests/tools.py diff --git a/tests/tools.py b/tests/tools.py deleted file mode 100644 index 3ae047e9..00000000 --- a/tests/tools.py +++ /dev/null @@ -1,7 +0,0 @@ - - -def in_(l, a, msg=None): - """Shorthand for 'assert a in l, "%r not in %r" % (a, l) - """ - if a not in l: - raise AssertionError(msg or "%r not in %r" % (a, l)) From bd239771493c1ff6493fcff324e6512bbbb2028d Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 11:19:38 -0500 Subject: [PATCH 068/314] Add test for Cleaner and fix module import issue --- bleach/__init__.py | 2 +- tests/test_basics.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 296448ee..12788eb1 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -14,7 +14,7 @@ from bleach.sanitizer import BleachSanitizerFilter from bleach.version import __version__, VERSION # flake8: noqa -__all__ = ['clean', 'linkify'] +__all__ = ['Cleaner', 'clean', 'linkify'] log = logging.getLogger(__name__) log.addHandler(logging.NullHandler()) diff --git a/tests/test_basics.py b/tests/test_basics.py index 790ad559..919c8678 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -274,6 +274,19 @@ def test_user_defined_protocols_invalid(self): assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href +class TestCleaner: + def test_basics(self): + TAGS = ['span', 'br'] + ATTRS = {'span': ['style']} + + cleaner = bleach.Cleaner(tags=TAGS, attributes=ATTRS) + + assert ( + cleaner.clean('a
test') == + 'a
test' + ) + + class TestLinkify: def test_no_href_links(self): s = 'x' From c0602651f187af1d1b927e1cbf219af0a46ad7c3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 12:02:12 -0500 Subject: [PATCH 069/314] Implement ability to use Filters in cleaning This lets users extend cleaning more easily by specifying an html5lib Filter. --- bleach/__init__.py | 33 ++++++++++++-- bleach/sanitizer.py | 1 - docs/clean.rst | 102 +++++++++++++++++++++++++++++++++++-------- tests/test_basics.py | 34 ++++++++++++--- 4 files changed, 142 insertions(+), 28 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 12788eb1..04d69b0a 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -115,9 +115,8 @@ class Cleaner(object): def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, - strip_comments=True): - """ - :arg tags: whitelist of allowed tags; defaults to + strip_comments=True, filters=None): + """:arg tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS`` :arg attributes: whitelist of allowed attributes; defaults to @@ -133,6 +132,16 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, :arg strip_comments: whether or not to strip HTML comments + :arg filters: list of html5lib Filter classes to pass streamed content through + + See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters + + .. Warning:: + + Using filters changes the output of + :py:method:`bleach.Cleaner.clean`. Make sure the way the filters + change the output are secure. 
+ """ self.tags = tags self.attributes = attributes @@ -140,6 +149,7 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, self.protocols = protocols self.strip = strip self.strip_comments = strip_comments + self.filters = filters or [] self.parser = html5lib.HTMLParser(namespaceHTMLElements=False) self.walker = html5lib.getTreeWalker('etree') @@ -183,12 +193,16 @@ def clean(self, text): allowed_svg_properties=[], ) + # Apply any filters after the BleachSanitizerFilter + for filter_class in self.filters: + filtered = filter_class(source=filtered) + return self.serializer.render(filtered) def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, - strip_comments=True): + strip_comments=True, filters=None): """Clean an HTML fragment of malicious content and return it This function is a security-focused function whose sole purpose is to @@ -228,6 +242,16 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, :arg strip_comments: whether or not to strip HTML comments + :arg filters: list of html5lib Filter classes to pass streamed content through + + See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters + + .. Warning:: + + Using filters changes the output of + `bleach.Cleaner.clean`. Make sure the way the filters + change the output are secure. 
+ :returns: cleaned text as unicode """ @@ -238,6 +262,7 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, protocols=protocols, strip=strip, strip_comments=strip_comments, + filters=filters, ) return cleaner.clean(text) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index b7162eed..62bbf648 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -16,7 +16,6 @@ def _attr_key(attr): """ key = (attr[0][0] or ''), attr[0][1] - print(key) return key diff --git a/docs/clean.rst b/docs/clean.rst index 63f0427e..8e310f59 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -5,7 +5,7 @@ ``bleach.clean()`` ================== -:py:func:`bleach.clean`` is Bleach's HTML sanitization method. +:py:func:`bleach.clean` is Bleach's HTML sanitization method. Given a fragment of HTML, Bleach will parse it according to the HTML5 parsing algorithm and sanitize any disallowed tags or attributes. This algorithm also @@ -48,13 +48,41 @@ The default value is a relatively conservative list found in ``bleach.ALLOWED_TAGS``. -Attribute Whitelist -=================== +Allowed Attributes +================== + +The ``attributes`` kwarg lets you specify which attributes are allowed. + +The default value is also a conservative dict found in +``bleach.ALLOWED_ATTRIBUTES``. + + +As a list +--------- + +The ``attributes`` value can be a list, in which case the attributes are allowed +for any tag. + +For example: + +.. doctest:: + + >>> import bleach + + >>> bleach.clean( + ... u'

blah blah blah

', + ... tags=['p'], + ... attributes=['style'], + ... styles=['color'], + ... ) + u'

blah blah blah

' + + +As a dict +--------- -The ``attributes`` kwarg is a whitelist of attributes. It can be a list, in -which case the attributes are allowed for any tag, or a dictionary, in which -case the keys are tag names (or a wildcard: ``*`` for all tags) and the values -are lists of allowed attributes. +The ``attributes`` value can be a dict, in which case the keys are tag names (or +a wildcard: ``*`` for all tags) and the values are lists of allowed attributes. For example: @@ -80,23 +108,19 @@ In this case, ``class`` is allowed on any allowed element (from the ``tags`` argument), ```` tags are allowed to have ``href`` and ``rel`` attributes, and so on. -The default value is also a conservative dict found in -``bleach.ALLOWED_ATTRIBUTES``. - -Callable Filters ----------------- +Using functions +--------------- -You can also use a callable (instead of a list) in the ``attributes`` kwarg. If -the callable returns ``True``, the attribute is allowed. Otherwise, it is -stripped. For example: +You can also use callables. If the callable returns ``True``, the attribute is +allowed. Otherwise, it is stripped. For example: .. doctest:: >>> from urlparse import urlparse >>> import bleach - >>> def filter_src(name, value): + >>> def allow_src(name, value): ... if name in ('alt', 'height', 'width'): ... return True ... if name == 'src': @@ -108,7 +132,7 @@ stripped. For example: ... u'an example', ... tags=['img'], ... attributes={ - ... 'img': filter_src + ... 'img': allow_src ... } ... ) u'an example' @@ -229,3 +253,47 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set >>> bleach.clean(html, strip_comments=False) u'my html' + + +html5lib Filters +================ + +Bleach sanitizing is implemented as an html5lib Filter. The consequence of this +is that we can pass the streamed content through additional specified filters +after the :py:class:`bleach.sanitizer.BleachSanitizingFilter` filter has run. 
+ +This lets you add data, drop data and change data as it is being serialized back +to a unicode. + +Documentation on html5lib Filters is here: +http://html5lib.readthedocs.io/en/latest/movingparts.html#filters + +Trivial Filter example: + +.. doctest:: + + >>> import bleach + >>> from html5lib.filters.base import Filter + + >>> class MooFilter(Filter): + ... def __iter__(self): + ... for token in Filter.__iter__(self): + ... if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: + ... for attr, value in token['data'].items(): + ... token['data'][attr] = 'moo' + ... yield token + ... + >>> ATTRS = { + ... 'img': ['rel', 'src'] + ... } + ... + >>> TAGS = ['img'] + >>> dirty = 'this is cute! ' + >>> bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter]) + u'this is cute! ' + + +.. Warning:: + + Filters change the output of cleaning. Make sure that whatever changes the + filter is applying maintain the safety guarantees of the output. diff --git a/tests/test_basics.py b/tests/test_basics.py index 919c8678..620a42da 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -1,4 +1,5 @@ import html5lib +from html5lib.filters.base import Filter import pytest import six @@ -147,14 +148,14 @@ def test_stripping(self): ) def test_allowed_styles(self): - ATTR = ['style'] + ATTRS = ['style'] STYLE = ['color'] blank = '' s = '' - assert bleach.clean('', attributes=ATTR) == blank - assert bleach.clean(s, attributes=ATTR, styles=STYLE) == s + assert bleach.clean('', attributes=ATTRS) == blank + assert bleach.clean(s, attributes=ATTRS, styles=STYLE) == s assert ( - bleach.clean('', attributes=ATTR, styles=STYLE) == + bleach.clean('', attributes=ATTRS, styles=STYLE) == s ) @@ -165,7 +166,7 @@ def test_lowercase_html(self): assert bleach.clean(dirty, attributes=['class']) == clean def test_wildcard_attributes(self): - ATTR = { + ATTRS = { '*': ['id'], 'img': ['src'], } @@ -173,7 +174,7 @@ def test_wildcard_attributes(self): dirty = ('both can have ' 
'') assert ( - bleach.clean(dirty, tags=TAG, attributes=ATTR) == + bleach.clean(dirty, tags=TAG, attributes=ATTRS) == 'both can have ' ) @@ -273,6 +274,27 @@ def test_user_defined_protocols_invalid(self): cleaned_href = 'invalid href' assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href + def test_filters(self): + # Create a Filter that changes all the attr values to "moo" + class MooFilter(Filter): + def __iter__(self): + for token in Filter.__iter__(self): + if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: + for attr, value in token['data'].items(): + token['data'][attr] = 'moo' + + yield token + + ATTRS = { + 'img': ['rel', 'src'] + } + TAGS = ['img'] + dirty = 'this is cute! ' + assert ( + bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter]) == + 'this is cute! ' + ) + class TestCleaner: def test_basics(self): From 40ebdcda9833cd0d93418a943d4616c62c8cf043 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 12:20:51 -0500 Subject: [PATCH 070/314] Minor fixes that should have been in last PR --- bleach/__init__.py | 11 ++++++----- docs/clean.rst | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 04d69b0a..b71eeb89 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -116,7 +116,9 @@ class Cleaner(object): def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, strip_comments=True, filters=None): - """:arg tags: whitelist of allowed tags; defaults to + """Initializes a Cleaner + + :arg tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS`` :arg attributes: whitelist of allowed attributes; defaults to @@ -138,9 +140,8 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, .. Warning:: - Using filters changes the output of - :py:method:`bleach.Cleaner.clean`. 
Make sure the way the filters - change the output are secure. + Using filters changes the output of ``bleach.Cleaner.clean``. + Make sure the way the filters change the output are secure. """ self.tags = tags @@ -249,7 +250,7 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, .. Warning:: Using filters changes the output of - `bleach.Cleaner.clean`. Make sure the way the filters + ``bleach.Cleaner.clean``. Make sure the way the filters change the output are secure. :returns: cleaned text as unicode diff --git a/docs/clean.rst b/docs/clean.rst index 8e310f59..a65d8b93 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -22,6 +22,7 @@ If you're cleaning a lot of text, you might want to create a .. autofunction:: bleach.clean .. autoclass:: bleach.Cleaner + :members: Tag Whitelist From 313478f17e7107fc659243be4309e6e0c9b86e30 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 15:18:27 -0500 Subject: [PATCH 071/314] Reimplement linkify as an html5lib Filter This reimplements linkify as an html5lib Filter. This has a few advantages: 1. it just has to consume a token stream which involves fewer dance steps to get things done 2. it can be used in conjunction with other Filters 3. 
it can now be used as a post-clean Filter for Cleaner letting you clean and linkify in one tree traversal --- bleach/__init__.py | 339 +++------------------------------------- bleach/callbacks.py | 19 ++- bleach/linkifier.py | 356 +++++++++++++++++++++++++++++++++++++++++++ bleach/sanitizer.py | 22 +-- bleach/utils.py | 23 +++ setup.cfg | 6 +- tests/test_basics.py | 17 +-- tests/test_links.py | 56 +++++-- 8 files changed, 472 insertions(+), 366 deletions(-) create mode 100644 bleach/linkifier.py create mode 100644 bleach/utils.py diff --git a/bleach/__init__.py b/bleach/__init__.py index b71eeb89..0155a127 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -11,6 +11,7 @@ from bleach import callbacks as linkify_callbacks from bleach.encoding import force_unicode +from bleach.linkifier import LinkifyFilter from bleach.sanitizer import BleachSanitizerFilter from bleach.version import __version__, VERSION # flake8: noqa @@ -44,47 +45,6 @@ ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] -TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az - ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat - cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk - dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg - gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il - im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp - kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk - ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne - net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post - pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl - sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to - tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws - xn xxx ye yt yu za zm zw""".split() - -# Make sure that .com doesn't get matched by .co first -TLDS.reverse() - -url_re = 
re.compile( - r"""\(* # Match any opening parentheses. - \b(?"]*)? - # /path/zz (excluding "unsafe" chars from RFC 1738, - # except for # and ~, which happen in practice) - """.format('|'.join(allowed_protocols), '|'.join(TLDS)), - re.IGNORECASE | re.VERBOSE | re.UNICODE) - -proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) - -punct_re = re.compile(r'([\.,]+)$') - -email_re = re.compile( - r"""(? tag replaced by the text within it - adj = replace_nodes(tree, _text, node, current_child) - # pull back current_child by 1 to scan the new nodes - # again. - current_child -= 1 - else: - text = force_unicode(attrs.pop('_text')) - for attr_key, attr_val in attrs.items(): - node.set(attr_key, attr_val) - - for n in reversed(list(node)): - node.remove(n) - text = parser.parseFragment(text) - node.text = text.text - for n in text: - node.append(n) - _seen.add(node) - - elif current_child >= 0: - if node.tag == ETREE_TAG('pre') and skip_pre: - linkify_nodes(node, False) - elif not (node in _seen): - linkify_nodes(node, parse_text) - - current_child += 1 - - def email_repl(match): - addr = match.group(0).replace('"', '"') - link = { - '_text': addr, - 'href': 'mailto:{0!s}'.format(addr), - } - link = apply_callbacks(link, True) - - if link is None: - return addr - - _href = link.pop('href') - _text = link.pop('_text') - - repl = '{2!s}' - attr = '{0!s}="{1!s}"' - attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - return repl.format(_href, attribs, _text) - - def link_repl(match): - url = match.group(0) - open_brackets = close_brackets = 0 - if url.startswith('('): - _wrapping = strip_wrapping_parentheses(url) - url, open_brackets, close_brackets = _wrapping - if url.endswith(')') and '(' not in url: - # This is a clumsy handling for the case where we have something - # like (foo http://example.com) and the ) gets picked up by the - # url_re but we don't want it part of the link. 
- new_url = url.rstrip(')') - close_brackets += len(url) - len(new_url) - url = new_url - - end = '' - m = re.search(punct_re, url) - if m: - end = m.group(0) - url = url[0:m.start()] - if re.search(proto_re, url): - href = url - else: - href = ''.join(['http://', url]) - - link = { - '_text': url, - 'href': href, - } - - link = apply_callbacks(link, True) - - if link is None: - return '(' * open_brackets + url + ')' * close_brackets - - _text = link.pop('_text') - _href = link.pop('href') - - repl = '{0!s}{3!s}{4!s}{5!s}' - attr = '{0!s}="{1!s}"' - attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - - return repl.format('(' * open_brackets, - _href, attribs, _text, end, - ')' * close_brackets) - - try: - linkify_nodes(forest) - except RuntimeError as e: - # If we hit the max recursion depth, just return what we've got. - log.exception('Probable recursion error: {0!r}'.format(e)) - - return _render(forest) - - -def _render(tree): - """Try rendering as HTML, then XML, then give up.""" - return force_unicode(_serialize(tree)) - - -def _serialize(domtree): - walker = html5lib.treewalkers.getTreeWalker('etree') - stream = walker(domtree) - serializer = HTMLSerializer(quote_attr_values='always', - alphabetical_attributes=True, - omit_optional_tags=False) - return serializer.render(stream) + dom = parser.parseFragment(text) + filtered = LinkifyFilter( + source=walker(dom), + callbacks=callbacks, + skip_pre=skip_pre, + parse_email=parse_email + ) + return serializer.render(filtered) diff --git a/bleach/callbacks.py b/bleach/callbacks.py index 3cb82c25..d2ba1014 100644 --- a/bleach/callbacks.py +++ b/bleach/callbacks.py @@ -3,18 +3,23 @@ def nofollow(attrs, new=False): - if attrs['href'].startswith('mailto:'): + href_key = (None, u'href') + if href_key not in attrs or attrs[href_key].startswith(u'mailto:'): return attrs - rel = [x for x in attrs.get('rel', '').split(' ') if x] - if 'nofollow' not in [x.lower() for x in rel]: - rel.append('nofollow') - 
attrs['rel'] = ' '.join(rel) + + rel_key = (None, u'rel') + rel_values = [val for val in attrs.get(rel_key, u'').split(u' ') if val] + if u'nofollow' not in [rel_val.lower() for rel_val in rel_values]: + rel_values.append(u'nofollow') + attrs[rel_key] = u' '.join(rel_values) return attrs def target_blank(attrs, new=False): - if attrs['href'].startswith('mailto:'): + href_key = (None, u'href') + if attrs[href_key].startswith(u'mailto:'): return attrs - attrs['target'] = '_blank' + + attrs[(None, u'target')] = u'_blank' return attrs diff --git a/bleach/linkifier.py b/bleach/linkifier.py new file mode 100644 index 00000000..b4ba2ea8 --- /dev/null +++ b/bleach/linkifier.py @@ -0,0 +1,356 @@ +from __future__ import unicode_literals +import re + +from html5lib.filters.base import Filter + +from bleach import allowed_protocols +from bleach.encoding import force_unicode +from bleach.utils import alphabetize_attributes + + +# FIXME(willkg): Move this to a constants module. +TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az + ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat + cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk + dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg + gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il + im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp + kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk + ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne + net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post + pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl + sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to + tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws + xn xxx ye yt yu za zm zw""".split() + +# Make sure that .com doesn't get matched by .co first +TLDS.reverse() + + +url_re = re.compile( + r"""\(* # 
Match any opening parentheses. + \b(?"]*)? + # /path/zz (excluding "unsafe" chars from RFC 1738, + # except for # and ~, which happen in practice) + """.format('|'.join(allowed_protocols), '|'.join(TLDS)), + re.IGNORECASE | re.VERBOSE | re.UNICODE) + + +proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) + +punct_re = re.compile(r'([\.,]+)$') + +email_re = re.compile( + r"""(? end: + new_tokens.append( + {u'type': u'Characters', u'data': text[end:match.start()]} + ) + + # Run attributes through the callbacks to see what we + # should do with this match + attrs = { + (None, u'href'): u'mailto:%s' % match.group(0), + u'_text': match.group(0) + } + attrs = self.apply_callbacks(attrs, True) + + if attrs is None: + # Just add the text + new_tokens.append( + {u'type': u'Characters', u'data': match.group(0)} + ) + + else: + # Add a "a" tag + _text = attrs.pop(u'_text', '') + attrs = alphabetize_attributes(attrs) + new_tokens.extend([ + {u'type': u'StartTag', u'name': u'a', u'data': attrs}, + {u'type': u'Characters', u'data': force_unicode(_text)}, + {u'type': u'EndTag', u'name': 'a'} + ]) + end = match.end() + + if new_tokens: + if end < len(text): + new_tokens.append({u'type': u'Characters', u'data': text[end:]}) + + for new_token in new_tokens: + yield new_token + + continue + + yield token + + def strip_wrapping_parentheses(self, fragment): + """Strips wrapping parentheses""" + openp = closep = 0 + + # Count consecutive opening parentheses at the beginning of the + # fragment (string) + for char in fragment: + if char == '(': + openp += 1 + else: + break + + if openp: + newer_frag = '' + # Cut the consecutive opening brackets from the fragment + fragment = fragment[openp:] + + # Reverse the fragment for easier detection of parentheses + # inside the URL + reverse_fragment = fragment[::-1] + skip = False + for char in reverse_fragment: + if char == ')' and closep < openp and not skip: + # Remove the closing parentheses if it has a matching + # opening parentheses 
(they are balanced). + closep += 1 + continue + + elif char != ')': + # Do not remove ')' from the URL itself. + skip = True + + newer_frag += char + + # Reverse fragment back + fragment = newer_frag[::-1] + + return fragment, u'(' * openp, u')' * closep + + def strip_punctuation(self, fragment): + match = re.search(punct_re, fragment) + if match: + return fragment[0:match.start()], match.group(0) + else: + return fragment, '' + + def handle_links(self, src_iter): + """Handle links in character tokens""" + for token in src_iter: + if token['type'] == 'Characters': + text = token['data'] + new_tokens = [] + end = 0 + + for match in url_re.finditer(text): + if match.start() > end: + new_tokens.append( + {u'type': u'Characters', u'data': text[end:match.start()]} + ) + + url = match.group(0) + prefix = suffix = '' + + # Sometimes we pick up ( and ), so drop them from the url + if url.startswith('('): + url, prefix, suffix = self.strip_wrapping_parentheses(url) + + if url.endswith(u')') and u'(' not in url: + new_url = url.rstrip(u')') + suffix = url[len(new_url):] + suffix + url = new_url + + # Sometimes we pick up . and , at the end of the url that's + # part of the sentence and not the url so drop it + url, punct_suffix = self.strip_punctuation(url) + if punct_suffix: + suffix = suffix + punct_suffix + + # If there's no protocol, add one + if re.search(proto_re, url): + href = url + else: + href = u'http://%s' % url + + attrs = { + (None, u'href'): href, + u'_text': url + } + attrs = self.apply_callbacks(attrs, True) + + if prefix: + new_tokens.append( + {u'type': u'Characters', u'data': prefix} + ) + + if attrs is None: + # Just add the text + new_tokens.append( + {u'type': u'Characters', u'data': url} + ) + + else: + # Add an "a" tag! 
+ _text = attrs.pop(u'_text', '') + attrs = alphabetize_attributes(attrs) + + new_tokens.extend([ + {u'type': u'StartTag', u'name': u'a', u'data': attrs}, + {u'type': u'Characters', u'data': force_unicode(_text)}, + {u'type': u'EndTag', u'name': 'a'}, + ]) + + if suffix: + new_tokens.append( + {u'type': u'Characters', u'data': suffix} + ) + + end = match.end() + + if new_tokens: + if end < len(text): + new_tokens.append({u'type': u'Characters', u'data': text[end:]}) + + for new_token in new_tokens: + yield new_token + + continue + + yield token + + def __iter__(self): + in_a = False + in_pre = False + + token_buffer = [] + + for token in super(LinkifyFilter, self).__iter__(): + if in_a: + # Handle the case where we're in an "a" tag--we want to buffer tokens + # until we hit an end "a" tag. + if token['type'] == 'EndTag' and token['name'] == 'a': + # We're no longer in an "a" tag, so we get all the things we + # need to apply callbacks and then figure out what to do with + # this "a" tag. + in_a = False + a_token = token_buffer[0] + if a_token['data']: + attrs = a_token['data'] + else: + attrs = {} + + text = self.extract_character_data(token_buffer) + attrs['_text'] = text + + attrs = self.apply_callbacks(attrs, False) + if attrs is None: + # We're dropping the "a" tag and everything else and replacing + # it with character data. So emit that token. 
+ yield {'type': 'Characters', 'data': text} + + else: + new_text = attrs.pop('_text', '') + # FIXME(willkg): add nofollow here + a_token['data'] = alphabetize_attributes(attrs) + + if text == new_text: + # The callbacks didn't change the text, so we yield the + # new "a" token, then whatever else was there, then the + # end "a" token + yield a_token + for mem in token_buffer[1:]: + yield mem + yield token + + else: + # If the callbacks changed the text, then we're going + # to drop all the tokens between the start and end "a" + # tags and replace it with the new text + yield a_token + yield {'type': 'Characters', 'data': force_unicode(new_text)} + yield token + + token_buffer = [] + continue + + else: + token_buffer.append(token) + continue + + elif token['type'] in ['StartTag', 'EmptyTag']: + if token['name'] == 'pre' and self.skip_pre: + # The "pre" tag starts a "special mode" where we don't linkify + # anything. + in_pre = True + + elif token['name'] == 'a': + # The "a" tag is special--we switch to a slurp mode and + # slurp all the tokens until the end "a" tag and then + # figure out what to do with them there. + in_a = True + token_buffer.append(token) + + # We buffer the start tag, so we don't want to yield it, + # yet + continue + + elif in_pre: + # NOTE(willkg): We put this clause here since in_a and + # switching in and out of is_a takes precedence. 
+ if token['type'] == 'EndTag' and token['name'] == 'pre': + in_pre = False + + elif not in_a and not in_pre and token['type'] == 'Characters': + new_stream = iter([token]) + if self.parse_email: + new_stream = self.handle_email_addresses(new_stream) + + new_stream = self.handle_links(new_stream) + + for token in new_stream: + yield token + + # We've already yielded this token, so continue + continue + + yield token diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 62bbf648..610dd903 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,22 +1,11 @@ from __future__ import unicode_literals -from collections import OrderedDict import re from xml.sax.saxutils import unescape from html5lib.constants import namespaces from html5lib.filters import sanitizer - -def _attr_key(attr): - """Returns appropriate key for sorting attribute names - - Attribute names are a tuple of ``(namespace, name)`` where namespace can be - ``None`` or a string. These can't be compared in Python 3, so we conver the - ``None`` to an empty string. 
- - """ - key = (attr[0][0] or ''), attr[0][1] - return key +from bleach.utils import alphabetize_attributes class BleachSanitizerFilter(sanitizer.Filter): @@ -60,9 +49,7 @@ def sanitize_token(self, token): if 'data' in token: # Alphabetize the attributes before calling .disallowed_token() # so that the resulting string is stable - token['data'] = OrderedDict( - [(key, val) for key, val in sorted(token['data'].items(), key=_attr_key)] - ) + token['data'] = alphabetize_attributes(token['data']) return self.disallowed_token(token) elif token_type == 'Comment': @@ -139,10 +126,7 @@ def allow_token(self, token): # At this point, we want to keep the attribute, so add it in attrs[namespaced_name] = val - # Alphabetize attributes - token['data'] = OrderedDict( - [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)] - ) + token['data'] = alphabetize_attributes(attrs) return token diff --git a/bleach/utils.py b/bleach/utils.py new file mode 100644 index 00000000..d9c211fc --- /dev/null +++ b/bleach/utils.py @@ -0,0 +1,23 @@ +from collections import OrderedDict + + +def _attr_key(attr): + """Returns appropriate key for sorting attribute names + + Attribute names are a tuple of ``(namespace, name)`` where namespace can be + ``None`` or a string. These can't be compared in Python 3, so we conver the + ``None`` to an empty string. 
+ + """ + key = (attr[0][0] or ''), attr[0][1] + return key + + +def alphabetize_attributes(attrs): + """Takes a dict of attributes (or None) and returns them alphabetized""" + if not attrs: + return attrs + + return OrderedDict( + [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)] + ) diff --git a/setup.cfg b/setup.cfg index 950364a7..69c6d1f2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,11 @@ test=pytest [flake8] -ignore = E731,W503 +ignore = + # E731: do not assign a lambda expression, use a def + E731, + # W503: line break occurred before a binary operator + W503 max-line-length = 100 [wheel] diff --git a/tests/test_basics.py b/tests/test_basics.py index 620a42da..e3f5d2da 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -1,4 +1,3 @@ -import html5lib from html5lib.filters.base import Filter import pytest import six @@ -183,17 +182,19 @@ def test_callable_attributes(self): def img_test(attr, val): return attr == 'src' and val.startswith('https') - ATTR = { + ATTRS = { 'img': img_test, } TAGS = ['img'] assert ( - bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + bleach.clean('foo blah baz', tags=TAGS, + attributes=ATTRS) == u'foo baz' ) assert ( - bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + bleach.clean('foo blah baz', tags=TAGS, + attributes=ATTRS) == u'foo baz' ) @@ -325,11 +326,6 @@ def test_rel_already_there(self): assert bleach.linkify(link_good) == link_good -def test_xml_render(): - parser = html5lib.HTMLParser() - assert bleach._render(parser.parseFragment('')) == '' - - def test_idempotent(): """Make sure that applying the filter twice doesn't change anything.""" dirty = 'invalid & < extra http://link.com' @@ -340,7 +336,8 @@ def test_idempotent(): linked = bleach.linkify(dirty) assert ( bleach.linkify(linked) == - 'invalid & < extra http://link.com' + 'invalid & < extra http://link.com' ) diff --git a/tests/test_links.py b/tests/test_links.py index 53d60e5c..1712d199 100644 --- 
a/tests/test_links.py +++ b/tests/test_links.py @@ -5,7 +5,8 @@ import pytest -from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC +from bleach import linkify, DEFAULT_CALLBACKS as DC +from bleach.linkifier import url_re def test_url_re(): @@ -51,8 +52,9 @@ def test_trailing_slash(): def test_mangle_link(): """We can muck with the href attribute of the link.""" def filter_url(attrs, new=False): - quoted = quote_plus(attrs['href']) - attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted) + if not attrs.get((None, 'href'), '').startswith('http://bouncer'): + quoted = quote_plus(attrs[(None, 'href')]) + attrs[(None, 'href')] = 'http://bouncer/?u={0!s}'.format(quoted) return attrs assert ( @@ -188,7 +190,7 @@ def test_set_attrs(): """We can set random attributes on links.""" def set_attr(attrs, new=False): - attrs['rev'] = 'canonical' + attrs[(None, u'rev')] = u'canonical' return attrs assert ( @@ -214,7 +216,7 @@ def only_proto(attrs, new=False): def test_stop_email(): """Returning None should prevent a link from being created.""" def no_email(attrs, new=False): - if attrs['href'].startswith('mailto:'): + if attrs[(None, 'href')].startswith('mailto:'): return None return attrs text = 'do not link james@example.com' @@ -276,14 +278,16 @@ def test_add_rel_nofollow(): def test_url_with_path(): assert ( linkify('http://example.com/path/to/file') == - 'http://example.com/path/to/file' + '' + 'http://example.com/path/to/file' ) def test_link_ftp(): assert ( linkify('ftp://ftp.mozilla.org/some/file') == - 'ftp://ftp.mozilla.org/some/file' + '' + 'ftp://ftp.mozilla.org/some/file' ) @@ -325,10 +329,8 @@ def test_escaped_html(): def test_link_http_complete(): assert ( linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f') == - ( - '' - 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f' - ) + '' + 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f' ) @@ -348,7 +350,8 @@ def test_unsafe_url(): """Any unsafe char ({}[]<>, etc.) 
in the path should end URL scanning.""" assert ( linkify('All your{"xx.yy.com/grover.png"}base are') == - 'All your{"xx.yy.com/grover.png"}base are' + 'All your{"xx.yy.com/grover.png"}' + 'base are' ) @@ -556,8 +559,35 @@ def test_remove_first_childlink(): def test_drop_link_tags(): """Verify that dropping link tags *just* drops the tag and not the content""" - html = """first second third fourth fifth""" + html = ( + 'first second third ' + 'fourth fifth' + ) assert ( linkify(html, callbacks=[lambda attrs, new: None]) == 'first second third fourth fifth' ) + + +@pytest.mark.parametrize('text, expected', [ + (u'<br>', u'<br>'), + ( + u'<br> http://example.com', + u'<br> http://example.com' + ), + ( + u'<br>
http://example.com', + u'<br>
http://example.com' + ) +]) +def test_naughty_unescaping(text, expected): + """Verify that linkify is not unescaping things it shouldn't be""" + assert linkify(text) == expected + + +def test_hang(): + """This string would hang linkify. Issue #200""" + assert ( + linkify("an@email.com", parse_email=True) == + 'an@email.com' + ) From f4333240eeb543b323e7f26d81dff58ce112fcb6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 21:03:21 -0500 Subject: [PATCH 072/314] Update CHANGES --- CHANGES | 67 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/CHANGES b/CHANGES index e8e49a8d..c2b91c1a 100644 --- a/CHANGES +++ b/CHANGES @@ -6,21 +6,68 @@ Version 2.0 (in development) **Backwards incompatible changes** -- Removed support for Python 2.6. #206 -- Removed support for Python 3.2. #224 -- Bleach no longer supports html5lib < 0.99999999 (8 9s). +* Removed support for Python 2.6. #206 - This version represents a rewrite to use the new sanitizing API since - the old one was dropped in html5lib 0.99999999 (8 9s). +* Removed support for Python 3.2. #224 + +* Bleach no longer supports html5lib < 0.99999999 (8 9s). + + This version is a rewrite to use the new sanitizing API since the old + one was dropped in html5lib 0.99999999 (8 9s). + +* ``bleach.clean`` and friends were rewritten + + ``clean`` is now implemented as an html5lib Filter and happens at a different + step in the HTML parsing -> traversing -> serializing process. Because of + that, there are some differences in clean's output as compared with previous + versions. + + Amongst other things, this version will add end tags even if the tag in + question is to be escaped. + +* ``bleach.linkify`` was rewritten + + ``linkify`` was reimplemented as an html5lib Filter. As such, it no longer + accepts a ``tokenizer`` argument. + + The callback functions for adjusting link attributes now takes a namespaced + attribute. 
+ + Previously you'd do something like this:: + + def check_protocol(attrs): + if not attrs.get('href', '').startswith('http:', 'https:')): + return None + return attrs + + Now it's more like this:: + + def check_protocol(attrs): + if not attrs.get((None, 'href'), '').startswith(('http:', 'https:')): + # ^^^^^^^^^^^^^^ + return None + return attrs -- linkify no longer accepts a tokenizer argument. -- clean output is different than in previous versions; particularly this version - will add end tags even if the tag will be escaped. **Changes** -- Supports Python 3.6. -- Supports html5lib >= 0.99999999 (8 9s). +* Supports Python 3.6. + +* Supports html5lib >= 0.99999999 (8 9s). + +* There's a ``bleach.Cleaner`` class that you can instantiate with your + favorite clean settings and reuse it. + +* There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib Filter. + +* You can pass ``bleach.linkifier.LinkifyFilter`` as a Filter to + ``bleach.Cleaner`` allowing you to clean and linkify in one pass. + +* Lots of bug fixes. + +* Test cleanup. + +* Documentation fixes. 
Version 1.5 (November 4th, 2016) From 1199a6323e4e632d6ec984646d8a1eb1aee937ec Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 3 Mar 2017 21:33:15 -0500 Subject: [PATCH 073/314] Update documentation --- CHANGES | 4 +- LICENSE | 2 +- docs/clean.rst | 65 +++++++++++++--------- docs/conf.py | 2 +- docs/linkify.rst | 141 ++++++++++++++++++++++++++++++++--------------- 5 files changed, 140 insertions(+), 74 deletions(-) diff --git a/CHANGES b/CHANGES index c2b91c1a..d7d9b0d2 100644 --- a/CHANGES +++ b/CHANGES @@ -35,14 +35,14 @@ Version 2.0 (in development) Previously you'd do something like this:: - def check_protocol(attrs): + def check_protocol(attrs, is_new): if not attrs.get('href', '').startswith('http:', 'https:')): return None return attrs Now it's more like this:: - def check_protocol(attrs): + def check_protocol(attrs, is_new): if not attrs.get((None, 'href'), '').startswith(('http:', 'https:')): # ^^^^^^^^^^^^^^ return None diff --git a/LICENSE b/LICENSE index 90a2cb9b..467c38e4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2014-2016, Mozilla Foundation +Copyright (c) 2014-2017, Mozilla Foundation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/docs/clean.rst b/docs/clean.rst index a65d8b93..e281e2ca 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -1,9 +1,9 @@ .. _clean-chapter: .. highlightlang:: python -================== -``bleach.clean()`` -================== +========================= +Sanitizing text fragments +========================= :py:func:`bleach.clean` is Bleach's HTML sanitization method. @@ -16,21 +16,15 @@ takes care of things like unclosed and (some) misnested tags. always return ``unicode``. -If you're cleaning a lot of text, you might want to create a -:py:class:`bleach.Cleaner` instance. - .. autofunction:: bleach.clean -.. 
autoclass:: bleach.Cleaner - :members: +Allowed tags (``tags``) +======================= -Tag Whitelist -============= - -The ``tags`` kwarg is a whitelist of allowed HTML tags. It should be a list, -tuple, or other iterable. Any other HTML tags will be escaped or stripped from -the text. +The ``tags`` kwarg specifies the allowed set of HTML tags. It should be a list, +tuple, or other iterable. Any HTML tags not in this list will be escaped or +stripped from the text. For example: @@ -49,8 +43,8 @@ The default value is a relatively conservative list found in ``bleach.ALLOWED_TAGS``. -Allowed Attributes -================== +Allowed Attributes (``attributes``) +=================================== The ``attributes`` kwarg lets you specify which attributes are allowed. @@ -139,8 +133,8 @@ allowed. Otherwise, it is stripped. For example: u'an example' -Styles Whitelist -================ +Allowed styles (``styles``) +=========================== If you allow the ``style`` attribute, you will also need to whitelist styles users are allowed to set, for example ``color`` and ``background-color``. @@ -172,8 +166,8 @@ For example, to allow users to set the color and font-weight of text: Default styles are stored in ``bleach.ALLOWED_STYLES``. -Protocol Whitelist -================== +Allowed protocols (``protocols``) +================================= If you allow tags that have attributes containing a URI value (like the ``href`` attribute of an anchor tag, you may want to adapt the accepted protocols. The @@ -208,8 +202,8 @@ This adds smb to the Bleach-specified set of allowed protocols: Default protocols are in ``bleach.ALLOWED_PROTOCOLS``. -Stripping Markup -================ +Stripping markup (``strip``) +============================ By default, Bleach *escapes* tags that aren't specified in the tags whitelist and invalid markup. 
For example: @@ -237,8 +231,8 @@ If you would rather Bleach stripped this markup entirely, you can pass u'is not allowed' -Stripping Comments -================== +Stripping comments (``strip_comments``) +======================================= By default, Bleach will strip out HTML comments. To disable this behavior, set ``strip_comments=False``: @@ -256,8 +250,8 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set u'my html' -html5lib Filters -================ +html5lib Filters (``filters``) +============================== Bleach sanitizing is implemented as an html5lib Filter. The consequence of this is that we can pass the streamed content through additional specified filters @@ -298,3 +292,22 @@ Trivial Filter example: Filters change the output of cleaning. Make sure that whatever changes the filter is applying maintain the safety guarantees of the output. + + +Using ``bleach.Cleaner`` +======================== + +If you're cleaning a lot of text, you might want to create a +:py:class:`bleach.Cleaner` instance. + +.. autoclass:: bleach.Cleaner + :members: + + +Using ``bleach.sanitizer.BleachSanitizerFilter`` +================================================ + +``bleach.clean`` creates a ``bleach.Cleaner`` which creates a +``bleach.sanitizer.BleachSanitizerFilter`` which does the sanitizing work. +``BleachSanitizerFilter`` is an html5lib Filter and can be used anywhere you can +use an html5lib Filter. diff --git a/docs/conf.py b/docs/conf.py index e186c827..1d257d01 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -43,7 +43,7 @@ # General information about the project. 
project = u'Bleach' -copyright = u'2012-2015, James Socol; 2015-2016, Mozilla Foundation' +copyright = u'2012-2015, James Socol; 2015-2017, Mozilla Foundation' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/docs/linkify.rst b/docs/linkify.rst index 705000c2..6fe032ed 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -1,32 +1,31 @@ .. _linkify-chapter: .. highlightlang:: python -==================== -``bleach.linkify()`` -==================== +========================= +Linkifying text fragments +========================= ``linkify()`` searches text for links, URLs, and email addresses and lets you -control how and when those links are rendered:: - - def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - parse_email=False): - """Convert URL-like strings in an HTML fragment to links. +control how and when those links are rendered. ``linkify()`` works by building a document tree, so it's guaranteed never to do weird things to URLs in attribute values, can modify the value of attributes on ```` tags, and can even do things like skip ``
`` sections.
 
-By default, ``linkify()`` will perform some sanitization, only allowing a set
-of "safe" tags. Because it uses the HTML5 parsing algorithm, it will always
-handle things like unclosed tags.
+By default, ``linkify()`` will perform some sanitization, only allowing a set of
+"safe" tags. Because it uses the HTML5 parsing algorithm, it will always handle
+things like unclosed tags.
 
 .. note::
+
    You may pass a ``string`` or ``unicode`` object, but Bleach will always
    return ``unicode``.
 
+.. autofunction:: bleach.linkify
+
 
-Callbacks
-=========
+Callbacks for adjusting attributes (``callbacks``)
+==================================================
 
 The second argument to ``linkify()`` is a list or other iterable of callback
 functions. These callbacks can modify links that exist and links that are being
@@ -36,20 +35,23 @@ Each callback will get the following arguments::
 
     def my_callback(attrs, new=False):
 
-The ``attrs`` argument is a dict of attributes of the ```` tag. The ``new``
-argument is a boolean indicating if the link is new (e.g. an email address or
-URL found in the text) or already existed (e.g. an ```` tag found in the
-text). The ``attrs`` dict also contains a ``_text`` key, which is the innerText
-of the ```` tag.
+The ``attrs`` argument is a dict of attributes of the ```` tag. Keys of the
+``attrs`` dict are namespaced attr names. For example ``(None, 'href')``. The
+``attrs`` dict also contains a ``_text`` key, which is the innerText of the
+```` tag.
 
-The callback must return a dict of attributes (including ``_text``) or
-``None``. The new dict of attributes will be passed to the next callback in the
-list. If any callback returns ``None``, the link will not be created and the
-original text left in place, or will be removed, and its original innerText
-left in place.
+The ``new`` argument is a boolean indicating if the link is new (e.g. an email
+address or URL found in the text) or already existed (e.g. an ```` tag found
+in the text).
 
-The default value is simply to add ``rel="nofollow"``. See ``bleach.callbacks``
-for some included callback functions.
+The callback must return a dict of attributes (including ``_text``) or ``None``.
+The new dict of attributes will be passed to the next callback in the list.
+
+If any callback returns ``None``, new links will not be created and existing
+links will be removed, leaving the innerText in its place.
+
+The default callback adds ``rel="nofollow"``. See ``bleach.callbacks`` for some
+included callback functions.
 
 
 Setting Attributes
@@ -59,22 +61,24 @@ For example, to set ``rel="nofollow"`` on all links found in the text, a simple
 (and included) callback might be::
 
     def set_nofollow(attrs, new=False):
-        attrs['rel'] = 'nofollow'
+        attrs[(None, 'rel')] = 'nofollow'
         return attrs
 
-This would overwrite the value of the ``rel`` attribute if it was set.
 
-You could also make external links open in a new tab, or set a class::
+This would set the value of the ``rel`` attribute, stomping on a previous value
+if there was one.
+
+You could also make external links open in a new tab or set a class::
 
     from urlparse import urlparse
 
     def set_target(attrs, new=False):
-        p = urlparse(attrs['href'])
+        p = urlparse(attrs[(None, 'href')])
         if p.netloc not in ['my-domain.com', 'other-domain.com']:
-            attrs['target'] = '_blank'
-            attrs['class'] = 'external'
+            attrs[(None, 'target')] = '_blank'
+            attrs[(None, 'class')] = 'external'
         else:
-            attrs.pop('target', None)
+            attrs.pop((None, 'target'), None)
         return attrs
 
 
@@ -89,18 +93,20 @@ sanitizing attributes.)
 
     def allowed_attributes(attrs, new=False):
         """Only allow href, target, rel and title."""
-        allowed = ['href', 'target', 'rel', 'title']
+        allowed = [(None, 'href'), (None, 'target'),
+                   (None, 'rel'), (None, 'title')]
         return dict((k, v) for k, v in attrs.items() if k in allowed)
 
+
 Or you could remove a specific attribute, if it exists::
 
     def remove_title1(attrs, new=False):
-        attrs.pop('title', None)
+        attrs.pop((None, 'title'), None)
         return attrs
 
     def remove_title2(attrs, new=False):
-        if 'title' in attrs:
-            del attrs['title']
+        if (None, 'title') in attrs:
+            del attrs[(None, 'title')]
         return attrs
 
 
@@ -117,6 +123,7 @@ limit the length of text inside an ```` tag.
         """Shorten overly-long URLs in the text."""
         if not new:  # Only looking at newly-created links.
             return attrs
+
         # _text will be the same as the URL for new links.
         text = attrs['_text']
         if len(text) > 25:
@@ -130,10 +137,10 @@ limit the length of text inside an ```` tag.
 
     def outgoing_bouncer(attrs, new=False):
         """Send outgoing links through a bouncer."""
-        p = urlparse(attrs['href'])
+        p = urlparse(attrs[(None, 'href')])
         if p.netloc not in ['my-domain.com', 'www.my-domain.com', '']:
             bouncer = 'http://outgoing.my-domain.com/?destination=%s'
-            attrs['href'] = bouncer % quote(attrs['href'])
+            attrs[(None, 'href')] = bouncer % quote(attrs[(None, 'href')])
         return attrs
 
 
@@ -151,7 +158,7 @@ write the following callback::
             return attrs
 
         # If the TLD is '.py', make sure it starts with http: or https:
-        href = attrs['href']
+        href = attrs[(None, 'href')]
         if href.endswith('.py') and not href.startswith(('http:', 'https:')):
             # This looks like a Python file, not a URL. Don't make a link.
             return None
@@ -168,13 +175,13 @@ If you want to remove certain links, even if they are written in the text with
 
     def remove_mailto(attrs, new=False):
         """Remove any mailto: links."""
-        if attrs['href'].startswith('mailto:'):
+        if attrs[(None, 'href')].startswith('mailto:'):
             return None
         return attrs
 
 
-``skip_pre``
-============
+Skipping links in pre blocks (``skip_pre``)
+===========================================
 
 ``
`` tags are often special, literal sections. If you don't want to create
 any new links within a ``
`` section, pass ``skip_pre=True``.
@@ -184,8 +191,8 @@ any new links within a ``
`` section, pass ``skip_pre=True``.
    tags will still be passed through all the callbacks.
 
 
-``parse_email``
-===============
+Linkifying email addresses (``parse_email``)
+============================================
 
 By default, ``linkify()`` does not create ``mailto:`` links for email
 addresses, but if you pass ``parse_email=True``, it will. ``mailto:`` links
@@ -194,4 +201,50 @@ they are newly created or already in the text, so be careful when writing
 callbacks that may need to behave differently if the protocol is ``mailto:``.
 
 
+Using ``bleach.linkifier.LinkifyFilter``
+========================================
+
+``bleach.linkify`` works by parsing an HTML fragment and then running it through
+the ``bleach.linkifier.LinkifyFilter`` when walking the tree and serializing it
+back into text.
+
+You can use this filter wherever you can use an html5lib Filter. For example, you
+could use it with ``bleach.Cleaner`` to clean and linkify in one step.
+
+For example, using all the defaults:
+
+.. doctest::
+
+   >>> from functools import partial
+
+   >>> from bleach import Cleaner
+   >>> from bleach.linkifier import LinkifyFilter
+
+   >>> cleaner = Cleaner(tags=['pre'])
+   >>> cleaner.clean('
http://example.com
') + u'
http://example.com
' + + >>> cleaner = Cleaner(tags=['pre'], filters=[LinkifyFilter]) + >>> cleaner.clean('
http://example.com
') + u'
http://example.com
' + + +And passing parameters to ``LinkifyFilter``: + +.. doctest:: + + >>> from functools import partial + + >>> from bleach import Cleaner + >>> from bleach.linkifier import LinkifyFilter + + >>> cleaner = Cleaner( + ... tags=['pre'], + ... filters=[partial(LinkifyFilter, skip_pre=True)] + ... ) + ... + >>> cleaner.clean('
http://example.com
') + u'
http://example.com
' + + .. _Crate: https://crate.io/ From ddc39ec4a30c5a378976ca97664939c67420ebe5 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 4 Mar 2017 09:44:23 -0500 Subject: [PATCH 074/314] Minor code cleanup and comments --- bleach/linkifier.py | 159 ++++++++++++++++++++++++++++---------------- bleach/sanitizer.py | 8 ++- 2 files changed, 110 insertions(+), 57 deletions(-) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index b4ba2ea8..c6a8486a 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -54,6 +54,18 @@ class LinkifyFilter(Filter): + """html5lib filter that linkifies text + + This will do the following: + + * convert email addresses into links + * convert urls into links + * edit existing links by running them through callbacks--the default is to + add a ``rel="nofollow"`` + + This filter can be used anywhere html5lib filters can be used. + + """ def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False): super(LinkifyFilter, self).__init__(source) @@ -62,6 +74,13 @@ def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False): self.parse_email = parse_email def apply_callbacks(self, attrs, is_new): + """Given an attrs dict and an is_new bool, runs through callbacks + + Callbacks can return an adjusted attrs dict or None. In the case of + None, we stop going through callbacks and return that and the link gets + dropped. + + """ for cb in self.callbacks: attrs = cb(attrs, is_new) if attrs is None: @@ -70,6 +89,23 @@ def apply_callbacks(self, attrs, is_new): def extract_character_data(self, token_list): """Extracts and squashes character sequences in a token stream""" + # FIXME(willkg): This is a terrible idea. What it does is drop all the + # tags from the token list and merge the Characters and SpaceCharacters + # tokens into a single text. + # + # So something like this:: + # + # "" "" "some text" "" "" + # + # gets converted to "some text". 
+ # + # This gets used to figure out the ``_text`` fauxttribute value for + # linkify callables. + # + # I'm not really sure how else to support that ``_text`` fauxttribute and + # maintain some modicum of backwards compatability with previous versions + # of Bleach. + out = [] for token in token_list: token_type = token['type'] @@ -86,6 +122,7 @@ def handle_email_addresses(self, src_iter): new_tokens = [] end = 0 + # For each email address we find in the text for match in email_re.finditer(text): if match.start() > end: new_tokens.append( @@ -101,13 +138,13 @@ def handle_email_addresses(self, src_iter): attrs = self.apply_callbacks(attrs, True) if attrs is None: - # Just add the text + # Just add the text--but not as a link new_tokens.append( {u'type': u'Characters', u'data': match.group(0)} ) else: - # Add a "a" tag + # Add an "a" tag for the new link _text = attrs.pop(u'_text', '') attrs = alphabetize_attributes(attrs) new_tokens.extend([ @@ -118,6 +155,8 @@ def handle_email_addresses(self, src_iter): end = match.end() if new_tokens: + # Yield the adjusted set of tokens and then continue + # through the loop if end < len(text): new_tokens.append({u'type': u'Characters', u'data': text[end:]}) @@ -128,46 +167,58 @@ def handle_email_addresses(self, src_iter): yield token - def strip_wrapping_parentheses(self, fragment): - """Strips wrapping parentheses""" + def strip_parentheses(self, fragment): + """Strips parentheses from before and after url""" openp = closep = 0 # Count consecutive opening parentheses at the beginning of the # fragment (string) - for char in fragment: - if char == '(': - openp += 1 - else: - break - - if openp: - newer_frag = '' - # Cut the consecutive opening brackets from the fragment - fragment = fragment[openp:] - - # Reverse the fragment for easier detection of parentheses - # inside the URL - reverse_fragment = fragment[::-1] - skip = False - for char in reverse_fragment: - if char == ')' and closep < openp and not skip: - # Remove the 
closing parentheses if it has a matching - # opening parentheses (they are balanced). - closep += 1 - continue - - elif char != ')': - # Do not remove ')' from the URL itself. - skip = True - - newer_frag += char - - # Reverse fragment back - fragment = newer_frag[::-1] + if fragment.startswith(u'('): + for char in fragment: + if char == '(': + openp += 1 + else: + break + + if openp: + newer_frag = '' + + # Cut the consecutive opening brackets from the fragment + fragment = fragment[openp:] + + # Reverse the fragment for easier detection of parentheses + # inside the URL + reverse_fragment = fragment[::-1] + skip = False + for char in reverse_fragment: + if char == ')' and closep < openp and not skip: + # Remove the closing parentheses if it has a matching + # opening parentheses (they are balanced). + closep += 1 + continue + + elif char != ')': + # Do not remove ')' from the URL itself. + skip = True + + newer_frag += char + + # Reverse fragment back + fragment = newer_frag[::-1] + + # Sometimes we pick up ) at the end of a url, but the url is in a + # parenthesized phrase like: + # + # "i looked at the site (at http://example.com)" + if fragment.endswith(u')') and u'(' not in fragment: + new_fragment = fragment.rstrip(u')') + closep += (len(fragment) - len(new_fragment)) + fragment = new_fragment return fragment, u'(' * openp, u')' * closep def strip_punctuation(self, fragment): + """Strips punctuation at the end of a url match""" match = re.search(punct_re, fragment) if match: return fragment[0:match.start()], match.group(0) @@ -192,19 +243,12 @@ def handle_links(self, src_iter): prefix = suffix = '' # Sometimes we pick up ( and ), so drop them from the url - if url.startswith('('): - url, prefix, suffix = self.strip_wrapping_parentheses(url) - - if url.endswith(u')') and u'(' not in url: - new_url = url.rstrip(u')') - suffix = url[len(new_url):] + suffix - url = new_url + url, prefix, suffix = self.strip_parentheses(url) # Sometimes we pick up . 
and , at the end of the url that's # part of the sentence and not the url so drop it url, punct_suffix = self.strip_punctuation(url) - if punct_suffix: - suffix = suffix + punct_suffix + suffix = suffix + punct_suffix # If there's no protocol, add one if re.search(proto_re, url): @@ -218,19 +262,20 @@ def handle_links(self, src_iter): } attrs = self.apply_callbacks(attrs, True) - if prefix: - new_tokens.append( - {u'type': u'Characters', u'data': prefix} - ) - if attrs is None: # Just add the text new_tokens.append( - {u'type': u'Characters', u'data': url} + {u'type': u'Characters', u'data': prefix + url + suffix} ) else: - # Add an "a" tag! + # Add the "a" tag! + + if prefix: + new_tokens.append( + {u'type': u'Characters', u'data': prefix} + ) + _text = attrs.pop(u'_text', '') attrs = alphabetize_attributes(attrs) @@ -240,14 +285,16 @@ def handle_links(self, src_iter): {u'type': u'EndTag', u'name': 'a'}, ]) - if suffix: - new_tokens.append( - {u'type': u'Characters', u'data': suffix} - ) + if suffix: + new_tokens.append( + {u'type': u'Characters', u'data': suffix} + ) end = match.end() if new_tokens: + # Yield the adjusted set of tokens and then continue + # through the loop if end < len(text): new_tokens.append({u'type': u'Characters', u'data': text[end:]}) @@ -334,9 +381,9 @@ def __iter__(self): # yet continue - elif in_pre: + elif in_pre and self.skip_pre: # NOTE(willkg): We put this clause here since in_a and - # switching in and out of is_a takes precedence. + # switching in and out of in_a takes precedence. if token['type'] == 'EndTag' and token['name'] == 'pre': in_pre = False diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 610dd903..18ce49f4 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -9,6 +9,11 @@ class BleachSanitizerFilter(sanitizer.Filter): + """html5lib Filter that sanitizes text + + This filter can be used anywhere html5lib filters can be used. 
+ + """ def __init__(self, source, allowed_attributes_map, strip_disallowed_elements=False, strip_html_comments=True, **kwargs): @@ -60,6 +65,7 @@ def sanitize_token(self, token): return token def allow_token(self, token): + """Handles the case where we're allowing the tag""" if 'data' in token: allowed_attributes = self.allowed_attributes_map.get(token['name'], []) if not callable(allowed_attributes): @@ -131,7 +137,7 @@ def allow_token(self, token): return token def sanitize_css(self, style): - """html5lib sanitizer filter replacement to fix issues""" + """Sanitizes css in style tags""" # disallow urls style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) From 6968e5d8eec7be235ea167d97c73ea2937ad0a59 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 4 Mar 2017 09:52:37 -0500 Subject: [PATCH 075/314] Add tests for alphabetize_attributes --- tests/test_utils.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tests/test_utils.py diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..076617df --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,44 @@ +from collections import OrderedDict + +from bleach.utils import alphabetize_attributes + + +class TestAlphabeticalAttributes: + def test_empty_cases(self): + assert alphabetize_attributes(None) is None + + assert alphabetize_attributes({}) == {} + + def test_ordering(self): + assert ( + alphabetize_attributes({ + (None, 'a'): 1, + (None, 'b'): 2 + }) == + OrderedDict([ + ((None, 'a'), 1), + ((None, 'b'), 2) + ]) + ) + assert ( + alphabetize_attributes({ + (None, 'b'): 1, + (None, 'a'): 2} + ) == + OrderedDict([ + ((None, 'a'), 2), + ((None, 'b'), 1) + ]) + ) + + def test_different_namespaces(self): + assert ( + alphabetize_attributes({ + ('xlink', 'href'): 'abc', + (None, 'alt'): '123' + }) == + OrderedDict([ + ((None, 'alt'), '123'), + (('xlink', 'href'), 'abc') + ]) + ) From 
a1a85e9226e2be45a4eded6680b5ede4b2fa1e4c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 4 Mar 2017 10:37:37 -0500 Subject: [PATCH 076/314] Fix handling for over-eager url matching --- bleach/linkifier.py | 100 +++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 58 deletions(-) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index c6a8486a..04ab8275 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -40,8 +40,6 @@ proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) -punct_re = re.compile(r'([\.,]+)$') - email_re = re.compile( r"""(? Date: Sat, 4 Mar 2017 10:41:20 -0500 Subject: [PATCH 077/314] Add tests from #78 --- tests/test_links.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_links.py b/tests/test_links.py index 1712d199..8e166543 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -467,6 +467,14 @@ def test_sarcasm(): '(http://en.wikipedia.org/wiki/)Test_(assessment', ('(', 'en.wikipedia.org/wiki/)Test_(assessment', 'http://en.wikipedia.org/wiki/)Test_(assessment', '') + ), + ( + 'hello (http://www.mu.de/blah.html) world', + ('hello (', 'www.mu.de/blah.html', 'http://www.mu.de/blah.html', ') world') + ), + ( + 'hello (http://www.mu.de/blah.html). world', + ('hello (', 'www.mu.de/blah.html', 'http://www.mu.de/blah.html', '). world') ) ]) def test_wrapping_parentheses(data, expected_data): From bb44c173700853c0da33a1cbb43632e95f54e885 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 4 Mar 2017 10:53:43 -0500 Subject: [PATCH 078/314] Move "a" tag handling to a method --- bleach/linkifier.py | 92 +++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index 04ab8275..a3e46009 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -254,7 +254,6 @@ def handle_links(self, src_iter): else: # Add the "a" tag! 
- if prefix: new_tokens.append( {u'type': u'Characters', u'data': prefix} @@ -289,6 +288,49 @@ def handle_links(self, src_iter): yield token + def handle_a_tag(self, token_buffer): + """Handle the "a" tag + + This could adjust the link or drop it altogether depending on what the + callbacks return. + + This yields the new set of tokens. + + """ + a_token = token_buffer[0] + if a_token['data']: + attrs = a_token['data'] + else: + attrs = {} + text = self.extract_character_data(token_buffer) + attrs['_text'] = text + + attrs = self.apply_callbacks(attrs, False) + + if attrs is None: + # We're dropping the "a" tag and everything else and replacing + # it with character data. So emit that token. + yield {'type': 'Characters', 'data': text} + + else: + new_text = attrs.pop('_text', '') + a_token['data'] = alphabetize_attributes(attrs) + + if text == new_text: + # The callbacks didn't change the text, so we yield the new "a" + # token, then whatever else was there, then the end "a" token + yield a_token + for mem in token_buffer[1:]: + yield mem + + else: + # If the callbacks changed the text, then we're going to drop + # all the tokens between the start and end "a" tags and replace + # it with the new text + yield a_token + yield {'type': 'Characters', 'data': force_unicode(new_text)} + yield token_buffer[-1] + def __iter__(self): in_a = False in_pre = False @@ -300,47 +342,15 @@ def __iter__(self): # Handle the case where we're in an "a" tag--we want to buffer tokens # until we hit an end "a" tag. if token['type'] == 'EndTag' and token['name'] == 'a': - # We're no longer in an "a" tag, so we get all the things we - # need to apply callbacks and then figure out what to do with - # this "a" tag. 
- in_a = False - a_token = token_buffer[0] - if a_token['data']: - attrs = a_token['data'] - else: - attrs = {} - - text = self.extract_character_data(token_buffer) - attrs['_text'] = text - - attrs = self.apply_callbacks(attrs, False) - if attrs is None: - # We're dropping the "a" tag and everything else and replacing - # it with character data. So emit that token. - yield {'type': 'Characters', 'data': text} - - else: - new_text = attrs.pop('_text', '') - # FIXME(willkg): add nofollow here - a_token['data'] = alphabetize_attributes(attrs) - - if text == new_text: - # The callbacks didn't change the text, so we yield the - # new "a" token, then whatever else was there, then the - # end "a" token - yield a_token - for mem in token_buffer[1:]: - yield mem - yield token - - else: - # If the callbacks changed the text, then we're going - # to drop all the tokens between the start and end "a" - # tags and replace it with the new text - yield a_token - yield {'type': 'Characters', 'data': force_unicode(new_text)} - yield token + # Add the end tag to the token buffer and then handle them + # and yield anything returned + token_buffer.append(token) + for new_token in self.handle_a_tag(token_buffer): + yield new_token + # Clear "a" related state and continue since we've yielded all + # the tokens we're going to yield + in_a = False token_buffer = [] continue From 460aa6d3a95f5cc89f9589e8398491f3a5e2180d Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 6 Mar 2017 13:55:05 -0500 Subject: [PATCH 079/314] Restructure linkify; clean up __init__; update docs This is a ton of changes all in one commit. Sorry. * Restructures linkify so it mirrors clean. This has the nicety in that the two are parallel and can be used the same. 
* Rework how url_re and email_re work so that it's possible to override them and/or provide your own list of allowed procotols and TLDs * Overhaul the docs including converting linkify examples to doctest * Update CHANGES --- CHANGES | 23 ++-- bleach/__init__.py | 245 +++++++++--------------------------------- bleach/linkifier.py | 100 ++++++++++++++--- bleach/sanitizer.py | 142 ++++++++++++++++++++++++ docs/clean.rst | 38 ++++--- docs/linkify.rst | 250 +++++++++++++++++++++++++++++-------------- tests/test_basics.py | 8 +- tests/test_links.py | 44 ++++++-- 8 files changed, 518 insertions(+), 332 deletions(-) diff --git a/CHANGES b/CHANGES index d7d9b0d2..9afe859f 100644 --- a/CHANGES +++ b/CHANGES @@ -17,7 +17,7 @@ Version 2.0 (in development) * ``bleach.clean`` and friends were rewritten - ``clean`` is now implemented as an html5lib Filter and happens at a different + ``clean`` was reimplemented as an html5lib filter and happens at a different step in the HTML parsing -> traversing -> serializing process. Because of that, there are some differences in clean's output as compared with previous versions. @@ -43,11 +43,14 @@ Version 2.0 (in development) Now it's more like this:: def check_protocol(attrs, is_new): - if not attrs.get((None, 'href'), '').startswith(('http:', 'https:')): + if not attrs.get((None, u'href'), u'').startswith(('http:', 'https:')): # ^^^^^^^^^^^^^^ return None return attrs + Further, you need to make sure you're always using unicode values. If you + don't then html5lib will raise an assertion error that the value is not + unicode. **Changes** @@ -55,17 +58,19 @@ Version 2.0 (in development) * Supports html5lib >= 0.99999999 (8 9s). -* There's a ``bleach.Cleaner`` class that you can instantiate with your - favorite clean settings and reuse it. +* There's a ``bleach.sanitizer.Cleaner`` class that you can instantiate with your + favorite clean settings for easy reuse. -* There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib Filter. 
+* There's a ``bleach.linkifier.Linker`` class that you can instantiate with your + favorite linkify settings for easy reuse. -* You can pass ``bleach.linkifier.LinkifyFilter`` as a Filter to - ``bleach.Cleaner`` allowing you to clean and linkify in one pass. +* There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib filter that + you can pass as a filter to ``bleach.Cleaner`` allowing you to clean and + linkify in one pass. -* Lots of bug fixes. +* Tons of bug fixes. -* Test cleanup. +* Cleaned up tests. * Documentation fixes. diff --git a/bleach/__init__.py b/bleach/__init__.py index 0155a127..07b5075c 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -1,169 +1,28 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import logging -import re - -import html5lib -from html5lib.filters import sanitizer -from html5lib.filters.sanitizer import allowed_protocols -from html5lib.serializer import HTMLSerializer - -from bleach import callbacks as linkify_callbacks -from bleach.encoding import force_unicode -from bleach.linkifier import LinkifyFilter -from bleach.sanitizer import BleachSanitizerFilter -from bleach.version import __version__, VERSION # flake8: noqa - -__all__ = ['Cleaner', 'clean', 'linkify'] - -log = logging.getLogger(__name__) -log.addHandler(logging.NullHandler()) - -ALLOWED_TAGS = [ - 'a', - 'abbr', - 'acronym', - 'b', - 'blockquote', - 'code', - 'em', - 'i', - 'li', - 'ol', - 'strong', - 'ul', -] - -ALLOWED_ATTRIBUTES = { - 'a': ['href', 'title'], - 'abbr': ['title'], - 'acronym': ['title'], -} - -ALLOWED_STYLES = [] - -ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] - -ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) -# a simple routine that returns the tag name with the namespace prefix -# as returned by etree's Element.tag attribute - -DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] - - -class Cleaner(object): - """Cleaner for cleaning HTML fragments of malicious content - - This cleaner is a 
security-focused function whose sole purpose is to remove - malicious content from a string such that it can be displayed as content in - a web page. - - This cleaner is not designed to use to transform content to be used in - non-web-page contexts. - - To use:: - - from bleach import Cleaner - - cleaner = Cleaner() - - for text in all_the_yucky_things: - sanitized = cleaner.clean(text) - - """ - - def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, - styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, - strip_comments=True, filters=None): - """Initializes a Cleaner - - :arg tags: whitelist of allowed tags; defaults to - ``bleach.ALLOWED_TAGS`` - - :arg attributes: whitelist of allowed attributes; defaults to - ``bleach.ALLOWED_ATTRIBUTES`` - - :arg styles: whitelist of allowed css; defaults to - ``bleach.ALLOWED_STYLES`` - - :arg protocols: whitelist of allowed protocols for links; defaults - to ``bleach.ALLOWED_PROTOCOLS`` - - :arg strip: whether or not to strip disallowed elements - - :arg strip_comments: whether or not to strip HTML comments - - :arg filters: list of html5lib Filter classes to pass streamed content through - - See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters - .. Warning:: - - Using filters changes the output of ``bleach.Cleaner.clean``. - Make sure the way the filters change the output are secure. 
- - """ - self.tags = tags - self.attributes = attributes - self.styles = styles - self.protocols = protocols - self.strip = strip - self.strip_comments = strip_comments - self.filters = filters or [] - - self.parser = html5lib.HTMLParser(namespaceHTMLElements=False) - self.walker = html5lib.getTreeWalker('etree') - self.serializer = HTMLSerializer( - quote_attr_values='always', - omit_optional_tags=False, - - # Bleach has its own sanitizer, so don't use the html5lib one - sanitize=False, - - # Bleach sanitizer alphabetizes already, so don't use the html5lib one - alphabetical_attributes=False, - ) - - def clean(self, text): - """Cleans text and returns sanitized result as unicode - - :arg str text: text to be cleaned - - :returns: sanitized text as unicode - - """ - if not text: - return u'' - - text = force_unicode(text) - - dom = self.parser.parseFragment(text) - filtered = BleachSanitizerFilter( - source=self.walker(dom), - - # Bleach-sanitizer-specific things - allowed_attributes_map=self.attributes, - strip_disallowed_elements=self.strip, - strip_html_comments=self.strip_comments, - - # html5lib-sanitizer things - allowed_elements=self.tags, - allowed_css_properties=self.styles, - allowed_protocols=self.protocols, - allowed_svg_properties=[], - ) - - # Apply any filters after the BleachSanitizerFilter - for filter_class in self.filters: - filtered = filter_class(source=filtered) +from bleach.linkifier import ( + DEFAULT_CALLBACKS, + Linker, + LinkifyFilter, +) +from bleach.sanitizer import ( + ALLOWED_ATTRIBUTES, + ALLOWED_PROTOCOLS, + ALLOWED_STYLES, + ALLOWED_TAGS, + BleachSanitizerFilter, + Cleaner, +) +from bleach.version import __version__, VERSION # flake8: noqa - return self.serializer.render(filtered) +__all__ = ['clean', 'linkify'] def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, - strip_comments=True, filters=None): + strip_comments=True): """Clean an HTML fragment of 
malicious content and return it This function is a security-focused function whose sole purpose is to @@ -182,36 +41,27 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, .. Note:: - If you're cleaning a lot of text and passing the same argument - values, consider caching a ``Cleaner`` instance. + If you're cleaning a lot of text and passing the same argument values or + you want more configurability, consider using a + :py:class:`bleach.sanitizer.Cleaner` instance. - :arg text: the text to clean + :arg str text: the text to clean - :arg tags: whitelist of allowed tags; defaults to + :arg list tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS`` - :arg attributes: whitelist of allowed attributes; defaults to + :arg dict attributes: whitelist of allowed attributes; defaults to ``bleach.ALLOWED_ATTRIBUTES`` - :arg styles: whitelist of allowed css; defaults to + :arg list styles: whitelist of allowed css; defaults to ``bleach.ALLOWED_STYLES`` - :arg protocols: whitelist of allowed protocols for links; defaults + :arg list protocols: whitelist of allowed protocols for links; defaults to ``bleach.ALLOWED_PROTOCOLS`` - :arg strip: whether or not to strip disallowed elements - - :arg strip_comments: whether or not to strip HTML comments - - :arg filters: list of html5lib Filter classes to pass streamed content through - - See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters + :arg bool strip: whether or not to strip disallowed elements - .. Warning:: - - Using filters changes the output of - ``bleach.Cleaner.clean``. Make sure the way the filters - change the output are secure. 
+ :arg bool strip_comments: whether or not to strip HTML comments :returns: cleaned text as unicode @@ -223,7 +73,6 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, protocols=protocols, strip=strip, strip_comments=strip_comments, - filters=filters, ) return cleaner.clean(text) @@ -231,40 +80,42 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False): """Convert URL-like strings in an HTML fragment to links - ``linkify()`` converts strings that look like URLs, domain names and email + This function converts strings that look like URLs, domain names and email addresses in text that may be an HTML fragment to links, while preserving: 1. links already in the string 2. urls found in attributes 3. email addresses - ``linkify()`` does a best-effort approach and tries to recover from bad + linkify does a best-effort approach and tries to recover from bad situations due to crazy text. - """ - parser = html5lib.HTMLParser(namespaceHTMLElements=False) - walker = html5lib.getTreeWalker('etree') - serializer = HTMLSerializer( - quote_attr_values='always', - omit_optional_tags=False, + .. Note:: - # Bleach has its own sanitizer, so don't use the html5lib one - sanitize=False, + If you're linking a lot of text and passing the same argument values or + you want more configurability, consider using a + :py:class:`bleach.linkifier.Linker` instance. - # Bleach sanitizer alphabetizes already, so don't use the html5lib one - alphabetical_attributes=False, - ) + .. Note:: + + If you have text that you want to clean and then linkify, consider using + the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean + pass. That way you're not parsing the HTML twice. 
+ + :arg str text: the text to linkify - text = force_unicode(text) + :arg list callbacks: list of callbacks to run when adjusting tag attributes - if not text: - return u'' + :arg bool skip_pre: whether or not to skip linkifying text in a ``pre`` tag - dom = parser.parseFragment(text) - filtered = LinkifyFilter( - source=walker(dom), + :arg bool parse_email: whether or not to linkify email addresses + + :returns: linkified text as unicode + + """ + linker = Linker( callbacks=callbacks, skip_pre=skip_pre, parse_email=parse_email ) - return serializer.render(filtered) + return linker.linkify(text) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index a3e46009..1396f056 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -1,14 +1,19 @@ from __future__ import unicode_literals import re +import html5lib from html5lib.filters.base import Filter +from html5lib.filters.sanitizer import allowed_protocols +from html5lib.serializer import HTMLSerializer -from bleach import allowed_protocols +from bleach import callbacks as linkify_callbacks from bleach.encoding import force_unicode from bleach.utils import alphabetize_attributes -# FIXME(willkg): Move this to a constants module. +DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] + + TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk @@ -27,20 +32,37 @@ TLDS.reverse() -url_re = re.compile( - r"""\(* # Match any opening parentheses. - \b(?"]*)? 
- # /path/zz (excluding "unsafe" chars from RFC 1738, - # except for # and ~, which happen in practice) - """.format('|'.join(allowed_protocols), '|'.join(TLDS)), - re.IGNORECASE | re.VERBOSE | re.UNICODE) +def build_url_re(tlds=TLDS, protocols=allowed_protocols): + """Builds the url regex used by linkifier + + If you want a different set of tlds or allowed protocols, pass those in + and stomp on the existing ``url_re``:: + + from bleach import linkifier + + my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols) + + linker = LinkifyFilter(url_re=my_url_re) + + """ + return re.compile( + r"""\(* # Match any opening parentheses. + \b(?"]*)? + # /path/zz (excluding "unsafe" chars from RFC 1738, + # except for # and ~, which happen in practice) + """.format('|'.join(protocols), '|'.join(tlds)), + re.IGNORECASE | re.VERBOSE | re.UNICODE) + + +URL_RE = build_url_re() + +PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) -proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) -email_re = re.compile( +EMAIL_RE = re.compile( r"""(? 
end: new_tokens.append( {u'type': u'Characters', u'data': text[end:match.start()]} @@ -221,7 +287,7 @@ def handle_links(self, src_iter): new_tokens = [] end = 0 - for match in url_re.finditer(text): + for match in self.url_re.finditer(text): if match.start() > end: new_tokens.append( {u'type': u'Characters', u'data': text[end:match.start()]} @@ -235,7 +301,7 @@ def handle_links(self, src_iter): url, prefix, suffix = self.strip_non_url_bits(url) # If there's no protocol, add one - if re.search(proto_re, url): + if PROTO_RE.search(url): href = url else: href = u'http://%s' % url diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 18ce49f4..fcbcd915 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -2,12 +2,154 @@ import re from xml.sax.saxutils import unescape +import html5lib from html5lib.constants import namespaces from html5lib.filters import sanitizer +from html5lib.serializer import HTMLSerializer +from bleach.encoding import force_unicode from bleach.utils import alphabetize_attributes +ALLOWED_TAGS = [ + 'a', + 'abbr', + 'acronym', + 'b', + 'blockquote', + 'code', + 'em', + 'i', + 'li', + 'ol', + 'strong', + 'ul', +] + +ALLOWED_ATTRIBUTES = { + 'a': ['href', 'title'], + 'abbr': ['title'], + 'acronym': ['title'], +} + +ALLOWED_STYLES = [] + +ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] + +ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) +# a simple routine that returns the tag name with the namespace prefix +# as returned by etree's Element.tag attribute + + +class Cleaner(object): + """Cleaner for cleaning HTML fragments of malicious content + + This cleaner is a security-focused function whose sole purpose is to remove + malicious content from a string such that it can be displayed as content in + a web page. + + This cleaner is not designed to use to transform content to be used in + non-web-page contexts. 
+ + To use:: + + from bleach import Cleaner + + cleaner = Cleaner() + + for text in all_the_yucky_things: + sanitized = cleaner.clean(text) + + """ + + def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, + styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, + strip_comments=True, filters=None): + """Initializes a Cleaner + + :arg tags: whitelist of allowed tags; defaults to + ``bleach.ALLOWED_TAGS`` + + :arg attributes: whitelist of allowed attributes; defaults to + ``bleach.ALLOWED_ATTRIBUTES`` + + :arg styles: whitelist of allowed css; defaults to + ``bleach.ALLOWED_STYLES`` + + :arg protocols: whitelist of allowed protocols for links; defaults + to ``bleach.ALLOWED_PROTOCOLS`` + + :arg strip: whether or not to strip disallowed elements + + :arg strip_comments: whether or not to strip HTML comments + + :arg filters: list of html5lib Filter classes to pass streamed content through + + See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters + + .. Warning:: + + Using filters changes the output of ``bleach.Cleaner.clean``. + Make sure the way the filters change the output are secure. 
+ + """ + self.tags = tags + self.attributes = attributes + self.styles = styles + self.protocols = protocols + self.strip = strip + self.strip_comments = strip_comments + self.filters = filters or [] + + self.parser = html5lib.HTMLParser(namespaceHTMLElements=False) + self.walker = html5lib.getTreeWalker('etree') + self.serializer = HTMLSerializer( + quote_attr_values='always', + omit_optional_tags=False, + + # Bleach has its own sanitizer, so don't use the html5lib one + sanitize=False, + + # Bleach sanitizer alphabetizes already, so don't use the html5lib one + alphabetical_attributes=False, + ) + + def clean(self, text): + """Cleans text and returns sanitized result as unicode + + :arg str text: text to be cleaned + + :returns: sanitized text as unicode + + """ + if not text: + return u'' + + text = force_unicode(text) + + dom = self.parser.parseFragment(text) + filtered = BleachSanitizerFilter( + source=self.walker(dom), + + # Bleach-sanitizer-specific things + allowed_attributes_map=self.attributes, + strip_disallowed_elements=self.strip, + strip_html_comments=self.strip_comments, + + # html5lib-sanitizer things + allowed_elements=self.tags, + allowed_css_properties=self.styles, + allowed_protocols=self.protocols, + allowed_svg_properties=[], + ) + + # Apply any filters after the BleachSanitizerFilter + for filter_class in self.filters: + filtered = filter_class(source=filtered) + + return self.serializer.render(filtered) + + class BleachSanitizerFilter(sanitizer.Filter): """html5lib Filter that sanitizes text diff --git a/docs/clean.rst b/docs/clean.rst index e281e2ca..161e4357 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -214,6 +214,7 @@ whitelist and invalid markup. 
For example: >>> bleach.clean('is not allowed') u'<span>is not allowed</span>' + >>> bleach.clean('is not allowed', tags=['b']) u'<span>is not allowed</span>' @@ -227,6 +228,7 @@ If you would rather Bleach stripped this markup entirely, you can pass >>> bleach.clean('is not allowed', strip=True) u'is not allowed' + >>> bleach.clean('is not allowed', tags=['b'], strip=True) u'is not allowed' @@ -250,10 +252,20 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set u'my html' +Using ``bleach.sanitizer.Cleaner`` +================================== + +If you're cleaning a lot of text or you need better control of things, you +should create a :py:class:`bleach.sanitizer.Cleaner` instance. + +.. autoclass:: bleach.sanitizer.Cleaner + :members: + + html5lib Filters (``filters``) -============================== +------------------------------ -Bleach sanitizing is implemented as an html5lib Filter. The consequence of this +Bleach sanitizing is implemented as an html5lib filter. The consequence of this is that we can pass the streamed content through additional specified filters after the :py:class:`bleach.sanitizer.BleachSanitizingFilter` filter has run. @@ -267,7 +279,7 @@ Trivial Filter example: .. doctest:: - >>> import bleach + >>> from bleach.sanitizer import Cleaner >>> from html5lib.filters.base import Filter >>> class MooFilter(Filter): @@ -283,8 +295,9 @@ Trivial Filter example: ... } ... >>> TAGS = ['img'] + >>> cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) >>> dirty = 'this is cute! ' - >>> bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter]) + >>> cleaner.clean(dirty) u'this is cute! ' @@ -294,20 +307,11 @@ Trivial Filter example: filter is applying maintain the safety guarantees of the output. -Using ``bleach.Cleaner`` -======================== - -If you're cleaning a lot of text, you might want to create a -:py:class:`bleach.Cleaner` instance. - -.. 
autoclass:: bleach.Cleaner - :members: - - Using ``bleach.sanitizer.BleachSanitizerFilter`` ================================================ -``bleach.clean`` creates a ``bleach.Cleaner`` which creates a +``bleach.clean`` creates a ``bleach.sanitizer.Cleaner`` which creates a ``bleach.sanitizer.BleachSanitizerFilter`` which does the sanitizing work. -``BleachSanitizerFilter`` is an html5lib Filter and can be used anywhere you can -use an html5lib Filter. + +``BleachSanitizerFilter`` is an html5lib filter and can be used anywhere you can +use an html5lib filter. diff --git a/docs/linkify.rst b/docs/linkify.rst index 6fe032ed..a468830c 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -5,22 +5,19 @@ Linkifying text fragments ========================= -``linkify()`` searches text for links, URLs, and email addresses and lets you -control how and when those links are rendered. +:py:func:`bleach.linkify` searches text for links, URLs, and email addresses and +lets you control how and when those links are rendered. -``linkify()`` works by building a document tree, so it's guaranteed never to do -weird things to URLs in attribute values, can modify the value of attributes on -```` tags, and can even do things like skip ``
`` sections.
-
-By default, ``linkify()`` will perform some sanitization, only allowing a set of
-"safe" tags. Because it uses the HTML5 parsing algorithm, it will always handle
-things like unclosed tags.
+It works by building a document tree, so it's guaranteed never to do weird
+things to URLs in attribute values, can modify the value of attributes on
+``<a>`` tags and can even do things like skip ``<pre>`` sections.
 
 .. note::
 
    You may pass a ``string`` or ``unicode`` object, but Bleach will always
    return ``unicode``.
 
+
 .. autofunction:: bleach.linkify
 
 
@@ -57,29 +54,44 @@ included callback functions.
 Setting Attributes
 ------------------
 
-For example, to set ``rel="nofollow"`` on all links found in the text, a simple
-(and included) callback might be::
+For example, you could add a ``title`` attribute to all links:
+
+.. doctest::
+
+   >>> from bleach.linkifier import Linker
 
-    def set_nofollow(attrs, new=False):
-        attrs[(None, 'rel')] = 'nofollow'
-        return attrs
+   >>> def set_title(attrs, new=False):
+   ...     attrs[(None, u'title')] = u'link in user text'
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[set_title])
+   >>> linker.linkify('abc http://example.com def')
+   u'abc <a href="http://example.com" title="link in user text">http://example.com</a> def'
 
 
 This would set the value of the ``rel`` attribute, stomping on a previous value
 if there was one.
 
-You could also make external links open in a new tab or set a class::
+Here's another example that makes external links open in a new tab and look like
+an external link:
 
-    from urlparse import urlparse
+.. doctest::
 
-    def set_target(attrs, new=False):
-        p = urlparse(attrs[(None, 'href')])
-        if p.netloc not in ['my-domain.com', 'other-domain.com']:
-            attrs[(None, 'target')] = '_blank'
-            attrs[(None, 'class')] = 'external'
-        else:
-            attrs.pop((None, 'target'), None)
-        return attrs
+   >>> from urlparse import urlparse
+   >>> from bleach.linkifier import Linker
+
+   >>> def set_target(attrs, new=False):
+   ...     p = urlparse(attrs[(None, u'href')])
+   ...     if p.netloc not in ['my-domain.com', 'other-domain.com']:
+   ...         attrs[(None, u'target')] = u'_blank'
+   ...         attrs[(None, u'class')] = u'external'
+   ...     else:
+   ...         attrs.pop((None, u'target'), None)
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[set_target])
+   >>> linker.linkify('abc http://example.com def')
+   u'abc <a class="external" href="http://example.com" target="_blank">http://example.com</a> def'
 
 
 Removing Attributes
@@ -89,25 +101,42 @@ You can easily remove attributes you don't want to allow, even on existing
 links (```` tags) in the text. (See also :ref:`clean() ` for
 sanitizing attributes.)
 
-::
+.. doctest::
 
-    def allowed_attributes(attrs, new=False):
-        """Only allow href, target, rel and title."""
-        allowed = [(None, 'href'), (None, 'target'),
-                   (None, 'rel'), (None, 'title')]
-        return dict((k, v) for k, v in attrs.items() if k in allowed)
+   >>> from bleach.linkifier import Linker
+
+   >>> def allowed_attrs(attrs, new=False):
+   ...     """Only allow href, target, rel and title."""
+   ...     allowed = [
+   ...         (None, u'href'),
+   ...         (None, u'target'),
+   ...         (None, u'rel'),
+   ...         (None, u'title'),
+   ...         u'_text',
+   ...     ]
+   ...     return dict((k, v) for k, v in attrs.items() if k in allowed)
+   ...
+   >>> linker = Linker(callbacks=[allowed_attrs])
+   >>> linker.linkify('<a style="font-weight: super bold;" href="http://example.com">link</a>')
+   u'<a href="http://example.com">link</a>'
 
 
-Or you could remove a specific attribute, if it exists::
+Or you could remove a specific attribute, if it exists:
 
-    def remove_title1(attrs, new=False):
-        attrs.pop((None, 'title'), None)
-        return attrs
+.. doctest::
+
+   >>> from bleach.linkifier import Linker
 
-    def remove_title2(attrs, new=False):
-        if (None, 'title') in attrs:
-            del attrs[(None, 'title')]
-        return attrs
+   >>> def remove_title(attrs, new=False):
+   ...     attrs.pop((None, u'title'), None)
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[remove_title])
+   >>> linker.linkify('<a href="http://example.com">link</a>')
+   u'<a href="http://example.com">link</a>'
+
+   >>> linker.linkify('<a title="bad title" href="http://example.com">link</a>')
+   u'<a href="http://example.com">link</a>'
 
 
 Altering Attributes
@@ -117,31 +146,50 @@ You can alter and overwrite attributes, including the link text, via the
 ``_text`` key, to, for example, pass outgoing links through a warning page, or
 limit the length of text inside an ```` tag.
 
-::
+Example of shortening link text:
 
-    def shorten_url(attrs, new=False):
-        """Shorten overly-long URLs in the text."""
-        if not new:  # Only looking at newly-created links.
-            return attrs
+.. doctest::
+
+   >>> from bleach.linkifier import Linker
+
+   >>> def shorten_url(attrs, new=False):
+   ...     """Shorten overly-long URLs in the text."""
+   ...     # Only adjust newly-created links
+   ...     if not new:
+   ...         return attrs
+   ...     # _text will be the same as the URL for new links
+   ...     text = attrs[u'_text']
+   ...     if len(text) > 25:
+   ...         attrs[u'_text'] = text[0:22] + u'...'
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[shorten_url])
+   >>> linker.linkify('http://example.com/longlonglonglonglongurl')
+   u'<a href="http://example.com/longlonglonglonglongurl">http://example.com/lon...</a>'
 
-        # _text will be the same as the URL for new links.
-        text = attrs['_text']
-        if len(text) > 25:
-            attrs['_text'] = text[0:22] + '...'
-        return attrs
 
-::
+Example of switching all links to go through a bouncer first:
 
-    from urllib2 import quote
-    from urlparse import urlparse
+.. doctest::
 
-    def outgoing_bouncer(attrs, new=False):
-        """Send outgoing links through a bouncer."""
-        p = urlparse((None, attrs['href']))
-        if p.netloc not in ['my-domain.com', 'www.my-domain.com', '']:
-            bouncer = 'http://outgoing.my-domain.com/?destination=%s'
-            attrs[(None, 'href')] = bouncer % quote(attrs['href'])
-        return attrs
+   >>> from six.moves.urllib.parse import quote, urlparse
+   >>> from bleach.linkifier import Linker
+
+   >>> def outgoing_bouncer(attrs, new=False):
+   ...     """Send outgoing links through a bouncer."""
+   ...     href_key = (None, u'href')
+   ...     p = urlparse(attrs.get(href_key, None))
+   ...     if p.netloc not in ['example.com', 'www.example.com', '']:
+   ...         bouncer = 'http://bn.ce/?destination=%s'
+   ...         attrs[href_key] = bouncer % quote(attrs[href_key])
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[outgoing_bouncer])
+   >>> linker.linkify('http://example.com')
+   u'<a href="http://example.com">http://example.com</a>'
+
+   >>> linker.linkify('http://foo.com')
+   u'<a href="http://bn.ce/?destination=http%3A//foo.com">http://foo.com</a>'
 
 
 Preventing Links
@@ -151,33 +199,53 @@ A slightly more complex example is inspired by Crate_, where strings like
 ``models.py`` are often found, and linkified. ``.py`` is the ccTLD for
 Paraguay, so ``example.py`` may be a legitimate URL, but in the case of a site
 dedicated to Python packages, odds are it is not. In this case, Crate_ could
-write the following callback::
+write the following callback:
 
-    def dont_linkify_python(attrs, new=False):
-        if not new:  # This is an existing  tag, leave it be.
-            return attrs
+.. doctest::
 
-        # If the TLD is '.py', make sure it starts with http: or https:
-        href = attrs[(None, 'href')]
-        if href.endswith('.py') and not href.startswith(('http:', 'https:')):
-            # This looks like a Python file, not a URL. Don't make a link.
-            return None
+   >>> from bleach.linkifier import Linker
+
+   >>> def dont_linkify_python(attrs, new=False):
+   ...     # This is an existing link, so leave it be
+   ...     if not new:
+   ...         return attrs
+   ...     # If the TLD is '.py', make sure it starts with http: or https:.
+   ...     # Use _text because that's the original text
+   ...     link_text = attrs[u'_text']
+   ...     if link_text.endswith('.py') and not link_text.startswith(('http:', 'https:')):
+   ...         # This looks like a Python file, not a URL. Don't make a link.
+   ...         return None
+   ...     # Everything checks out, keep going to the next callback.
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[dont_linkify_python])
+   >>> linker.linkify('abc http://example.com def')
+   u'abc <a href="http://example.com">http://example.com</a> def'
 
-        # Everything checks out, keep going to the next callback.
-        return attrs
+   >>> linker.linkify('abc models.py def')
+   u'abc models.py def'
 
 
 Removing Links
 --------------
 
 If you want to remove certain links, even if they are written in the text with
-```` tags, you can still return ``None``::
+```` tags, have the callback return ``None``.
 
-    def remove_mailto(attrs, new=False):
-        """Remove any mailto: links."""
-        if attrs[(None, 'href')].startswith('mailto:'):
-            return None
-        return attrs
+For example, this removes any ``mailto:`` links:
+
+.. doctest::
+
+   >>> from bleach.linkifier import Linker
+
+   >>> def remove_mailto(attrs, new=False):
+   ...     if attrs[(None, u'href')].startswith(u'mailto:'):
+   ...         return None
+   ...     return attrs
+   ...
+   >>> linker = Linker(callbacks=[remove_mailto])
+   >>> linker.linkify('<a href="mailto:janet@example.com">mail janet!</a>')
+   u'mail janet!'
 
 
 Skipping links in pre blocks (``skip_pre``)
@@ -194,11 +262,31 @@ any new links within a ``<pre>`` section, pass ``skip_pre=True``.
 Linkifying email addresses (``parse_email``)
 ============================================
 
-By default, ``linkify()`` does not create ``mailto:`` links for email
-addresses, but if you pass ``parse_email=True``, it will. ``mailto:`` links
-will go through exactly the same set of callbacks as all other links, whether
-they are newly created or already in the text, so be careful when writing
-callbacks that may need to behave differently if the protocol is ``mailto:``.
+By default, :py:func:`bleach.linkify` does not create ``mailto:`` links for
+email addresses, but if you pass ``parse_email=True``, it will. ``mailto:``
+links will go through exactly the same set of callbacks as all other links,
+whether they are newly created or already in the text, so be careful when
+writing callbacks that may need to behave differently if the protocol is
+``mailto:``.
+
+
+Using ``bleach.linkifier.Linker``
+=================================
+
+If you're linking a lot of text and passing the same argument values or you want
+more configurability, consider using a :py:class:`bleach.linkifier.Linker`
+instance.
+
+.. doctest::
+
+   >>> from bleach.linkifier import Linker
+
+   >>> linker = Linker(skip_pre=True)
+   >>> linker.linkify('a b c <pre>http://example.com</pre> d e f')
+   u'a b c <pre>http://example.com</pre> d e f'
+
+
+.. autoclass:: bleach.linkifier.Linker
 
 
 Using ``bleach.linkifier.LinkifyFilter``
@@ -235,7 +323,7 @@ And passing parameters to ``LinkifyFilter``:
 
    >>> from functools import partial
 
-   >>> from bleach import Cleaner
+   >>> from bleach.sanitizer import Cleaner
    >>> from bleach.linkifier import LinkifyFilter
 
    >>> cleaner = Cleaner(
diff --git a/tests/test_basics.py b/tests/test_basics.py
index e3f5d2da..bff29c0f 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -3,6 +3,7 @@
 import six
 
 import bleach
+from bleach.sanitizer import Cleaner
 
 
 class TestClean:
@@ -291,8 +292,11 @@ def __iter__(self):
         }
         TAGS = ['img']
         dirty = 'this is cute! '
+
+        cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter])
+
         assert (
-            bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter]) ==
+            cleaner.clean(dirty) ==
             'this is cute! '
         )
 
@@ -302,7 +306,7 @@ def test_basics(self):
         TAGS = ['span', 'br']
         ATTRS = {'span': ['style']}
 
-        cleaner = bleach.Cleaner(tags=TAGS, attributes=ATTRS)
+        cleaner = Cleaner(tags=TAGS, attributes=ATTRS)
 
         assert (
             cleaner.clean('a 
test') == diff --git a/tests/test_links.py b/tests/test_links.py index 8e166543..e602abd4 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -1,3 +1,4 @@ +import re try: from urllib.parse import quote_plus except ImportError: @@ -6,13 +7,7 @@ import pytest from bleach import linkify, DEFAULT_CALLBACKS as DC -from bleach.linkifier import url_re - - -def test_url_re(): - text = 'just what i am looking for...it' - match = url_re.search(text) - assert not match, 'matched {0!s}'.format(text[slice(*match.span())]) +from bleach.linkifier import Linker def test_empty(): @@ -540,8 +535,7 @@ def test_link_emails_and_urls(): def test_links_case_insensitive(): """Protocols and domain names are case insensitive.""" - expect = ('' - 'HTTP://EXAMPLE.COM') + expect = 'HTTP://EXAMPLE.COM' assert linkify('HTTP://EXAMPLE.COM') == expect @@ -599,3 +593,35 @@ def test_hang(): linkify("an@email.com", parse_email=True) == 'an@email.com' ) + + +def test_url_re_arg(): + """Verifies that a specified url_re is used""" + fred_re = re.compile(r"""(fred\.com)""") + + linker = Linker(url_re=fred_re) + assert ( + linker.linkify('a b c fred.com d e f') == + 'a b c fred.com d e f' + ) + + assert ( + linker.linkify('a b c http://example.com d e f') == + 'a b c http://example.com d e f' + ) + + +def test_email_re_arg(): + """Verifies that a specified email_re is used""" + fred_re = re.compile(r"""(fred@example\.com)""") + + linker = Linker(parse_email=True, email_re=fred_re) + assert ( + linker.linkify('a b c fred@example.com d e f') == + 'a b c fred@example.com d e f' + ) + + assert ( + linker.linkify('a b c jim@example.com d e f') == + 'a b c jim@example.com d e f' + ) From 975091d0ba9c9ed4000d0c457eddbd03178ab44e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 6 Mar 2017 14:12:49 -0500 Subject: [PATCH 080/314] Minor fixes --- CHANGES | 6 +++--- bleach/sanitizer.py | 6 +----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/CHANGES b/CHANGES index 
9afe859f..3ff2b789 100644 --- a/CHANGES +++ b/CHANGES @@ -44,7 +44,7 @@ Version 2.0 (in development) def check_protocol(attrs, is_new): if not attrs.get((None, u'href'), u'').startswith(('http:', 'https:')): - # ^^^^^^^^^^^^^^ + # ^^^^^^^^^^^^^^^ return None return attrs @@ -65,8 +65,8 @@ Version 2.0 (in development) favorite linkify settings for easy reuse. * There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib filter that - you can pass as a filter to ``bleach.Cleaner`` allowing you to clean and - linkify in one pass. + you can pass as a filter to ``bleach.sanitizer.Cleaner`` allowing you to clean + and linkify in one pass. * Tons of bug fixes. diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index fcbcd915..06c90665 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -36,10 +36,6 @@ ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] -ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) -# a simple routine that returns the tag name with the namespace prefix -# as returned by etree's Element.tag attribute - class Cleaner(object): """Cleaner for cleaning HTML fragments of malicious content @@ -53,7 +49,7 @@ class Cleaner(object): To use:: - from bleach import Cleaner + from bleach.sanitizer import Cleaner cleaner = Cleaner() From ef442862f6b0ee64e88570705bf4d287f80669c8 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 6 Mar 2017 16:33:23 -0500 Subject: [PATCH 081/314] More test cleanup * move linkify tests to test_linkify.py * remove tests that are related to the previous implementation --- tests/test_basics.py | 48 ++++++++------------------------------------ tests/test_links.py | 35 +++++++++++++++++++------------- 2 files changed, 29 insertions(+), 54 deletions(-) diff --git a/tests/test_basics.py b/tests/test_basics.py index bff29c0f..031ab66d 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -140,8 +140,7 @@ def test_stripping(self): '

multiply nested text

' ) - s = ('

' - '

') + s = '

' assert ( bleach.clean(s, tags=['p', 'a'], strip=True) == '

' @@ -301,6 +300,13 @@ def __iter__(self): ) +def test_clean_idempotent(): + """Make sure that applying the filter twice doesn't change anything.""" + dirty = 'invalid & < extra http://link.com' + + assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty) + + class TestCleaner: def test_basics(self): TAGS = ['span', 'br'] @@ -312,41 +318,3 @@ def test_basics(self): cleaner.clean('a
test') == 'a
test' ) - - -class TestLinkify: - def test_no_href_links(self): - s = 'x' - assert bleach.linkify(s) == s - - def test_rel_already_there(self): - """Make sure rel attribute is updated not replaced""" - linked = ('Click ' - 'here.') - - link_good = 'Click here.' - - assert bleach.linkify(linked) == link_good - assert bleach.linkify(link_good) == link_good - - -def test_idempotent(): - """Make sure that applying the filter twice doesn't change anything.""" - dirty = 'invalid & < extra http://link.com' - - clean = bleach.clean(dirty) - assert bleach.clean(clean) == clean - - linked = bleach.linkify(dirty) - assert ( - bleach.linkify(linked) == - 'invalid & < extra http://link.com' - ) - - -def test_serializer(): - s = '
' - assert bleach.clean(s, tags=['table']) == s - assert bleach.linkify('test
') == 'test
' - assert bleach.clean('

test

', tags=['p']) == '

test

' diff --git a/tests/test_links.py b/tests/test_links.py index e602abd4..28b6ad6d 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -515,12 +515,6 @@ def test_ignore_bad_protocols(): ) -def test_max_recursion_depth(): - """If we hit the max recursion depth, just return the string.""" - test = '' * 2000 + 'foo' + '' * 2000 - assert linkify(test) == test - - def test_link_emails_and_urls(): """parse_email=True shouldn't prevent URLs from getting linkified.""" assert ( @@ -551,14 +545,6 @@ def test_elements_inside_links(): ) -def test_remove_first_childlink(): - callbacks = [lambda *a: None] - assert ( - linkify('

something

', callbacks=callbacks) == - '

something

' - ) - - def test_drop_link_tags(): """Verify that dropping link tags *just* drops the tag and not the content""" html = ( @@ -625,3 +611,24 @@ def test_email_re_arg(): linker.linkify('a b c jim@example.com d e f') == 'a b c jim@example.com d e f' ) + + +def test_linkify_idempotent(): + dirty = 'invalid & < extra http://link.com' + assert linkify(linkify(dirty)) == linkify(dirty) + + +class TestLinkify: + def test_no_href_links(self): + s = 'x' + assert linkify(s) == s + + def test_rel_already_there(self): + """Make sure rel attribute is updated not replaced""" + linked = ('Click ' + 'here.') + + link_good = 'Click here.' + + assert linkify(linked) == link_good + assert linkify(link_good) == link_good From a08454cdfea4bd758deab28e19084ccaa7388e1c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 6 Mar 2017 20:17:10 -0500 Subject: [PATCH 082/314] Rework attributes value and filters This reworks how attributes argument works. Callables now take three arguments: tag, attribute name and attribute value. Callables can be passed in as the attributes argument value or as a value for any of the tags in the dict. This also reworks the implementation so the complexity of the different shapes is shuffled away out of ``allow_token`` which simplifies it a bit. --- CHANGES | 10 +++++ bleach/sanitizer.py | 65 ++++++++++++++++++++++-------- docs/clean.rst | 90 ++++++++++++++++++++++++++---------------- tests/test_basics.py | 59 +++++++++++++++++++++++---- tests/test_security.py | 2 +- 5 files changed, 166 insertions(+), 60 deletions(-) diff --git a/CHANGES b/CHANGES index 3ff2b789..79f56a9e 100644 --- a/CHANGES +++ b/CHANGES @@ -25,6 +25,12 @@ Version 2.0 (in development) Amongst other things, this version will add end tags even if the tag in question is to be escaped. +* ``bleach.clean`` and friends attribute callables now take three arguments: + tag, attribute name and attribute value. Previously they only took attribute + name and attribute value. 
+ + All attribute callables will need to be updated. + * ``bleach.linkify`` was rewritten ``linkify`` was reimplemented as an html5lib Filter. As such, it no longer @@ -52,6 +58,8 @@ Version 2.0 (in development) don't then html5lib will raise an assertion error that the value is not unicode. + All linkify filters will need to be updated. + **Changes** * Supports Python 3.6. @@ -68,6 +76,8 @@ Version 2.0 (in development) you can pass as a filter to ``bleach.sanitizer.Cleaner`` allowing you to clean and linkify in one pass. +* ``bleach.clean`` and friends can now take a callable as an attributes arg value. + * Tons of bug fixes. * Cleaned up tests. diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 06c90665..1223e79b 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -128,7 +128,7 @@ def clean(self, text): source=self.walker(dom), # Bleach-sanitizer-specific things - allowed_attributes_map=self.attributes, + attributes=self.attributes, strip_disallowed_elements=self.strip, strip_html_comments=self.strip_comments, @@ -146,22 +146,58 @@ def clean(self, text): return self.serializer.render(filtered) +def attribute_filter_factory(attributes): + """Generates attribute filter function for the given attributes value + + The attributes value can take one of several shapes. This returns a filter + function appropriate to the attributes value. One nice thing about this is + that there's less if/then shenanigans in the ``allow_token`` method. 
+ + """ + if callable(attributes): + return attributes + + if isinstance(attributes, dict): + def _attr_filter(tag, attr, value): + if tag in attributes: + attr_val = attributes[tag] + if callable(attr_val): + return attr_val(tag, attr, value) + + if attr in attr_val: + return True + + if '*' in attributes: + attr_val = attributes['*'] + if callable(attr_val): + return attr_val(tag, attr, value) + + return attr in attr_val + + return False + + return _attr_filter + + if isinstance(attributes, list): + def _attr_filter(tag, attr, value): + return attr in attributes + + return _attr_filter + + raise ValueError('attributes needs to be a callable, a list or a dict') + + class BleachSanitizerFilter(sanitizer.Filter): """html5lib Filter that sanitizes text This filter can be used anywhere html5lib filters can be used. """ - def __init__(self, source, allowed_attributes_map, + def __init__(self, source, attributes=ALLOWED_ATTRIBUTES, strip_disallowed_elements=False, strip_html_comments=True, **kwargs): - if isinstance(allowed_attributes_map, dict): - self.wildcard_attributes = allowed_attributes_map.get('*', []) - self.allowed_attributes_map = allowed_attributes_map - else: - self.wildcard_attributes = allowed_attributes_map - self.allowed_attributes_map = {} + self.attr_filter = attribute_filter_factory(attributes) self.strip_disallowed_elements = strip_disallowed_elements self.strip_html_comments = strip_html_comments @@ -205,10 +241,6 @@ def sanitize_token(self, token): def allow_token(self, token): """Handles the case where we're allowing the tag""" if 'data' in token: - allowed_attributes = self.allowed_attributes_map.get(token['name'], []) - if not callable(allowed_attributes): - allowed_attributes += self.wildcard_attributes - # Loop through all the attributes and drop the ones that are not # allowed, are unsafe or break other rules. Additionally, fix # attribute values that need fixing. 
@@ -220,11 +252,10 @@ def allow_token(self, token): namespace, name = namespaced_name # Drop attributes that are not explicitly allowed - if callable(allowed_attributes): - if not allowed_attributes(name, val): - continue - - elif name not in allowed_attributes: + # + # NOTE(willkg): We pass in the attribute name--not a namespaced + # name. + if not self.attr_filter(token['name'], name, val): continue # Look at attributes that have uri values diff --git a/docs/clean.rst b/docs/clean.rst index 161e4357..b02c4525 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -55,8 +55,8 @@ The default value is also a conservative dict found in As a list --------- -The ``attributes`` value can be a list, in which case the attributes are allowed -for any tag. +The ``attributes`` value can be a list which specifies the list of attributes +allowed for any tag. For example: @@ -76,10 +76,12 @@ For example: As a dict --------- -The ``attributes`` value can be a dict, in which case the keys are tag names (or -a wildcard: ``*`` for all tags) and the values are lists of allowed attributes. +The ``attributes`` value can be a dict which maps tags to what attributes they can have. -For example: +You can also specify ``*``, which will match any tag. + +For example, this allows "href" and "rel" for "a" tags, "alt" for the "img" tag +and "class" for any tag (including "a" and "img"): .. doctest:: @@ -99,48 +101,66 @@ For example: u'an example' -In this case, ``class`` is allowed on any allowed element (from the ``tags`` -argument), ```` tags are allowed to have ``href`` and ``rel`` attributes, -and so on. - - Using functions --------------- -You can also use callables. If the callable returns ``True``, the attribute is -allowed. Otherwise, it is stripped. For example: +You can also use callables that take the tag, attribute name and attribute value +and returns ``True`` to keep the attribute or ``False`` to drop it. 
+ +You can pass a callable as the attributes argument value and it'll run for +every tag/attr. + +For example: + +.. doctest:: + + >>> import bleach + + >>> def allow_h(tag, name, value): + ... return name[0] == 'h' + + >>> bleach.clean( + ... u'link', + ... tags=['a'], + ... attributes=allow_h, + ... ) + u'link' + + +You can also pass a callable as a value in an attributes dict and it'll run for +attributes for specified tags: .. doctest:: - >>> from urlparse import urlparse - >>> import bleach + >>> from urlparse import urlparse + >>> import bleach - >>> def allow_src(name, value): - ... if name in ('alt', 'height', 'width'): - ... return True - ... if name == 'src': - ... p = urlparse(value) - ... return (not p.netloc) or p.netloc == 'mydomain.com' - ... return False + >>> def allow_src(tag, name, value): + ... if name in ('alt', 'height', 'width'): + ... return True + ... if name == 'src': + ... p = urlparse(value) + ... return (not p.netloc) or p.netloc == 'mydomain.com' + ... return False - >>> bleach.clean( - ... u'an example', - ... tags=['img'], - ... attributes={ - ... 'img': allow_src - ... } - ... ) - u'an example' + >>> bleach.clean( + ... u'an example', + ... tags=['img'], + ... attributes={ + ... 'img': allow_src + ... } + ... ) + u'an example' Allowed styles (``styles``) =========================== -If you allow the ``style`` attribute, you will also need to whitelist styles -users are allowed to set, for example ``color`` and ``background-color``. +If you allow the ``style`` attribute, you will also need to specify the allowed +styles users are allowed to set, for example ``color`` and ``background-color``. -The default value is an empty list, i.e., the ``style`` attribute will be -allowed but no values will be. +The default value is an empty list. In other words, the ``style`` attribute will +be allowed but no style declaration names will be allowed. 
For example, to allow users to set the color and font-weight of text: @@ -205,8 +225,8 @@ Default protocols are in ``bleach.ALLOWED_PROTOCOLS``. Stripping markup (``strip``) ============================ -By default, Bleach *escapes* tags that aren't specified in the tags -whitelist and invalid markup. For example: +By default, Bleach *escapes* tags that aren't specified in the allowed tags list +and invalid markup. For example: .. doctest:: diff --git a/tests/test_basics.py b/tests/test_basics.py index 031ab66d..5b59ebf9 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -164,23 +164,46 @@ def test_lowercase_html(self): clean = 'BAR' assert bleach.clean(dirty, attributes=['class']) == clean - def test_wildcard_attributes(self): + def test_attributes_callable(self): + """Verify attributes can take a callable""" + ATTRS = lambda tag, name, val: name == 'title' + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + + def test_attributes_wildcard(self): + """Verify attributes[*] works""" ATTRS = { '*': ['id'], 'img': ['src'], } - TAG = ['img', 'em'] + TAGS = ['img', 'em'] dirty = ('both can have ' '') assert ( - bleach.clean(dirty, tags=TAG, attributes=ATTRS) == + bleach.clean(dirty, tags=TAGS, attributes=ATTRS) == 'both can have ' ) - def test_callable_attributes(self): - """Verify callable attributes work and get correct arg values""" - def img_test(attr, val): - return attr == 'src' and val.startswith('https') + def test_attributes_wildcard_callable(self): + """Verify attributes[*] callable works""" + ATTRS = { + '*': lambda tag, name, val: name == 'title' + } + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + + def test_attributes_tag_callable(self): + """Verify attributes[tag] callable works""" + def img_test(tag, name, val): + return name == 'src' and val.startswith('https') ATTRS = { 'img': img_test, @@ -198,6 +221,28 @@ def img_test(attr, val): 
u'foo baz' ) + def test_attributes_tag_list(self): + """Verify attributes[tag] list works""" + ATTRS = { + 'a': ['title'] + } + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + + def test_attributes_list(self): + """Verify attributes list works""" + ATTRS = ['title'] + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + def test_svg_attr_val_allows_ref(self): """Unescape values in svg attrs that allow url references""" # Local IRI, so keep it diff --git a/tests/test_security.py b/tests/test_security.py index 2aac0200..da0fe92f 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -75,7 +75,7 @@ def test_invalid_href_attr(): def test_invalid_filter_attr(): IMG = ['img', ] IMG_ATTR = { - 'img': lambda attr, val: attr == 'src' and val == "http://example.com/" + 'img': lambda tag, name, val: name == 'src' and val == "http://example.com/" } assert ( From 2cedde71bfa263ccf0ce76f630468912aa3f212f Mon Sep 17 00:00:00 2001 From: "Alexandr N. Zamaraev" Date: Tue, 21 Feb 2017 23:49:19 +0700 Subject: [PATCH 083/314] Correct dublicates in email_re see #247 --- bleach/linkifier.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index 1396f056..92351be4 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -64,10 +64,10 @@ def build_url_re(tlds=TLDS, protocols=allowed_protocols): EMAIL_RE = re.compile( r"""(? Date: Wed, 22 Feb 2017 12:06:41 +0700 Subject: [PATCH 084/314] Add test incorrect email --- tests/test_links.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_links.py b/tests/test_links.py index 28b6ad6d..99b30b89 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -109,6 +109,13 @@ def ft(attrs, new=False): True, 'mailto james@example.com.au.' 
), + # Incorrect email + ( + '"\\\n"@opa.ru', + True, + '"\\\n"@opa.ru' + ), + ]) def test_email_link(data, parse_email, expected): assert linkify(data, parse_email=parse_email) == expected From 9a617a52d6b5e81bd7ca8407f1e0810fc412cc2a Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 6 Mar 2017 21:50:30 -0500 Subject: [PATCH 085/314] Change skip_pre to the more general skip_tags This changes skip_pre to a more general skip_tags that lets you skip linkifying in a specified list of tags--not just pre. --- CHANGES | 17 +++++++++ README.rst | 2 +- bleach/__init__.py | 18 +++++---- bleach/linkifier.py | 89 ++++++++++++++++++++++++++++++++++++--------- bleach/sanitizer.py | 30 ++++++++++++--- docs/goals.rst | 22 +++++------ docs/linkify.rst | 14 +++---- setup.py | 4 +- tests/test_links.py | 10 ++--- 9 files changed, 149 insertions(+), 57 deletions(-) diff --git a/CHANGES b/CHANGES index 79f56a9e..050f4fc1 100644 --- a/CHANGES +++ b/CHANGES @@ -60,6 +60,23 @@ Version 2.0 (in development) All linkify filters will need to be updated. +* ``bleach.linkify`` and friends had a ``skip_pre`` argument--that's been + replaced with a more general ``skip_tags`` argument. + + Before, you might do:: + + bleach.linkify(some_text, skip_pre=True) + + The equivalent with Bleach 2.0 is:: + + bleach.linkify(some_text, skip_tags=['pre']) + + You can skip other tags, too, like ``style`` or ``script`` or other places + where you don't want linkification happening. + + All uses of linkify that use ``skip_pre`` will need to be updated. + + **Changes** * Supports Python 3.6. diff --git a/README.rst b/README.rst index 403ff9b6..08dd886a 100644 --- a/README.rst +++ b/README.rst @@ -8,7 +8,7 @@ Bleach .. image:: https://badge.fury.io/py/bleach.svg :target: http://badge.fury.io/py/bleach -Bleach is a whitelist-based HTML sanitizing library that escapes or strips +Bleach is a allowed-list-based HTML sanitizing library that escapes or strips markup and attributes. 
Bleach can also linkify text safely, applying filters that Django's ``urlize`` diff --git a/bleach/__init__.py b/bleach/__init__.py index 07b5075c..a231f136 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -47,16 +47,16 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, :arg str text: the text to clean - :arg list tags: whitelist of allowed tags; defaults to + :arg list tags: allowed list of tags; defaults to ``bleach.ALLOWED_TAGS`` - :arg dict attributes: whitelist of allowed attributes; defaults to - ``bleach.ALLOWED_ATTRIBUTES`` + :arg dict attributes: allowed attributes; can be a callable, list or dict; + defaults to ``bleach.ALLOWED_ATTRIBUTES`` - :arg list styles: whitelist of allowed css; defaults to + :arg list styles: allowed list of css styles; defaults to ``bleach.ALLOWED_STYLES`` - :arg list protocols: whitelist of allowed protocols for links; defaults + :arg list protocols: allowed list of protocols for links; defaults to ``bleach.ALLOWED_PROTOCOLS`` :arg bool strip: whether or not to strip disallowed elements @@ -77,7 +77,7 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, return cleaner.clean(text) -def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False): +def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False): """Convert URL-like strings in an HTML fragment to links This function converts strings that look like URLs, domain names and email @@ -106,7 +106,9 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False :arg list callbacks: list of callbacks to run when adjusting tag attributes - :arg bool skip_pre: whether or not to skip linkifying text in a ``pre`` tag + :arg list skip_tags: list of tags that you don't want to linkify the + contents of; for example, you could set this to ``['pre']`` to skip + linkifying contents of ``pre`` tags :arg bool parse_email: whether or not to linkify email addresses @@ -115,7 +117,7 @@ 
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False """ linker = Linker( callbacks=callbacks, - skip_pre=skip_pre, + skip_tags=skip_tags, parse_email=parse_email ) return linker.linkify(text) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index 92351be4..6103e81e 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -74,10 +74,40 @@ def build_url_re(tlds=TLDS, protocols=allowed_protocols): class Linker(object): - def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False, + """Convert URL-like strings in an HTML fragment to links + + This function converts strings that look like URLs, domain names and email + addresses in text that may be an HTML fragment to links, while preserving: + + 1. links already in the string + 2. urls found in attributes + 3. email addresses + + linkify does a best-effort approach and tries to recover from bad + situations due to crazy text. + + """ + def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False, url_re=URL_RE, email_re=EMAIL_RE): + """Creates a Linker instance + + :arg list callbacks: list of callbacks to run when adjusting tag attributes + + :arg list skip_tags: list of tags that you don't want to linkify the + contents of; for example, you could set this to ``['pre']`` to skip + linkifying contents of ``pre`` tags + + :arg bool parse_email: whether or not to linkify email addresses + + :arg re url_re: url matching regex + + :arg email_re: email matching regex + + :returns: linkified text as unicode + + """ self.callbacks = callbacks - self.skip_pre = skip_pre + self.skip_tags = skip_tags self.parse_email = parse_email self.url_re = url_re self.email_re = email_re @@ -105,7 +135,7 @@ def linkify(self, text): filtered = LinkifyFilter( source=self.walker(dom), callbacks=self.callbacks, - skip_pre=self.skip_pre, + skip_tags=self.skip_tags, parse_email=self.parse_email, url_re=self.url_re, email_re=self.email_re, @@ -126,12 +156,31 @@ 
class LinkifyFilter(Filter): This filter can be used anywhere html5lib filters can be used. """ - def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False, + def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False, url_re=URL_RE, email_re=EMAIL_RE): + """Creates a LinkifyFilter instance + + :arg TreeWalker source: stream + + :arg list callbacks: list of callbacks to run when adjusting tag attributes + + :arg list skip_tags: list of tags that you don't want to linkify the + contents of; for example, you could set this to ``['pre']`` to skip + linkifying contents of ``pre`` tags + + :arg bool parse_email: whether or not to linkify email addresses + + :arg re url_re: url matching regex + + :arg email_re: email matching regex + + :returns: linkified text as unicode + + """ super(LinkifyFilter, self).__init__(source) self.callbacks = callbacks or [] - self.skip_pre = skip_pre + self.skip_tags = skip_tags or [] self.parse_email = parse_email self.url_re = url_re @@ -140,9 +189,15 @@ def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False, def apply_callbacks(self, attrs, is_new): """Given an attrs dict and an is_new bool, runs through callbacks - Callbacks can return an adjusted attrs dict or None. In the case of - None, we stop going through callbacks and return that and the link gets - dropped. + Callbacks can return an adjusted attrs dict or ``None``. In the case of + ``None``, we stop going through callbacks and return that and the link + gets dropped. 
+ + :arg dict attrs: map of ``(namespace, name)`` -> ``value`` + + :arg bool is_new: whether or not this link was added by linkify + + :returns: adjusted attrs dict or ``None`` """ for cb in self.callbacks: @@ -399,7 +454,7 @@ def handle_a_tag(self, token_buffer): def __iter__(self): in_a = False - in_pre = False + in_skip_tag = None token_buffer = [] @@ -425,10 +480,10 @@ def __iter__(self): continue elif token['type'] in ['StartTag', 'EmptyTag']: - if token['name'] == 'pre' and self.skip_pre: - # The "pre" tag starts a "special mode" where we don't linkify - # anything. - in_pre = True + if token['name'] in self.skip_tags: + # Skip tags start a "special mode" where we don't linkify + # anything until the end tag. + in_skip_tag = token['name'] elif token['name'] == 'a': # The "a" tag is special--we switch to a slurp mode and @@ -441,13 +496,13 @@ def __iter__(self): # yet continue - elif in_pre and self.skip_pre: + elif in_skip_tag and self.skip_tags: # NOTE(willkg): We put this clause here since in_a and # switching in and out of in_a takes precedence. 
- if token['type'] == 'EndTag' and token['name'] == 'pre': - in_pre = False + if token['type'] == 'EndTag' and token['name'] == in_skip_tag: + in_skip_tag = None - elif not in_a and not in_pre and token['type'] == 'Characters': + elif not in_a and not in_skip_tag and token['type'] == 'Characters': new_stream = iter([token]) if self.parse_email: new_stream = self.handle_email_addresses(new_stream) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 1223e79b..b5c2fe95 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -63,16 +63,16 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, strip_comments=True, filters=None): """Initializes a Cleaner - :arg tags: whitelist of allowed tags; defaults to + :arg list tags: allowed list of tags; defaults to ``bleach.ALLOWED_TAGS`` - :arg attributes: whitelist of allowed attributes; defaults to - ``bleach.ALLOWED_ATTRIBUTES`` + :arg dict attributes: allowed attributes; can be a callable, list or dict; + defaults to ``bleach.ALLOWED_ATTRIBUTES`` - :arg styles: whitelist of allowed css; defaults to + :arg list styles: allowed list of css styles; defaults to ``bleach.ALLOWED_STYLES`` - :arg protocols: whitelist of allowed protocols for links; defaults + :arg list protocols: allowed list of protocols for links; defaults to ``bleach.ALLOWED_PROTOCOLS`` :arg strip: whether or not to strip disallowed elements @@ -196,7 +196,27 @@ class BleachSanitizerFilter(sanitizer.Filter): def __init__(self, source, attributes=ALLOWED_ATTRIBUTES, strip_disallowed_elements=False, strip_html_comments=True, **kwargs): + """Creates a BleachSanitizerFilter instance + :arg Treewalker source: stream + + :arg list tags: allowed list of tags; defaults to + ``bleach.ALLOWED_TAGS`` + + :arg dict attributes: allowed attributes; can be a callable, list or dict; + defaults to ``bleach.ALLOWED_ATTRIBUTES`` + + :arg list styles: allowed list of css styles; defaults to + ``bleach.ALLOWED_STYLES`` + + :arg list protocols: allowed 
list of protocols for links; defaults + to ``bleach.ALLOWED_PROTOCOLS`` + + :arg strip_disallowed_elements: whether or not to strip disallowed elements + + :arg strip_html_comments: whether or not to strip HTML comments + + """ self.attr_filter = attribute_filter_factory(attributes) self.strip_disallowed_elements = strip_disallowed_elements diff --git a/docs/goals.rst b/docs/goals.rst index 632c222c..015bc563 100644 --- a/docs/goals.rst +++ b/docs/goals.rst @@ -13,15 +13,15 @@ Goals ===== -Always take a whitelist-based approach --------------------------------------- +Always take a allowed-list-based approach +----------------------------------------- -Bleach should always take a whitelist-based approach to allowing any kind of -content or markup. Blacklisting is error-prone and not future proof. +Bleach should always take a allowed-list-based approach to markup filtering. +Specifying disallowed lists is error-prone and not future proof. For example, you should have to opt-in to allowing the ``onclick`` attribute, -not blacklist all the other ``on*`` attributes. Future versions of HTML may add -new event handlers, like ``ontouch``, that old blacklists would not prevent. +not opt-out of all the other ``on*`` attributes. Future versions of HTML may add +new event handlers, like ``ontouch``, that old disallow would not prevent. Main goal is to sanitize input of malicious content @@ -39,8 +39,8 @@ Examples might include: These examples, and others, are traditionally prone to security issues like XSS or other script injection, or annoying issues like unclosed tags and invalid -markup. Bleach will take a proactive, whitelist-only approach to allowing HTML -content, and will use the HTML5 parsing algorithm to handle invalid markup. +markup. Bleach will take a proactive, allowed-list-only approach to allowing +HTML content, and will use the HTML5 parsing algorithm to handle invalid markup. See the :ref:`chapter on clean() ` for more info. 
@@ -52,7 +52,7 @@ The secondary goal of Bleach is to provide a mechanism for finding or altering links (```` tags with ``href`` attributes, or things that look like URLs or email addresses) in text. -While Bleach itself will always operate on a whitelist-based security model, +While Bleach itself will always operate on a allowed-list-based security model, the :ref:`linkify() method ` is flexible enough to allow the creation, alteration, and removal of links based on an extremely wide range of use cases. @@ -69,8 +69,8 @@ Sanitize complete HTML documents -------------------------------- Once you're creating whole documents, you have to allow so many tags that a -blacklist approach (e.g. forbidding `` test') == - 'a <script>safe()</script> test' - ) - assert ( - bleach.clean('a test') == - 'a <style>body{}</style> test' - ) - - def test_bad_href(self): - assert ( - bleach.clean('no link') == - 'no link' - ) - - def test_bare_entities(self): - assert ( - bleach.clean('an & entity') == - 'an & entity' - ) - assert ( - bleach.clean('an < entity') == - 'an < entity' - ) - - assert ( - bleach.clean('tag < and entity') == - 'tag < and entity' - ) - - assert ( - bleach.clean('&') == - '&' - ) - - def test_escaped_entities(self): - s = '<em>strong</em>' - assert bleach.clean(s) == s - - def test_weird_strings(self): - s = 'with html tags', strip=True) == - 'a test with html tags' - ) - assert ( - bleach.clean('a test with html tags', - strip=True) == - 'a test with html tags' - ) - - s = '

link text

' - assert ( - bleach.clean(s, tags=['p'], strip=True) == - '

link text

' - ) - s = '

multiply nested text

' - assert ( - bleach.clean(s, tags=['p'], strip=True) == - '

multiply nested text

' - ) - - s = '

' - assert ( - bleach.clean(s, tags=['p', 'a'], strip=True) == - '

' - ) - - def test_allowed_styles(self): - ATTRS = ['style'] - STYLE = ['color'] - blank = '' - s = '' - assert bleach.clean('', attributes=ATTRS) == blank - assert bleach.clean(s, attributes=ATTRS, styles=STYLE) == s - assert ( - bleach.clean('', attributes=ATTRS, styles=STYLE) == - s - ) - - def test_lowercase_html(self): - """We should output lowercase HTML.""" - dirty = 'BAR' - clean = 'BAR' - assert bleach.clean(dirty, attributes=['class']) == clean - - def test_attributes_callable(self): - """Verify attributes can take a callable""" - ATTRS = lambda tag, name, val: name == 'title' - TAGS = ['a'] - - assert ( - bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == - u'example' - ) - - def test_attributes_wildcard(self): - """Verify attributes[*] works""" - ATTRS = { - '*': ['id'], - 'img': ['src'], - } - TAGS = ['img', 'em'] - dirty = ('both can have ' - '') - assert ( - bleach.clean(dirty, tags=TAGS, attributes=ATTRS) == - 'both can have ' - ) - - def test_attributes_wildcard_callable(self): - """Verify attributes[*] callable works""" - ATTRS = { - '*': lambda tag, name, val: name == 'title' - } - TAGS = ['a'] - - assert ( - bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == - u'example' - ) - - def test_attributes_tag_callable(self): - """Verify attributes[tag] callable works""" - def img_test(tag, name, val): - return name == 'src' and val.startswith('https') - - ATTRS = { - 'img': img_test, - } - TAGS = ['img'] - - assert ( - bleach.clean('foo blah baz', tags=TAGS, - attributes=ATTRS) == - u'foo baz' - ) - assert ( - bleach.clean('foo blah baz', tags=TAGS, - attributes=ATTRS) == - u'foo baz' - ) - - def test_attributes_tag_list(self): - """Verify attributes[tag] list works""" - ATTRS = { - 'a': ['title'] - } - TAGS = ['a'] - - assert ( - bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == - u'example' - ) - - def test_attributes_list(self): - """Verify attributes list works""" - ATTRS = ['title'] - TAGS = ['a'] - - assert ( - 
bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == - u'example' - ) - - def test_svg_attr_val_allows_ref(self): - """Unescape values in svg attrs that allow url references""" - # Local IRI, so keep it - text = '' - TAGS = ['svg', 'rect'] - ATTRS = { - 'rect': ['fill'], - } - assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == - '' - ) - - # Non-local IRI, so drop it - text = '' - TAGS = ['svg', 'rect'] - ATTRS = { - 'rect': ['fill'], - } - assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == - '' - ) - - @pytest.mark.parametrize('text, expected', [ - ( - '', - '' - ), - ( - '', - # NOTE(willkg): Bug in html5lib serializer drops the xlink part - '' - ), - ]) - def test_svg_allow_local_href(self, text, expected): - """Keep local hrefs for svg elements""" - TAGS = ['svg', 'pattern'] - ATTRS = { - 'pattern': ['id', 'href'], - } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected - - @pytest.mark.parametrize('text, expected', [ - ( - '', - '' - ), - ( - '', - '' - ), - ]) - def test_svg_allow_local_href_nonlocal(self, text, expected): - """Drop non-local hrefs for svg elements""" - TAGS = ['svg', 'pattern'] - ATTRS = { - 'pattern': ['id', 'href'], - } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected - - @pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') - def test_sarcasm(self): - """Jokes should crash.""" - dirty = 'Yeah right ' - clean = 'Yeah right <sarcasm/>' - assert bleach.clean(dirty) == clean - - def test_user_defined_protocols_valid(self): - valid_href = 'allowed href' - assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href - - def test_user_defined_protocols_invalid(self): - invalid_href = 'invalid href' - cleaned_href = 'invalid href' - assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href - - def test_filters(self): - # Create a Filter that changes all the attr values to "moo" - class MooFilter(Filter): - def __iter__(self): - for token in 
Filter.__iter__(self): - if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: - for attr, value in token['data'].items(): - token['data'][attr] = 'moo' - - yield token - - ATTRS = { - 'img': ['rel', 'src'] - } - TAGS = ['img'] - dirty = 'this is cute! ' - - cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) - - assert ( - cleaner.clean(dirty) == - 'this is cute! ' - ) - - -def test_clean_idempotent(): - """Make sure that applying the filter twice doesn't change anything.""" - dirty = 'invalid & < extra http://link.com' - - assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty) - - -class TestCleaner: - def test_basics(self): - TAGS = ['span', 'br'] - ATTRS = {'span': ['style']} - - cleaner = Cleaner(tags=TAGS, attributes=ATTRS) - - assert ( - cleaner.clean('a
test') == - 'a
test' - ) diff --git a/tests/test_clean.py b/tests/test_clean.py new file mode 100644 index 00000000..a6a37557 --- /dev/null +++ b/tests/test_clean.py @@ -0,0 +1,404 @@ +from html5lib.filters.base import Filter +import pytest +import six + +import bleach +from bleach.sanitizer import Cleaner + + +def test_empty(): + assert bleach.clean('') == '' + + +def test_nbsp(): + if six.PY3: + expected = '\xa0test string\xa0' + else: + expected = six.u('\\xa0test string\\xa0') + + assert bleach.clean(' test string ') == expected + + +def test_comments_only(): + comment = '' + assert bleach.clean(comment) == '' + assert bleach.clean(comment, strip_comments=False) == comment + + open_comment = ''.format(open_comment) + ) + + +def test_with_comments(): + text = 'Just text' + assert bleach.clean(text) == 'Just text' + assert bleach.clean(text, strip_comments=False) == text + + +def test_no_html(): + assert bleach.clean('no html string') == 'no html string' + + +def test_allowed_html(): + assert ( + bleach.clean('an allowed tag') == + 'an allowed tag' + ) + assert ( + bleach.clean('another good tag') == + 'another good tag' + ) + + +def test_bad_html(): + assert ( + bleach.clean('a fixed tag') == + 'a fixed tag' + ) + + +def test_function_arguments(): + TAGS = ['span', 'br'] + ATTRS = {'span': ['style']} + + text = 'a
test' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + 'a
test' + ) + + +def test_named_arguments(): + ATTRS = {'a': ['rel', 'href']} + + text = 'xx.com' + assert bleach.clean(text) == 'xx.com' + assert ( + bleach.clean(text, attributes=ATTRS) == + 'xx.com' + ) + + +def test_disallowed_html(): + assert ( + bleach.clean('a test') == + 'a <script>safe()</script> test' + ) + assert ( + bleach.clean('a test') == + 'a <style>body{}</style> test' + ) + + +def test_bad_href(): + assert ( + bleach.clean('no link') == + 'no link' + ) + + +def test_bare_entities(): + assert ( + bleach.clean('an & entity') == + 'an & entity' + ) + assert ( + bleach.clean('an < entity') == + 'an < entity' + ) + + assert ( + bleach.clean('tag < and entity') == + 'tag < and entity' + ) + + assert ( + bleach.clean('&') == + '&' + ) + + +def test_escaped_entities(): + s = '<em>strong</em>' + assert bleach.clean(s) == s + + +def test_weird_strings(): + s = 'with
html tags' + assert ( + bleach.clean(text, strip=True) == + 'a test with html tags' + ) + + text = 'a test with html tags' + assert ( + bleach.clean(text, strip=True) == + 'a test with html tags' + ) + + text = '

link text

' + assert ( + bleach.clean(text, tags=['p'], strip=True) == + '

link text

' + ) + text = '

multiply nested text

' + assert ( + bleach.clean(text, tags=['p'], strip=True) == + '

multiply nested text

' + ) + + text = '

' + assert ( + bleach.clean(text, tags=['p', 'a'], strip=True) == + '

' + ) + + +def test_allowed_styles(): + ATTRS = ['style'] + STYLE = ['color'] + + assert ( + bleach.clean('', attributes=ATTRS) == + '' + ) + + text = '' + assert bleach.clean(text, attributes=ATTRS, styles=STYLE) == text + + text = '' + assert ( + bleach.clean(text, attributes=ATTRS, styles=STYLE) == + '' + ) + + +def test_lowercase_html(): + """We should output lowercase HTML.""" + assert ( + bleach.clean('BAR', attributes=['class']) == + 'BAR' + ) + + +def test_attributes_callable(): + """Verify attributes can take a callable""" + ATTRS = lambda tag, name, val: name == 'title' + TAGS = ['a'] + + text = u'example' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + u'example' + ) + + +def test_attributes_wildcard(): + """Verify attributes[*] works""" + ATTRS = { + '*': ['id'], + 'img': ['src'], + } + TAGS = ['img', 'em'] + + text = 'both can have ' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + 'both can have ' + ) + + +def test_attributes_wildcard_callable(): + """Verify attributes[*] callable works""" + ATTRS = { + '*': lambda tag, name, val: name == 'title' + } + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + + +def test_attributes_tag_callable(): + """Verify attributes[tag] callable works""" + def img_test(tag, name, val): + return name == 'src' and val.startswith('https') + + ATTRS = { + 'img': img_test, + } + TAGS = ['img'] + + text = 'foo blah baz' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + u'foo baz' + ) + text = 'foo blah baz' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + u'foo baz' + ) + + +def test_attributes_tag_list(): + """Verify attributes[tag] list works""" + ATTRS = { + 'a': ['title'] + } + TAGS = ['a'] + + assert ( + bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + u'example' + ) + + +def test_attributes_list(): + """Verify attributes list works""" + ATTRS = ['title'] + TAGS = ['a'] + + text = u'example' 
+ assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + u'example' + ) + + +def test_svg_attr_val_allows_ref(): + """Unescape values in svg attrs that allow url references""" + # Local IRI, so keep it + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + + text = '' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + # Non-local IRI, so drop it + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + text = '' + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + # NOTE(willkg): Bug in html5lib serializer drops the xlink part + '' + ), +]) +def test_svg_allow_local_href(text, expected): + """Keep local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + '' + ), +]) +def test_svg_allow_local_href_nonlocal(text, expected): + """Drop non-local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + +@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') +def test_sarcasm(): + """Jokes should crash.""" + dirty = 'Yeah right ' + clean = 'Yeah right <sarcasm/>' + assert bleach.clean(dirty) == clean + + +def test_user_defined_protocols_valid(): + valid_href = 'allowed href' + assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href + + +def test_user_defined_protocols_invalid(): + invalid_href = 'invalid href' + cleaned_href = 'invalid href' + assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href + + +def test_filters(): + # Create a Filter that changes all the attr values to "moo" + class MooFilter(Filter): + def __iter__(self): + for token in Filter.__iter__(self): + 
if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: + for attr, value in token['data'].items(): + token['data'][attr] = 'moo' + + yield token + + ATTRS = { + 'img': ['rel', 'src'] + } + TAGS = ['img'] + + cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) + + dirty = 'this is cute! ' + assert ( + cleaner.clean(dirty) == + 'this is cute! ' + ) + + +def test_clean_idempotent(): + """Make sure that applying the filter twice doesn't change anything.""" + dirty = 'invalid & < extra http://link.com' + assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty) + + +class TestCleaner: + def test_basics(self): + TAGS = ['span', 'br'] + ATTRS = {'span': ['style']} + + cleaner = Cleaner(tags=TAGS, attributes=ATTRS) + + assert ( + cleaner.clean('a
test') == + 'a
test' + ) diff --git a/tests/test_links.py b/tests/test_linkify.py similarity index 100% rename from tests/test_links.py rename to tests/test_linkify.py From 205edc0094c3a5ad217d164048d57a22a69fed93 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 20 Sep 2017 09:00:30 -0400 Subject: [PATCH 117/314] Add code of conduct blurb, move some docs around Fixes #313 --- CODE_OF_CONDUCT.rst | 9 +++++++++ README.rst | 41 ++++++++++++++++++++++++++--------------- docs/dev.rst | 6 ++++++ 3 files changed, 41 insertions(+), 15 deletions(-) create mode 100644 CODE_OF_CONDUCT.rst diff --git a/CODE_OF_CONDUCT.rst b/CODE_OF_CONDUCT.rst new file mode 100644 index 00000000..da20d8db --- /dev/null +++ b/CODE_OF_CONDUCT.rst @@ -0,0 +1,9 @@ +Code of conduct +=============== + +This project and repository is governed by Mozilla's code of conduct and +etiquette guidelines. For more details please see the `Mozilla Community +Participation Guidelines +`_ and +`Developer Etiquette Guidelines +`_. diff --git a/README.rst b/README.rst index b728c292..863772e8 100644 --- a/README.rst +++ b/README.rst @@ -51,21 +51,6 @@ please read our wiki page at ``_. -Security -======== - -Bleach is a security-related library. - -We have a responsible security vulnerability reporting process. Please use -that if you're reporting a security issue. - -Security issues are fixed in private. After we land such a fix, we'll do a -release. - -For every release, we mark security issues we've fixed in the ``CHANGES`` in -the **Security issues** section. We include relevant CVE links. - - Installing Bleach ================= @@ -104,6 +89,32 @@ The simplest way to use Bleach is: u'an http://example.com url +Security +======== + +Bleach is a security-related library. + +We have a responsible security vulnerability reporting process. Please use +that if you're reporting a security issue. + +Security issues are fixed in private. After we land such a fix, we'll do a +release. 
+ +For every release, we mark security issues we've fixed in the ``CHANGES`` in +the **Security issues** section. We include relevant CVE links. + + +Code of conduct +=============== + +This project and repository is governed by Mozilla's code of conduct and +etiquette guidelines. For more details please see the `Mozilla Community +Participation Guidelines +`_ and +`Developer Etiquette Guidelines +`_. + + .. _html5lib: https://github.com/html5lib/html5lib-python .. _GitHub: https://github.com/mozilla/bleach .. _ReadTheDocs: https://bleach.readthedocs.io/ diff --git a/docs/dev.rst b/docs/dev.rst index cfa0a8c7..98707048 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -19,6 +19,12 @@ To install Bleach to make changes to it: $ pip install -e . +.. include:: ../CONTRIBUTING.rst + + +.. include:: ../CODE_OF_CONDUCT.rst + + Docs ==== From 4c80d008059257a17af3982c1aba4a3b7879370b Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 20 Sep 2017 09:07:20 -0400 Subject: [PATCH 118/314] Change "Security issues" to "Security fixes" This is clearer regarding the intent of that block. 
--- CHANGES | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 59db3338..5ea7aff9 100644 --- a/CHANGES +++ b/CHANGES @@ -4,7 +4,7 @@ Bleach Changes Version 2.1 (in development) ---------------------------- -**Security issues** +**Security fixes** **Backwards incompatible changes** @@ -40,7 +40,7 @@ Version 2.1 (in development) Version 2.0 (March 8th, 2017) ----------------------------- -**Security issues** +**Security fixes** * None @@ -150,7 +150,7 @@ Version 2.0 (March 8th, 2017) Version 1.5 (November 4th, 2016) -------------------------------- -**Security issues** +**Security fixes** * None @@ -192,7 +192,7 @@ Version 1.5 (November 4th, 2016) Version 1.4.3 (May 23rd, 2016) ------------------------------ -**Security issues** +**Security fixes** * None From 2a9854d9484797beeed1673454980404483774b3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 20 Sep 2017 11:08:11 -0400 Subject: [PATCH 119/314] Fix test_websites to work with Python 3 --- tests_website/data_to_json.py | 2 +- tests_website/open_test_page.py | 2 ++ tests_website/server.py | 16 ++++++++++------ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tests_website/data_to_json.py b/tests_website/data_to_json.py index ffd346f5..debe5a9d 100755 --- a/tests_website/data_to_json.py +++ b/tests_website/data_to_json.py @@ -50,4 +50,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/tests_website/open_test_page.py b/tests_website/open_test_page.py index b812de92..79f4adf2 100755 --- a/tests_website/open_test_page.py +++ b/tests_website/open_test_page.py @@ -2,6 +2,7 @@ import webbrowser + TEST_BROWSERS = set([ # 'mozilla', 'firefox', @@ -29,6 +30,7 @@ ]) REGISTERED_BROWSERS = set(webbrowser._browsers.keys()) + if __name__ == '__main__': for b in TEST_BROWSERS & REGISTERED_BROWSERS: webbrowser.get(b).open_new_tab('http://localhost:8080') diff --git a/tests_website/server.py 
b/tests_website/server.py index 83fcf84a..8a8c6438 100755 --- a/tests_website/server.py +++ b/tests_website/server.py @@ -9,17 +9,19 @@ python server.py """ -import SimpleHTTPServer -import SocketServer -import json +# import SimpleHTTPServer +# import SocketServer + +import six + import bleach PORT = 8080 -class BleachCleanHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): +class BleachCleanHandler(six.moves.SimpleHTTPServer.SimpleHTTPRequestHandler): def do_POST(self): content_len = int(self.headers.getheader('content-length', 0)) body = self.rfile.read(content_len) @@ -36,7 +38,9 @@ def do_POST(self): if __name__ == '__main__': - SocketServer.TCPServer.allow_reuse_address = True # Prevent 'cannot bind to address' errors on restart - httpd = SocketServer.TCPServer(('127.0.0.1', PORT), BleachCleanHandler) + # Prevent 'cannot bind to address' errors on restart + six.moves.socketserver.TCPServer.allow_reuse_address = True + + httpd = six.moves.socketserver.TCPServer(('127.0.0.1', PORT), BleachCleanHandler) print("listening on localhost port %d" % PORT) httpd.serve_forever() From daec5ef18487fa31779165cb104a22b5931b4c3b Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 20 Sep 2017 11:31:06 -0400 Subject: [PATCH 120/314] More Python 3 fixes for tests_websites --- tests_website/server.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tests_website/server.py b/tests_website/server.py index 8a8c6438..edc791a4 100755 --- a/tests_website/server.py +++ b/tests_website/server.py @@ -1,20 +1,17 @@ #!/usr/bin/env python """ -Simple Test/Demo Server for running bleach.clean output -on various desktops. +Simple Test/Demo Server for running bleach.clean output on various +desktops. 
Usage: -python server.py -""" + python server.py -# import SimpleHTTPServer -# import SocketServer +""" import six - import bleach @@ -23,17 +20,26 @@ class BleachCleanHandler(six.moves.SimpleHTTPServer.SimpleHTTPRequestHandler): def do_POST(self): - content_len = int(self.headers.getheader('content-length', 0)) + if six.PY2: + content_len = int(self.headers.getheader('content-length', 0)) + else: + content_len = int(self.headers.get('content-length', 0)) body = self.rfile.read(content_len) print("read %s bytes: %s" % (content_len, body)) + + if six.PY3: + body = body.decode('utf-8') + print('input: %r' % body) cleaned = bleach.clean(body) - print("cleaned %s" % cleaned) self.send_response(200) self.send_header('Content-Length', len(cleaned)) self.send_header('Content-Type', 'text/plain;charset=UTF-8') self.end_headers() + if six.PY3: + cleaned = bytes(cleaned, encoding='utf-8') + print("cleaned: %r" % cleaned) self.wfile.write(cleaned) From 67afdf8ae7d323305ea104c0efb6bcb37547edc2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 27 Jul 2017 13:07:08 -0400 Subject: [PATCH 121/314] Prevent HTMLTokenizer from unescaping entities This overrides the HTMLTokenizer's .consumeEntity() method such that it doesn't convert character entities. This also fixes some other escaping/unescaping oddities so that the output of bleach.clean() is more correct in regards to intended behavior. One thing this breaks is the idempotent property for bleach.clean()--it's no longer idempotent. Since it escapes text more correctly now and that's not an idempotent transform, this is no longer idempotent. For example, bleach.clean() can't differentiate between a user talking about code and saying this: I like my html wrapped in ! and this: I like my html escaped like this <b>! I'm not sure why we thought bleach.clean() could ever be correct and idempotent. Seems like that was an error. 
--- CHANGES | 7 ++++- bleach/sanitizer.py | 66 ++++++++++++++++++++++++++++++++++++++++-- tests/data/13.test.out | 2 +- tests/data/14.test.out | 2 +- tests/data/15.test.out | 2 +- tests/data/16.test.out | 2 +- tests/data/17.test.out | 2 +- tests/data/18.test.out | 2 +- tests/data/19.test.out | 3 +- tests/test_security.py | 15 ++++++++-- 10 files changed, 90 insertions(+), 13 deletions(-) diff --git a/CHANGES b/CHANGES index 5ea7aff9..ae1d52f3 100644 --- a/CHANGES +++ b/CHANGES @@ -17,6 +17,12 @@ Version 2.1 (in development) * clean, linkify: accept only unicode or utf-8-encoded str (#176) +* ``bleach.clean()`` no longer unescapes entities including ones that are missing + a ``;`` at the end which can happen in urls and other places. (#143) + +* ``bleach.clean()`` is no longer idempotent. If you run ``bleach.clean()`` on + text multiple times, it'll escape things again and again. + **Features** **Bug fixes** @@ -36,7 +42,6 @@ Version 2.1 (in development) * add test website and scripts to test ``bleach.clean()`` output in browser; thank you, Greg Guthe! - Version 2.0 (March 8th, 2017) ----------------------------- diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 26cfad2a..f9fb4287 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -4,9 +4,15 @@ from xml.sax.saxutils import unescape import html5lib -from html5lib.constants import namespaces +from html5lib.constants import ( + ReparseException, + namespaces, + prefixes, + tokenTypes, +) from html5lib.filters import sanitizer from html5lib.serializer import HTMLSerializer +from html5lib._tokenizer import HTMLTokenizer from bleach.utils import alphabetize_attributes, force_unicode @@ -44,6 +50,33 @@ ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] +class BleachHTMLTokenizer(HTMLTokenizer): + def consumeEntity(self, allowedChar=None, fromAttribute=False): + # We don't want to consume and convert entities. Instead we put the + # '&' in output. 
+ if fromAttribute: + self.currentToken['data'][-1][1] += '&' + + else: + self.tokenQueue.append({"type": tokenTypes['Characters'], "data": '&'}) + + +class BleachHTMLParser(html5lib.HTMLParser): + def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): + # Override HTMLParser so we can swap out the tokenizer. + self.innerHTMLMode = innerHTML + self.container = container + self.scripting = scripting + self.tokenizer = BleachHTMLTokenizer(stream, parser=self, **kwargs) + self.reset() + + try: + self.mainLoop() + except ReparseException: + self.reset() + self.mainLoop() + + class Cleaner(object): """Cleaner for cleaning HTML fragments of malicious content @@ -104,7 +137,7 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, self.strip_comments = strip_comments self.filters = filters or [] - self.parser = html5lib.HTMLParser(namespaceHTMLElements=False) + self.parser = BleachHTMLParser(namespaceHTMLElements=False) self.walker = html5lib.getTreeWalker('etree') self.serializer = HTMLSerializer( quote_attr_values='always', @@ -338,6 +371,35 @@ def allow_token(self, token): return token + def disallowed_token(self, token): + token_type = token["type"] + if token_type == "EndTag": + token["data"] = "" % token["name"] + + elif token["data"]: + assert token_type in ("StartTag", "EmptyTag") + attrs = [] + for (ns, name), v in token["data"].items(): + attrs.append(' %s="%s"' % ( + name if ns is None else "%s:%s" % (prefixes[ns], name), + # Note: HTMLSerializer escapes attribute values already, so + # if we do it here (like HTMLSerializer does), then we end + # up double-escaping. 
+ v) + ) + token["data"] = "<%s%s>" % (token["name"], ''.join(attrs)) + + else: + token["data"] = "<%s>" % token["name"] + + if token.get("selfClosing"): + token["data"] = token["data"][:-1] + "/>" + + token["type"] = "Characters" + + del token["name"] + return token + def sanitize_css(self, style): """Sanitizes css in style tags""" # disallow urls diff --git a/tests/data/13.test.out b/tests/data/13.test.out index 1c866507..0053081c 100644 --- a/tests/data/13.test.out +++ b/tests/data/13.test.out @@ -1 +1 @@ -<img src="JaVaScRiPt:alert("XSS&lt;WBR">")> \ No newline at end of file +<img src="JaVaScRiPt:alert(&quot;XSS<WBR">&quot;)> diff --git a/tests/data/14.test.out b/tests/data/14.test.out index 8e5ff754..04091589 100644 --- a/tests/data/14.test.out +++ b/tests/data/14.test.out @@ -1 +1 @@ -<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=&#106;&#97;&#118;&#97;&<wbr> \ No newline at end of file +<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;&#99;&#114;&#105;&#112;&<wbr></wbr>#116;&#58;&#97;</imgsrc=&#106;&#97;&#118;&#97;&<wbr> diff --git a/tests/data/15.test.out b/tests/data/15.test.out index 8b90245f..a7dc6e69 100644 --- a/tests/data/15.test.out +++ b/tests/data/15.test.out @@ -1 +1 @@ -le&<wbr></wbr>#114;t('XS<wbr></wbr>;S')> \ No newline at end of file +&#108;&#101;&<wbr></wbr>#114;&#116;&#40;&#39;&#88;&#83<wbr></wbr>;&#83;&#39;&#41> diff --git a/tests/data/16.test.out b/tests/data/16.test.out index 1ecb332b..c8e31d88 100644 --- a/tests/data/16.test.out +++ b/tests/data/16.test.out @@ -1 +1 @@ -<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr></wbr>#0000099ri&<wbr></wbr>#0000112t:&<wbr></wbr>#0000097le&<wbr></wbr>#0000114t(&<wbr></wbr>#0000039XS&<wbr></wbr>#0000083')></imgsrc=&#0000106&#0000097&<wbr> \ No newline at end of file 
+<imgsrc=&#0000106&#0000097&<wbr>#0000118&#0000097&#0000115&<wbr></wbr>#0000099&#0000114&#0000105&<wbr></wbr>#0000112&#0000116&#0000058&<wbr></wbr>#0000097&#0000108&#0000101&<wbr></wbr>#0000114&#0000116&#0000040&<wbr></wbr>#0000039&#0000088&#0000083&<wbr></wbr>#0000083&#0000039&#0000041></imgsrc=&#0000106&#0000097&<wbr> diff --git a/tests/data/17.test.out b/tests/data/17.test.out index ae928a99..8d47f574 100644 --- a/tests/data/17.test.out +++ b/tests/data/17.test.out @@ -1 +1 @@ -<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr></wbr>#x61lert(&<wbr></wbr>#x27XSS')></imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr> \ No newline at end of file +<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63&#x72&#x69&#x70&#x74&#x3A&<wbr></wbr>#x61&#x6C&#x65&#x72&#x74&#x28&<wbr></wbr>#x27&#x58&#x53&#x53&#x27&#x29></imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr> diff --git a/tests/data/18.test.out b/tests/data/18.test.out index 8046c715..e4fe2cf3 100644 --- a/tests/data/18.test.out +++ b/tests/data/18.test.out @@ -1 +1 @@ -<img src="jav ascript:alert(&lt;WBR&gt;'XSS');"> \ No newline at end of file +<img src="jav&#x09;ascript:alert(<WBR>'XSS');"> \ No newline at end of file diff --git a/tests/data/19.test.out b/tests/data/19.test.out index 8eb8794c..4daa11ad 100644 --- a/tests/data/19.test.out +++ b/tests/data/19.test.out @@ -1,2 +1 @@ -<img src="jav -ascript:alert(&lt;WBR&gt;'XSS');"> \ No newline at end of file +<img src="jav&#x0A;ascript:alert(<WBR>'XSS');"> \ No newline at end of file diff --git a/tests/test_security.py b/tests/test_security.py index 0eeb09c6..28e3cf2a 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -8,6 +8,17 @@ from bleach import clean +def test_escaped_entities(): + # html5lib unescapes character entities, so these would become ' and " + # which makes it possible to break out of html attributes. + # + # Verify that bleach.clean() doesn't unescape entities. 
+ assert ( + clean(''"') == + '&#39;&#34;' + ) + + def test_nested_script_tag(): assert ( clean('</script>') == @@ -105,7 +116,7 @@ def test_invalid_tag_char(): def test_unclosed_tag(): assert ( clean('& +-- +>"><script>alert("XSS")</script>& diff --git a/tests/data/10.test b/tests/data/10.test index 268771bc..a6db9f98 100644 --- a/tests/data/10.test +++ b/tests/data/10.test @@ -1 +1,3 @@ +-- +<img src="javascript:alert('XSS');"> diff --git a/tests/data/11.test b/tests/data/11.test index 16a49c70..37cbbfaf 100644 --- a/tests/data/11.test +++ b/tests/data/11.test @@ -1 +1,3 @@ +-- +<img src="javascript:alert('XSS')"> diff --git a/tests/data/12.test b/tests/data/12.test index d4b96e6f..04c7ea8a 100644 --- a/tests/data/12.test +++ b/tests/data/12.test @@ -1 +1,3 @@ +-- +<img src="JaVaScRiPt:alert('XSS')"> diff --git a/tests/data/13.test b/tests/data/13.test index 07279a83..36d4aaee 100644 --- a/tests/data/13.test +++ b/tests/data/13.test @@ -1 +1,3 @@ ")> +-- +<img src="JaVaScRiPt:alert(&quot;XSS<WBR">")> diff --git a/tests/data/14.test b/tests/data/14.test index b704c0b4..f154c73e 100644 --- a/tests/data/14.test +++ b/tests/data/14.test @@ -1 +1,3 @@ #115;crip&#116;:a +-- +<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=&#106;&#97;&#118;&#97;&<wbr> diff --git a/tests/data/15.test b/tests/data/15.test index b6a2de6b..c48c3e41 100644 --- a/tests/data/15.test +++ b/tests/data/15.test @@ -1 +1,3 @@ le&#114;t('XS;S')> +-- +le&<wbr></wbr>#114;t('X&#83<wbr></wbr>;S'&#41> diff --git a/tests/data/16.test b/tests/data/16.test index d66b5921..938240be 100644 --- a/tests/data/16.test +++ b/tests/data/16.test @@ -1 +1,3 @@ #0000118as&#0000099ri&#0000112t:&#0000097le&#0000114t(&#0000039XS&#0000083')> +-- 
+<imgsrc=&#0000106&#0000097&<wbr>#0000118&#0000097&#0000115&<wbr></wbr>#0000099&#0000114&#0000105&<wbr></wbr>#0000112&#0000116&#0000058&<wbr></wbr>#0000097&#0000108&#0000101&<wbr></wbr>#0000114&#0000116&#0000040&<wbr></wbr>#0000039&#0000088&#0000083&<wbr></wbr>#0000083&#0000039&#0000041></imgsrc=&#0000106&#0000097&<wbr> diff --git a/tests/data/17.test b/tests/data/17.test index 6e71b152..166e8845 100644 --- a/tests/data/17.test +++ b/tests/data/17.test @@ -1 +1,3 @@ #x63ript:&#x61lert(&#x27XSS')> +-- +<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63&#x72&#x69&#x70&#x74&#x3A&<wbr></wbr>#x61&#x6C&#x65&#x72&#x74&#x28&<wbr></wbr>#x27&#x58&#x53&#x53&#x27&#x29></imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr> diff --git a/tests/data/18.test b/tests/data/18.test index 1c173723..635461f8 100644 --- a/tests/data/18.test +++ b/tests/data/18.test @@ -1 +1,3 @@ +-- +<img src="jav&#x09;ascript:alert(<WBR>'XSS');"> diff --git a/tests/data/19.test b/tests/data/19.test index e6e79742..1a1ebe41 100644 --- a/tests/data/19.test +++ b/tests/data/19.test @@ -1 +1,3 @@ +-- +<img src="jav&#x0A;ascript:alert(<WBR>'XSS');"> diff --git a/tests/data/2.test b/tests/data/2.test index 21b93db3..aefcbe26 100644 --- a/tests/data/2.test +++ b/tests/data/2.test @@ -1 +1,3 @@ "> +-- +"><style>@import"javascript:alert('XSS')";</style> diff --git a/tests/data/20.test b/tests/data/20.test index 614b544f..ceae0bd8 100644 --- a/tests/data/20.test +++ b/tests/data/20.test @@ -1 +1,3 @@ +-- +<img src="jav&#x0D;ascript:alert(<WBR>'XSS');"> diff --git a/tests/data/3.test b/tests/data/3.test index 8dc3a4ee..67f3591b 100644 --- a/tests/data/3.test +++ b/tests/data/3.test @@ -1 +1,3 @@ >"'> +-- 
+>"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)></img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> diff --git a/tests/data/4.test b/tests/data/4.test index c4cf51cd..10438d81 100644 --- a/tests/data/4.test +++ b/tests/data/4.test @@ -1 +1,3 @@ ipt type="text/javascript">alert("foo");script> +-- +<scr<script>ipt type="text/javascript">alert("foo");script<del></del>></scr<script> diff --git a/tests/data/5.test b/tests/data/5.test index 0b03876b..dd45837a 100644 --- a/tests/data/5.test +++ b/tests/data/5.test @@ -1 +1,3 @@ >%22%27> +-- +>%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22></img%20src%3d%22javascript:alert(%27%20xss%27)%22> diff --git a/tests/data/7.test b/tests/data/7.test index 827f9b9e..73f5cab1 100644 --- a/tests/data/7.test +++ b/tests/data/7.test @@ -1 +1,3 @@ "> +-- +"> diff --git a/tests/data/8.test b/tests/data/8.test index ddf33a96..f5be4f25 100644 --- a/tests/data/8.test +++ b/tests/data/8.test @@ -1 +1,3 @@ >" +-- +>" diff --git a/tests/data/9.test b/tests/data/9.test index 9cf58659..26d27f78 100644 --- a/tests/data/9.test +++ b/tests/data/9.test @@ -1 +1,3 @@ '';!--"=&{()} +-- +'';!--"<xss>=&{()}</xss> diff --git a/tests/test_security.py b/tests/test_security.py index 4c710775..9dd49338 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -197,12 +197,15 @@ def get_tests(): return testcases -@pytest.mark.parametrize('fn, text', get_tests()) -def test_regressions(fn, text): +@pytest.mark.parametrize('fn, test_case', get_tests()) +def test_regressions(fn, test_case): """Regression tests for clean so we can see if there are issues""" - expected = six.text_type(open(fn + '.out', 'r').read()) + test_data, expected = 
test_case.split('\n--\n') # NOTE(willkg): This strips input and expected which makes it easier to # maintain the files. If there comes a time when the input needs whitespace # at the beginning or end, then we'll have to figure out something else. - assert clean(text.strip()) == expected.strip() + test_data = test_data.strip() + expected = expected.strip() + + assert clean(test_data) == expected From 588286152b0c24d2d2c9e68d4761c14f00ce88b6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 3 Mar 2018 10:57:04 -0500 Subject: [PATCH 145/314] Merge all the clean tests into one file and clean up * Moves test_security.py tests into test_clean.py * Removes duplicate tests and unhelpful tests * Adds additional helpful test cases * Reworks some tests to be easier and run to read by parametrizing them * Adds comments and adjusts function names to be more helpful --- tests/test_clean.py | 549 ++++++++++++++++++++++++++++++----------- tests/test_security.py | 211 ---------------- 2 files changed, 405 insertions(+), 355 deletions(-) delete mode 100644 tests/test_security.py diff --git a/tests/test_clean.py b/tests/test_clean.py index c5f78f73..799ae186 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -1,96 +1,210 @@ +import os + from html5lib.filters.base import Filter import pytest -import bleach +from bleach import clean from bleach.sanitizer import Cleaner -def test_empty(): - assert bleach.clean('') == '' +def test_clean_idempotent(): + """Make sure that applying the filter twice doesn't change anything.""" + dirty = 'invalid & < extra http://link.com' + assert clean(clean(dirty)) == clean(dirty) -def test_nbsp(): - assert bleach.clean(' test string ') == ' test string ' +def test_only_text_is_cleaned(): + some_text = 'text' + some_type = int + no_type = None + assert clean(some_text) == some_text -def test_comments_only(): - comment = '' - assert bleach.clean(comment) == '' - assert bleach.clean(comment, strip_comments=False) == comment + with 
pytest.raises(TypeError) as e: + clean(some_type) + assert "argument cannot be of 'type' type" in str(e) - open_comment = ''.format(open_comment) - ) + with pytest.raises(TypeError) as e: + clean(no_type) + assert "NoneType" in str(e) -def test_with_comments(): - text = 'Just text' - assert bleach.clean(text) == 'Just text' - assert bleach.clean(text, strip_comments=False) == text +def test_empty(): + assert clean('') == '' -def test_no_html(): - assert bleach.clean('no html string') == 'no html string' +def test_content_has_no_html(): + assert clean('no html string') == 'no html string' -def test_allowed_html(): - assert ( - bleach.clean('an allowed tag') == +@pytest.mark.parametrize('data, expected', [ + ( + 'an allowed tag', 'an allowed tag' - ) - assert ( - bleach.clean('another good tag') == + ), + + ( + 'another good tag', 'another good tag' ) +]) +def test_content_has_allowed_html(data, expected): + assert clean(data) == expected -def test_bad_html(): +def test_html_is_lowercased(): assert ( - bleach.clean('a fixed tag') == - 'a fixed tag' + clean('foo') == + 'foo' ) -def test_function_arguments(): - TAGS = ['span', 'br'] - ATTRS = {'span': ['style']} +@pytest.mark.parametrize('data, should_strip, expected', [ + # Regular comment + ( + '', + True, + '' + ), - text = 'a
test' - assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == - 'a
test' + # Open comment with no close comment bit + ( + '' + ), + ( + '' + ), + + # Comment with text to the right + ( + 'text', + True, + 'text' + ), + ( + 'text', + True, + 'text' + ), + ( + 'text', + False, + 'text' + ), + ( + 'text', + False, + 'text' + ), + + # Comment with text to the left + ( + 'text', + True, + 'text' + ), + ( + 'text', + True, + 'text' + ), + ( + 'text', + False, + 'text' + ), + ( + 'text', + False, + 'text' ) +]) +def test_comments(data, should_strip, expected): + assert clean(data, strip_comments=should_strip) == expected -def test_named_arguments(): - ATTRS = {'a': ['rel', 'href']} +@pytest.mark.parametrize('data, expected', [ + # Disallowed tag is escaped + ('', '<img src="javascript:alert(\'XSS\');">'), + + # Test with parens + ('a test', 'a <script>safe()</script> test'), + + # Test with braces + ('a test', 'a <style>body{}</style> test'), +]) +def test_disallowed_tags(data, expected): + assert clean(data) == expected - text = 'xx.com' - assert bleach.clean(text) == 'xx.com' + +def test_invalid_char_in_tag(): + # NOTE(willkg): Two possible outcomes because attrs aren't ordered + assert ( + clean('') in + [ + '<script src="http://xx.com/xss.js" xss=""></script>', + '<script xss="" src="http://xx.com/xss.js"></script>' + ] + ) assert ( - bleach.clean(text, attributes=ATTRS) == - 'xx.com' + clean('') == + '<script src="http://xx.com/xss.js"></script>' ) -def test_disallowed_html(): +def test_unclosed_tag(): + assert ( + clean('a fixed tag') == + 'a fixed tag' + ) assert ( - bleach.clean('a test') == - 'a <script>safe()</script> test' + clean('/script>') == + '<<script>script>evil()<</script>/script>' + ) + assert ( + clean('<script>evil()</script>') == + '<<x>script>evil()<</x>/script>' + ) + assert ( + clean('>evil()>') == + '<script<script>>evil()></script<script>' ) @@ -100,13 +214,14 @@ def test_bad_href(): ('tag < and entity', 'tag < and entity'), ]) def test_bare_entities(text, expected): - assert bleach.clean(text) == expected + 
assert clean(text) == expected @pytest.mark.parametrize('text, expected', [ # Test character entities ('&', '&'), (' ', ' '), + ('  test string  ', '  test string  '), ('<em>strong</em>', '<em>strong</em>'), # Test character entity at beginning of string @@ -154,75 +269,160 @@ def test_bare_entities(text, expected): # Test non-numeric entities ('&#', '&#'), - ('&#<', '&#<') + ('&#<', '&#<'), + + # html5lib tokenizer unescapes character entities, so these would become ' + # and " which makes it possible to break out of html attributes. + # + # Verify that clean() doesn't unescape entities. + (''"', ''"'), ]) def test_character_entities(text, expected): - assert bleach.clean(text) == expected + assert clean(text) == expected -def test_weird_strings(): - s = 'with html tags' - assert ( - bleach.clean(text, strip=True) == +@pytest.mark.parametrize('data, kwargs, expected', [ + # All tags are allowed, so it strips nothing + ( + 'a test with html tags', + {'strip': True}, 'a test with html tags' - ) + ), - text = 'a test with html tags' - assert ( - bleach.clean(text, strip=True) == + # img tag is disallowed, so it's stripped + ( + 'a test with html tags', + {'strip': True}, 'a test with html tags' - ) + ), - text = '

link text

' - assert ( - bleach.clean(text, tags=['p'], strip=True) == + # a tag is disallowed, so it's stripped + ( + '

link text

', + {'tags': ['p'], 'strip': True}, '

link text

' - ) - text = '

multiply nested text

' - assert ( - bleach.clean(text, tags=['p'], strip=True) == + ), + + # handle nested disallowed tag + ( + '

multiply nested text

', + {'tags': ['p'], 'strip': True}, '

multiply nested text

' - ) + ), - text = '

' - assert ( - bleach.clean(text, tags=['p', 'a'], strip=True) == + # handle disallowed tag that's deep in the tree + ( + '

', + {'tags': ['a', 'p'], 'strip': True}, '

' - ) + ), +]) +def test_stripping_tags(data, kwargs, expected): + assert clean(data, **kwargs) == expected + + +@pytest.mark.parametrize('data, expected', [ + ( + 'pt>alert(1)ipt>', + 'pt>alert(1)ipt>' + ), + ( + 'pt>pt>alert(1)', + 'pt>pt>alert(1)' + ), +]) +def test_stripping_tags_is_safe(data, expected): + """Test stripping tags shouldn't result in malicious content""" + assert clean(data, strip=True) == expected def test_allowed_styles(): + """Test allowed styles""" ATTRS = ['style'] STYLE = ['color'] assert ( - bleach.clean('', attributes=ATTRS) == + clean('', attributes=ATTRS) == '' ) text = '' - assert bleach.clean(text, attributes=ATTRS, styles=STYLE) == text + assert clean(text, attributes=ATTRS, styles=STYLE) == text text = '' assert ( - bleach.clean(text, attributes=ATTRS, styles=STYLE) == + clean(text, attributes=ATTRS, styles=STYLE) == '' ) -def test_lowercase_html(): - """We should output lowercase HTML.""" +def test_href_with_wrong_tag(): assert ( - bleach.clean('BAR', attributes=['class']) == - 'BAR' + clean('no link') == + 'no link' ) +def test_disallowed_attr(): + IMG = ['img', ] + IMG_ATTR = ['src'] + + assert ( + clean('test') == + 'test' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + + +def test_unquoted_attr_values_are_quoted(): + assert ( + clean('myabbr') == + 'myabbr' + ) + + +def test_unquoted_event_handler_attr_value(): + assert ( + clean('xx.com') == + 'xx.com' + ) + + +def test_invalid_filter_attr(): + IMG = ['img', ] + IMG_ATTR = { + 'img': lambda tag, name, val: name == 'src' and val == "http://example.com/" + } + + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + assert ( + clean('', tags=IMG, attributes=IMG_ATTR) == + '' + ) + + +def test_poster_attribute(): + """Poster attributes should not allow javascript.""" + tags = ['video'] + attrs = {'video': ['poster']} + + test = '' + assert clean(test, tags=tags, attributes=attrs) 
== '' + + ok = '' + assert clean(ok, tags=tags, attributes=attrs) == ok + + def test_attributes_callable(): """Verify attributes can take a callable""" ATTRS = lambda tag, name, val: name == 'title' @@ -230,7 +430,7 @@ def test_attributes_callable(): text = u'example' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == u'example' ) @@ -245,7 +445,7 @@ def test_attributes_wildcard(): text = 'both can have ' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == 'both can have ' ) @@ -258,7 +458,7 @@ def test_attributes_wildcard_callable(): TAGS = ['a'] assert ( - bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + clean(u'example', tags=TAGS, attributes=ATTRS) == u'example' ) @@ -275,12 +475,12 @@ def img_test(tag, name, val): text = 'foo blah baz' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == u'foo baz' ) text = 'foo blah baz' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == u'foo baz' ) @@ -293,7 +493,7 @@ def test_attributes_tag_list(): TAGS = ['a'] assert ( - bleach.clean(u'example', tags=TAGS, attributes=ATTRS) == + clean(u'example', tags=TAGS, attributes=ATTRS) == u'example' ) @@ -305,11 +505,44 @@ def test_attributes_list(): text = u'example' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == u'example' ) +@pytest.mark.parametrize('data, kwargs, expected', [ + # javascript: is not allowed by default + ( + 'xss', + {}, + 'xss' + ), + + # File protocol is not allowed by default + ( + 'foo', + {}, + 'foo' + ), + + # Specified protocols are allowed + ( + 'allowed href', + {'protocols': ['myprotocol']}, + 'allowed href' + ), + + # Unspecified protocols are not allowed + ( + 'invalid href', + {'protocols': ['myprotocol']}, + 'invalid href' + ) +]) +def 
test_uri_value_allowed_protocols(data, kwargs, expected): + assert clean(data, **kwargs) == expected + + def test_svg_attr_val_allows_ref(): """Unescape values in svg attrs that allow url references""" # Local IRI, so keep it @@ -320,7 +553,7 @@ def test_svg_attr_val_allows_ref(): text = '' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == '' ) @@ -331,7 +564,7 @@ def test_svg_attr_val_allows_ref(): } text = '' assert ( - bleach.clean(text, tags=TAGS, attributes=ATTRS) == + clean(text, tags=TAGS, attributes=ATTRS) == '' ) @@ -353,7 +586,7 @@ def test_svg_allow_local_href(text, expected): ATTRS = { 'pattern': ['id', 'href'], } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + assert clean(text, tags=TAGS, attributes=ATTRS) == expected @pytest.mark.parametrize('text, expected', [ @@ -372,73 +605,77 @@ def test_svg_allow_local_href_nonlocal(text, expected): ATTRS = { 'pattern': ['id', 'href'], } - assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + assert clean(text, tags=TAGS, attributes=ATTRS) == expected + + +@pytest.mark.xfail(reason='regression from bleach 1.4') +def test_weird_strings(): + s = '= 0.99999999: changed API') +@pytest.mark.xfail(reason='regression from bleach 1.4') def test_sarcasm(): """Jokes should crash.""" - dirty = 'Yeah right ' - clean = 'Yeah right <sarcasm/>' - assert bleach.clean(dirty) == clean - + assert ( + clean('Yeah right ') == + 'Yeah right <sarcasm/>' + ) -def test_user_defined_protocols_valid(): - valid_href = 'allowed href' - assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href +@pytest.mark.parametrize('data, expected', [ + # Convert bell + ('1\a23', '1?23'), -def test_user_defined_protocols_invalid(): - invalid_href = 'invalid href' - cleaned_href = 'invalid href' - assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href + # Convert backpsace + ('1\b23', '1?23'), + # Convert formfeed + ('1\v23', 
'1?23'), -def test_filters(): - # Create a Filter that changes all the attr values to "moo" - class MooFilter(Filter): - def __iter__(self): - for token in Filter.__iter__(self): - if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: - for attr, value in token['data'].items(): - token['data'][attr] = 'moo' + # Convert vertical tab + ('1\f23', '1?23'), - yield token + # Convert a bunch of characters in a string + ('import y\bose\bm\bi\bt\be\b', 'import y?ose?m?i?t?e?'), +]) +def test_invisible_characters(data, expected): + assert clean(data) == expected - ATTRS = { - 'img': ['rel', 'src'] - } - TAGS = ['img'] - cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) +def get_tests(): + """Retrieves regression tests from data/ directory - dirty = 'this is cute! ' - assert ( - cleaner.clean(dirty) == - 'this is cute! ' - ) + :returns: list of ``(filename, filedata)`` tuples + """ + datadir = os.path.join(os.path.dirname(__file__), 'data') + tests = [ + os.path.join(datadir, fn) for fn in os.listdir(datadir) + if fn.endswith('.test') + ] + # Sort numerically which makes it easier to iterate through them + tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0])) -def test_clean_idempotent(): - """Make sure that applying the filter twice doesn't change anything.""" - dirty = 'invalid & < extra http://link.com' - assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty) + testcases = [ + (fn, open(fn, 'r').read()) for fn in tests + ] + return testcases -def test_only_text_is_cleaned(): - some_text = 'text' - some_type = int - no_type = None - assert bleach.clean(some_text) == some_text +@pytest.mark.parametrize('fn, test_case', get_tests()) +def test_regressions(fn, test_case): + """Regression tests for clean so we can see if there are issues""" + test_data, expected = test_case.split('\n--\n') - with pytest.raises(TypeError) as e: - bleach.clean(some_type) - assert "argument cannot be of 'type' type" in str(e) + # NOTE(willkg): 
This strips input and expected which makes it easier to + # maintain the files. If there comes a time when the input needs whitespace + # at the beginning or end, then we'll have to figure out something else. + test_data = test_data.strip() + expected = expected.strip() - with pytest.raises(TypeError) as e: - bleach.clean(no_type) - assert "NoneType" in str(e) + assert clean(test_data) == expected class TestCleaner: @@ -452,3 +689,27 @@ def test_basics(self): cleaner.clean('a
test') == 'a
test' ) + + def test_filters(self): + # Create a Filter that changes all the attr values to "moo" + class MooFilter(Filter): + def __iter__(self): + for token in Filter.__iter__(self): + if token['type'] in ['StartTag', 'EmptyTag'] and token['data']: + for attr, value in token['data'].items(): + token['data'][attr] = 'moo' + + yield token + + ATTRS = { + 'img': ['rel', 'src'] + } + TAGS = ['img'] + + cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) + + dirty = 'this is cute! ' + assert ( + cleaner.clean(dirty) == + 'this is cute! ' + ) diff --git a/tests/test_security.py b/tests/test_security.py deleted file mode 100644 index 9dd49338..00000000 --- a/tests/test_security.py +++ /dev/null @@ -1,211 +0,0 @@ -"""More advanced security tests""" - -import os - -import pytest -import six - -from bleach import clean - - -def test_escaped_entities(): - # html5lib unescapes character entities, so these would become ' and " - # which makes it possible to break out of html attributes. - # - # Verify that bleach.clean() doesn't unescape entities. 
- assert ( - clean(''"') == - ''"' - ) - - -def test_nested_script_tag(): - assert ( - clean('</script>') == - '<<script>script>evil()<</script>/script>' - ) - assert ( - clean('<script>evil()</script>') == - '<<x>script>evil()<</x>/script>' - ) - - -def test_nested_script_tag_r(): - assert ( - clean('>evil()>') == - '<script<script>>evil()></script<script>' - ) - - -def test_invalid_attr(): - IMG = ['img', ] - IMG_ATTR = ['src'] - - assert ( - clean('test') == - 'test' - ) - assert ( - clean('', tags=IMG, attributes=IMG_ATTR) == - '' - ) - assert ( - clean('', tags=IMG, attributes=IMG_ATTR) == - '' - ) - - -def test_unquoted_attr(): - assert ( - clean('myabbr') == - 'myabbr' - ) - - -def test_unquoted_event_handler(): - assert ( - clean('xx.com') == - 'xx.com' - ) - - -def test_invalid_attr_value(): - assert ( - clean('') == - '<img src="javascript:alert(\'XSS\');">' - ) - - -def test_invalid_href_attr(): - assert ( - clean('xss') == - 'xss' - ) - - -def test_invalid_filter_attr(): - IMG = ['img', ] - IMG_ATTR = { - 'img': lambda tag, name, val: name == 'src' and val == "http://example.com/" - } - - assert ( - clean('', tags=IMG, attributes=IMG_ATTR) == - '' - ) - assert ( - clean('', tags=IMG, attributes=IMG_ATTR) == - '' - ) - - -def test_invalid_tag_char(): - assert ( - clean('') in - [ - '<script src="http://xx.com/xss.js" xss=""></script>', - '<script xss="" src="http://xx.com/xss.js"></script>' - ] - ) - assert ( - clean('') == - '<script src="http://xx.com/xss.js"></script>' - ) - - -def test_unclosed_tag(): - assert ( - clean('ipt>' - assert clean(s, strip=True) == 'pt>alert(1)ipt>' - s = 'pt>pt>alert(1)' - assert clean(s, strip=True) == 'pt>pt>alert(1)' - - -def test_poster_attribute(): - """Poster attributes should not allow javascript.""" - tags = ['video'] - attrs = {'video': ['poster']} - test = '' - assert clean(test, tags=tags, attributes=attrs) == '' - ok = '' - assert clean(ok, tags=tags, attributes=attrs) == ok - - -def test_feed_protocol(): - 
assert clean('foo') == 'foo' - - -@pytest.mark.parametrize('data, expected', [ - # Convert bell - ('1\a23', '1?23'), - - # Convert backpsace - ('1\b23', '1?23'), - - # Convert formfeed - ('1\v23', '1?23'), - - # Convert vertical tab - ('1\f23', '1?23'), - - # Convert a bunch of characters in a string - ('import y\bose\bm\bi\bt\be\b', 'import y?ose?m?i?t?e?'), -]) -def test_invisible_characters(data, expected): - assert clean(data) == expected - - -def get_tests(): - """Retrieves regression tests from data/ directory - - :returns: list of ``(filename, filedata)`` tuples - - """ - datadir = os.path.join(os.path.dirname(__file__), 'data') - tests = [ - os.path.join(datadir, fn) for fn in os.listdir(datadir) - if fn.endswith('.test') - ] - # Sort numerically which makes it easier to iterate through them - tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0])) - - testcases = [ - (fn, open(fn, 'r').read()) for fn in tests - ] - - return testcases - - -@pytest.mark.parametrize('fn, test_case', get_tests()) -def test_regressions(fn, test_case): - """Regression tests for clean so we can see if there are issues""" - test_data, expected = test_case.split('\n--\n') - - # NOTE(willkg): This strips input and expected which makes it easier to - # maintain the files. If there comes a time when the input needs whitespace - # at the beginning or end, then we'll have to figure out something else. 
- test_data = test_data.strip() - expected = expected.strip() - - assert clean(test_data) == expected From 18ecceb5f61896e1a88e8d965b1e61e860ded2a5 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 3 Mar 2018 11:15:49 -0500 Subject: [PATCH 146/314] Correct a regression comment and fix a test I misunderstood --- tests/test_clean.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_clean.py b/tests/test_clean.py index 799ae186..221addba 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -608,13 +608,12 @@ def test_svg_allow_local_href_nonlocal(text, expected): assert clean(text, tags=TAGS, attributes=ATTRS) == expected -@pytest.mark.xfail(reason='regression from bleach 1.4') def test_weird_strings(): s = '""" assert ( From d580f0abba6ae62da22e59be4355ea1d690eb1f5 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 3 Mar 2018 18:15:22 -0500 Subject: [PATCH 147/314] Fix MANIFEST and data_to_json.py related to recent changes I squashed test cases into single files--no more .out files. This carries that change through to MANIFEST.in and our tests_website system. --- MANIFEST.in | 2 +- tests_website/data_to_json.py | 38 +++++++++++++++++++---------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 1ae68e20..14ad79c7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -12,6 +12,6 @@ include docs/Makefile recursive-include docs *.rst -recursive-include tests *.py *.test *.out +recursive-include tests *.py *.test recursive-include tests_website *.html *.py *.rst diff --git a/tests_website/data_to_json.py b/tests_website/data_to_json.py index debe5a9d..5870d64c 100755 --- a/tests_website/data_to_json.py +++ b/tests_website/data_to_json.py @@ -2,12 +2,12 @@ """ Util to write a directory of test cases with input filenames -.test and output filenames .test.out as JSON to -stdout. +.test as JSON to stdout. 
-example: +example:: + + $ python tests/data_to_json.py tests/data > testcases.json -python tests/data_to_json.py tests/data > testcases.json """ import argparse @@ -21,29 +21,33 @@ def main(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument('data_dir', - help='directory containing test cases with input files' - ' named .test and output .test.out') + parser.add_argument( + 'data_dir', + help=( + 'directory containing test cases with names like .test' + ) + ) args = parser.parse_args() filenames = os.listdir(args.data_dir) ins = [os.path.join(args.data_dir, f) for f in filenames if fnmatch.fnmatch(f, '*.test')] - outs = [os.path.join(args.data_dir, f) for f in filenames if fnmatch.fnmatch(f, '*.test.out')] testcases = [] - for infn, outfn in zip(ins, outs): + for infn in ins: case_name = infn.rsplit('.test', 1)[0] - with open(infn, 'r') as fin, open(outfn, 'r') as fout: - payload = fin.read()[:-1] + with open(infn, 'r') as fin: + data, expected = fin.read().split('\n--\n') + data = data.strip() + expected = expected.strip() + testcases.append({ - "title": case_name, - "input_filename": infn, - "output_filename": outfn, - "payload": payload, - "actual": bleach.clean(payload), - "expected": fout.read(), + 'title': case_name, + 'input_filename': infn, + 'payload': data, + 'actual': bleach.clean(data), + 'expected': expected, }) print(json.dumps(testcases, indent=4, sort_keys=True)) From 73dfef1d3b96c2e432660d8d2f2e9d0eaa230e36 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 3 Mar 2018 09:58:37 -0500 Subject: [PATCH 148/314] Fix url sanitizing Fixes a security issue where url sanitizing wouldn't work if there were character entities breaking up the scheme. This allowed javascript urls even when they were not explicitly allowed. 
--- bleach/sanitizer.py | 134 ++++++++++++++++++++++++++++++++++++++------ tests/test_clean.py | 98 ++++++++++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 22 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 81df765b..ac6a55cb 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -4,6 +4,7 @@ import string import six +from six.moves.urllib.parse import urlparse from xml.sax.saxutils import unescape import html5lib @@ -27,8 +28,11 @@ from bleach.utils import alphabetize_attributes, force_unicode +#: Map of entity name to expanded entity +ENTITIES = entities + #: Trie of html entity string -> character representation -ENTITIES_TRIE = Trie(entities) +ENTITIES_TRIE = Trie(ENTITIES) #: List of allowed tags ALLOWED_TAGS = [ @@ -79,13 +83,61 @@ INVISIBLE_REPLACEMENT_CHAR = '?' +def convert_entity(value): + """Convert an entity (minus the & and ; part) into what it represents + + This handles numeric, hex, and text entities. + + :arg value: the string (minus the ``&`` and ``;`` part) to convert + + :returns: unicode character + + """ + if value[0] == '#': + if value[1] in ('x', 'X'): + return six.unichr(int(value[2:], 16)) + return six.unichr(int(value[1:], 10)) + + return ENTITIES[value] + + +def convert_entities(text): + """Converts all found entities in the text + + :arg text: the text to convert entities in + + :returns: unicode text with converted entities + + """ + if '&' not in text: + return text + + new_text = [] + for part in next_possible_entity(text): + if not part: + continue + + if part.startswith('&'): + entity = match_entity(part) + if entity is not None: + new_text.append(convert_entity(entity)) + remainder = part[len(entity) + 2:] + if part: + new_text.append(remainder) + continue + + new_text.append(part) + + return u''.join(new_text) + + class BleachHTMLTokenizer(HTMLTokenizer): def consumeEntity(self, allowedChar=None, fromAttribute=False): # We don't want to consume and convert entities, so this overrides 
the # html5lib tokenizer's consumeEntity so that it's now a no-op. # # However, when that gets called, it's consumed an &, so we put that in - # the steam. + # the stream. if fromAttribute: self.currentToken['data'][-1][1] += '&' @@ -479,15 +531,69 @@ def sanitize_characters(self, token): new_tokens.append({'type': 'Entity', 'name': entity}) # Length of the entity plus 2--one for & at the beginning # and and one for ; at the end - part = part[len(entity) + 2:] - if part: - new_tokens.append({'type': 'Characters', 'data': part}) + remainder = part[len(entity) + 2:] + if remainder: + new_tokens.append({'type': 'Characters', 'data': remainder}) continue new_tokens.append({'type': 'Characters', 'data': part}) return new_tokens + def sanitize_uri_value(self, value, allowed_protocols): + """Checks a uri value to see if it's allowed + + :arg value: the uri value to sanitize + :arg allowed_protocols: list of allowed protocols + + :returns: allowed value or None + + """ + # NOTE(willkg): This transforms the value into one that's easier to + # match and verify, but shouldn't get returned since it's vastly + # different than the original value. 
+ + # Convert all character entities in the value + new_value = convert_entities(value) + + # Nix single quote, whitespace, and non-printable charcters + new_value = re.sub( + "[`\000-\040\177-\240\s]+", + '', + new_value + ) + + # Remove REPLACEMENT characters + new_value = new_value.replace('\ufffd', '') + + # Lowercase it--this breaks the value, but makes it easier to match + # against + new_value = new_value.lower() + + # Drop attributes with uri values that have protocols that aren't + # allowed + parsed = urlparse(new_value) + if parsed.scheme: + # If urlparse found a scheme, check that + if parsed.scheme in allowed_protocols: + return value + + else: + # Allow uris that are just an anchor + if new_value.startswith('#'): + return value + + # Handle protocols that urlparse doesn't recognize like "myprotocol" + if ':' in new_value and new_value.split(':')[0] in allowed_protocols: + return value + + # If there's no protocol/scheme specified, then assume it's "http" + # and see if that's allowed + if 'http' in allowed_protocols: + return value + + return None + def allow_token(self, token): """Handles the case where we're allowing the tag""" if 'data' in token: @@ -508,21 +614,13 @@ def allow_token(self, token): if not self.attr_filter(token['name'], name, val): continue - # Look at attributes that have uri values + # Drop attributes with uri values that use a disallowed protocol + # Sanitize attributes with uri values if namespaced_name in self.attr_val_is_uri: - val_unescaped = re.sub( - "[`\000-\040\177-\240\s]+", - '', - unescape(val)).lower() - - # Remove replacement characters from unescaped characters. 
- val_unescaped = val_unescaped.replace("\ufffd", "") - - # Drop attributes with uri values that have protocols that - # aren't allowed - if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and - (val_unescaped.split(':')[0] not in self.allowed_protocols)): + new_value = self.sanitize_uri_value(val, self.allowed_protocols) + if new_value is None: continue + val = new_value # Drop values in svg attrs with non-local IRIs if namespaced_name in self.svg_attr_val_allows_ref: diff --git a/tests/test_clean.py b/tests/test_clean.py index 221addba..f680e8e1 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -213,7 +213,7 @@ def test_nested_script_tag(): ('an < entity', 'an < entity'), ('tag < and entity', 'tag < and entity'), ]) -def test_bare_entities(text, expected): +def test_bare_entities_get_escaped_correctly(text, expected): assert clean(text) == expected @@ -277,7 +277,7 @@ def test_bare_entities(text, expected): # Verify that clean() doesn't unescape entities. (''"', ''"'), ]) -def test_character_entities(text, expected): +def test_character_entities_handling(text, expected): assert clean(text) == expected @@ -534,10 +534,100 @@ def test_attributes_list(): # Unspecified protocols are not allowed ( - 'invalid href', + 'invalid href', {'protocols': ['myprotocol']}, 'invalid href' - ) + ), + + # Anchors are ok + ( + 'foo', + {'protocols': []}, + 'foo' + ), + + # Allow implicit http if allowed + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + ( + 'valid', + {'protocols': ['http']}, + 'valid' + ), + + # Disallow implicit http if disallowed + ( + 'foo', + {'protocols': []}, + 'foo' + ), + ( + 'foo', + {'protocols': []}, + 'foo' + ), + ( + 'foo', + {'protocols': []}, + 'foo' + ), + ( + 'foo', + {'protocols': []}, + 'foo' + ), 
+ ( + 'foo', + {'protocols': []}, + 'foo' + ), + ( + 'foo', + {'protocols': []}, + 'foo' + ), + + # Disallowed protocols with sneaky character entities + ( + 'alert', + {}, + 'alert' + ), + ( + 'alert', + {}, + 'alert' + ), + + # Checking the uri should change it at all + ( + 'foo', + {}, + 'foo' + ), ]) def test_uri_value_allowed_protocols(data, kwargs, expected): assert clean(data, **kwargs) == expected From 61bf0e6db3bdce6294633555e08dd061af465c3c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 5 Mar 2018 16:08:49 -0500 Subject: [PATCH 149/314] Fix errant comment --- bleach/sanitizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index ac6a55cb..56f6d960 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -556,7 +556,7 @@ def sanitize_uri_value(self, value, allowed_protocols): # Convert all character entities in the value new_value = convert_entities(value) - # Nix single quote, whitespace, and non-printable charcters + # Nix backtick, space characters, and control characters new_value = re.sub( "[`\000-\040\177-\240\s]+", '', From 9584f42051c0039cb0f27a617e8ab3e945018cc6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 5 Mar 2018 16:33:03 -0500 Subject: [PATCH 150/314] Prep for 2.1.3 release --- CHANGES | 30 +++++++++++++++++++++++++++++- bleach/__init__.py | 4 ++-- docs/dev.rst | 2 +- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 47bf3906..25789814 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,34 @@ -Bleach Changes +Bleach changes ============== +Version 2.1.3 (March 5th, 2018) +------------------------------- + +**Security fixes** + +* Attributes that have URI values weren't properly sanitized if the + values contained character entities. Using character entities, it + was possible to construct a URI value with a scheme that was not + allowed that would slide through unsanitized. 
+ + This security issue was introduced in Bleach 2.1. Anyone using + Bleach 2.1 is highly encouraged to upgrade. + + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +* Fixed some other edge cases for attribute URI value sanitizing and + improved testing of this code. + + Version 2.1.2 (December 7th, 2017) ---------------------------------- diff --git a/bleach/__init__.py b/bleach/__init__.py index 8ed2c516..b81b0bbe 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -33,9 +33,9 @@ # yyyymmdd -__releasedate__ = '' +__releasedate__ = '20180305' # x.y.z or x.y.z.dev0 -- semver -__version__ = '2.1.3.dev0' +__version__ = '2.1.3' VERSION = parse_version(__version__) diff --git a/docs/dev.rst b/docs/dev.rst index d27a62ed..b0302524 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -74,7 +74,7 @@ Release process 3. Run the doctests:: $ cd docs/ - $ make doctests + $ make doctest 4. Verify everything works From 3e9b9ec55bbec5906800c3838d0840b4741f74d9 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 6 Mar 2018 09:26:05 -0500 Subject: [PATCH 151/314] Add tests for sanitizing urls in css properties --- bleach/sanitizer.py | 4 ++-- tests/test_css.py | 52 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 56f6d960..09cae199 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -684,10 +684,10 @@ def disallowed_token(self, token): def sanitize_css(self, style): """Sanitizes css in style tags""" - # disallow urls + # Drop any url values style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) - # gauntlet + # The gauntlet of sanitization # Validate the css in the style tag and if it's not valid, then drop # the whole thing. 
diff --git a/tests/test_css.py b/tests/test_css.py index d8880d78..ad81f594 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -66,7 +66,6 @@ ), ]) def test_allowed_css(data, styles, expected): - p_single = '

bar

' p_double = "

bar

" @@ -89,6 +88,57 @@ def test_valid_css(): ) +@pytest.mark.parametrize('data, expected', [ + # No url--unchanged + ( + '

foo

', + '

foo

' + ), + + # Verify urls with no quotes, single quotes, and double quotes are all dropped + ( + '

foo

', + '

foo

' + ), + ( + '

foo

', + '

foo

' + ), + ( + '

foo

', + '

foo

' + ), + + # Verify urls with spacing + ( + '

foo

', + '

foo

' + ), + ( + '

foo

', + '

foo

' + ), + ( + '

foo

', + '

foo

' + ), + ( + '

foo

', + '

foo

' + ), + + # Verify urls with character entities--this isn't valid, so the entire + # property is dropped + ( + '

foo

', + '

foo

' + ), + +]) +def test_urls(data, expected): + assert clean(data, styles=['background']) == expected + + def test_style_hang(): """The sanitizer should not hang on any inline styles""" style = [ From 28e7c3292bded1e91d194117e7d4d93ce855d698 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 16 Mar 2018 11:34:38 -0400 Subject: [PATCH 152/314] Handle ambiguous ampersands correctly This fixes the ambiguous ampersand case in character entity handling in attribute values. Fixes #359 --- bleach/sanitizer.py | 24 ++++++++++++++++-------- tests/test_clean.py | 31 +++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 09cae199..12225efd 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -90,7 +90,8 @@ def convert_entity(value): :arg value: the string (minus the ``&`` and ``;`` part) to convert - :returns: unicode character + :returns: unicode character or None if it's an ambiguous ampersand that + doesn't match a character entity """ if value[0] == '#': @@ -98,7 +99,7 @@ def convert_entity(value): return six.unichr(int(value[2:], 16)) return six.unichr(int(value[1:], 10)) - return ENTITIES[value] + return ENTITIES.get(value, None) def convert_entities(text): @@ -120,11 +121,16 @@ def convert_entities(text): if part.startswith('&'): entity = match_entity(part) if entity is not None: - new_text.append(convert_entity(entity)) - remainder = part[len(entity) + 2:] - if part: - new_text.append(remainder) - continue + converted = convert_entity(entity) + + # If it's not an ambiguous ampersand, then replace with the + # unicode character. Otherwise, we leave the entity in. 
+ if converted is not None: + new_text.append(converted) + remainder = part[len(entity) + 2:] + if part: + new_text.append(remainder) + continue new_text.append(part) @@ -731,7 +737,9 @@ def escape_base_amp(self, stoken): if part.startswith('&'): entity = match_entity(part) - if entity is not None: + # Only leave entities in that are not ambiguous. If they're + # ambiguous, then we escape the ampersand. + if entity is not None and convert_entity(entity) is not None: yield '&' + entity + ';' # Length of the entity plus 2--one for & at the beginning diff --git a/tests/test_clean.py b/tests/test_clean.py index f680e8e1..1f3cbfc8 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -4,7 +4,7 @@ import pytest from bleach import clean -from bleach.sanitizer import Cleaner +from bleach.sanitizer import convert_entities, Cleaner def test_clean_idempotent(): @@ -246,7 +246,7 @@ def test_bare_entities_get_escaped_correctly(text, expected): 'http://example.com?active=true&current=true' ), - # Test entities in HTML attributes + # Test character entities in attribute values are left alone ( 'foo', 'foo' @@ -255,11 +255,20 @@ def test_bare_entities_get_escaped_correctly(text, expected): 'foo', 'foo' ), + + # Ambiguous ampersands get escaped in attributes + ( + 'foo', + 'foo' + ), ( 'foo', 'foo' ), + # Ambiguous ampersands in text are not escaped + ('&xx;', '&xx;'), + # Test numeric entities (''', '''), ('"', '"'), @@ -732,6 +741,24 @@ def test_invisible_characters(data, expected): assert clean(data) == expected +@pytest.mark.parametrize('data, expected', [ + # Strings without character entities pass through as is + ('', ''), + ('abc', 'abc'), + + # Handles character entities--both named and numeric + (' ', u'\xa0'), + (' ', ' '), + (' ', ' '), + + # Handles ambiguous ampersand + ('&xx;', '&xx;'), +]) +def test_convert_entities(data, expected): + print(repr(convert_entities(data))) + assert convert_entities(data) == expected + + def get_tests(): """Retrieves regression 
tests from data/ directory From 9818ffb81a362f4d141835a291225c2e65706ae2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sun, 18 Mar 2018 09:02:59 -0400 Subject: [PATCH 153/314] Add regression test with character entity in url --- tests/data/6.test | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tests/data/6.test diff --git a/tests/data/6.test b/tests/data/6.test new file mode 100644 index 00000000..7755c813 --- /dev/null +++ b/tests/data/6.test @@ -0,0 +1,3 @@ +hi +-- +hi From a65f5c8ea664abbd54b4c711ebd0ca26c3509b7e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 19 Mar 2018 14:39:15 -0400 Subject: [PATCH 154/314] Update CHANGES --- CHANGES | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/CHANGES b/CHANGES index 25789814..5a9d5f84 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,26 @@ Bleach changes ============== +Version 2.1.4 (In development) +------------------------------ + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +* Handle ambiguous ampersands in correctly. (#359) + + Version 2.1.3 (March 5th, 2018) ------------------------------- @@ -14,6 +34,7 @@ Version 2.1.3 (March 5th, 2018) This security issue was introduced in Bleach 2.1. Anyone using Bleach 2.1 is highly encouraged to upgrade. + https://bugzilla.mozilla.org/show_bug.cgi?id=1442745 **Backwards incompatible changes** From 3f2270e42582d8f2d7392a54edff997b8675c797 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 19 Mar 2018 17:46:31 -0400 Subject: [PATCH 155/314] Handle nonexistent namespaces better Issue 352 has a string that manages to tokenize an html attribute with a namespace, but no name. Then the namespace doesn't exist in prefixes and that throws a KeyError. This alleviates that a bit such that if there's a namespace, but no name, it swaps the two values. Further, if prefixes doesn't have the namespace, then it ignores the namespace. 
Fixes #352 --- bleach/sanitizer.py | 14 +++++++++++++- tests/test_clean.py | 13 +++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 12225efd..faf8fd7a 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -668,8 +668,20 @@ def disallowed_token(self, token): assert token_type in ("StartTag", "EmptyTag") attrs = [] for (ns, name), v in token["data"].items(): + # If we end up with a namespace, but no name, switch them so we + # have a valid name to use. + if ns and not name: + ns, name = name, ns + + # Figure out namespaced name if the namespace is appropriate + # and exists; if the ns isn't in prefixes, then drop it. + if ns is None or ns not in prefixes: + namespaced_name = name + else: + namespaced_name = '%s:%s' % (prefixes[ns], name) + attrs.append(' %s="%s"' % ( - name if ns is None else "%s:%s" % (prefixes[ns], name), + namespaced_name, # NOTE(willkg): HTMLSerializer escapes attribute values # already, so if we do it here (like HTMLSerializer does), # then we end up double-escaping. diff --git a/tests/test_clean.py b/tests/test_clean.py index 1f3cbfc8..9547d631 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -759,6 +759,19 @@ def test_convert_entities(data, expected): assert convert_entities(data) == expected +def test_nonexistent_namespace(): + """Verify if the namespace doesn't exist, it doesn't fail with a KeyError + + The tokenizer creates "c" as a namespace and that doesn't exist in the map + of namespaces, so then it fails with a KeyError. I don't understand why the + tokenizer makes "c" into a namespace in this string. + + Issue #352. 
+ + """ + assert clean('') == '<d c=""></d>' + + def get_tests(): """Retrieves regression tests from data/ directory From 46fa500e2b3275af09e888feb495d1fcd541fb00 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 21 Mar 2018 21:16:19 -0400 Subject: [PATCH 156/314] Convert entities in CSS values before sanitizing The CSS is in an HTML attribute value, so we need to convert character entities in it which makes it proper CSS before we can sanitize it. Fixes #363 --- bleach/sanitizer.py | 5 ++++- tests/test_css.py | 22 +++++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index faf8fd7a..7e5d0361 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -702,7 +702,10 @@ def disallowed_token(self, token): def sanitize_css(self, style): """Sanitizes css in style tags""" - # Drop any url values + # Convert entities in the style so that it can be parsed as CSS + style = convert_entities(style) + + # Drop any url values before we do anything else style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) # The gauntlet of sanitization diff --git a/tests/test_css.py b/tests/test_css.py index ad81f594..12f27f3c 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -127,11 +127,10 @@ def test_valid_css(): '

foo

' ), - # Verify urls with character entities--this isn't valid, so the entire - # property is dropped + # Verify urls with character entities ( '

foo

', - '

foo

' + '

foo

' ), ]) @@ -201,3 +200,20 @@ def test_style_hang(): ) assert clean(html, styles=styles) == expected + + +@pytest.mark.parametrize('data, styles, expected', [ + ( + '

text

', + ['font-family', 'white-space'], + '

text

' + ), + ( + '

text

', + ['font-family', 'white-space'], + '

text

' + ), +]) +def test_css_parsing_with_entities(data, styles, expected): + """The sanitizer should be ok with character entities""" + assert clean(data, tags=['p'], attributes={'p': ['style']}, styles=styles) == expected From f1f04f6580e24bd1b977b8be0a1bc1e5d5f944da Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 7 Jun 2018 13:59:46 -0400 Subject: [PATCH 157/314] Nix pinning in dev requirements --- requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5cfec7f1..758459aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ -e . -# Requirements to run the test suite: -pytest==3.0.6 +# Requirements to run the test suite +pytest pytest-wholenodeid -flake8==3.3.0 -tox==2.4.1 +flake8 +tox # Requirements for building docs -Sphinx==1.5.2 +Sphinx # Requirements for updating package -twine==1.8.1 +twine From 8f6c2ea0b1155716ced070d87dc2c9d4f664ddcb Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 7 Jun 2018 14:08:11 -0400 Subject: [PATCH 158/314] Change requirements.txt to requirements-dev.txt This change makes it clearer what the file is for. --- MANIFEST.in | 2 +- docs/dev.rst | 2 +- requirements.txt => requirements-dev.txt | 0 tox.ini | 6 +++--- 4 files changed, 5 insertions(+), 5 deletions(-) rename requirements.txt => requirements-dev.txt (100%) diff --git a/MANIFEST.in b/MANIFEST.in index 14ad79c7..5a0f3385 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,7 +2,7 @@ include CHANGES include CONTRIBUTORS include CONTRIBUTING.rst include CODE_OF_CONDUCT.rst -include requirements.txt +include requirements-dev.txt include tox.ini include LICENSE include README.rst diff --git a/docs/dev.rst b/docs/dev.rst index b0302524..abeaf913 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -50,7 +50,7 @@ Release process 1. Checkout master tip. -2. Check to make sure ``setup.py`` and ``requirements.txt`` are +2. 
Check to make sure ``setup.py`` and ``requirements-dev.txt`` are correct and match requirements-wise. 3. Update version numbers in ``bleach/__init__.py``. diff --git a/requirements.txt b/requirements-dev.txt similarity index 100% rename from requirements.txt rename to requirements-dev.txt diff --git a/tox.ini b/tox.ini index d44521c9..c58bd532 100644 --- a/tox.ini +++ b/tox.ini @@ -18,7 +18,7 @@ basepython = py35: python3.5 py36: python3.6 deps = - -rrequirements.txt + -rrequirements-dev.txt html5lib99999999: html5lib==0.99999999 html5lib999999999: html5lib==0.999999999 html5lib10b9: html5lib==1.0b9 @@ -59,7 +59,7 @@ commands = [testenv:lint] basepython = python deps = - -rrequirements.txt + -rrequirements-dev.txt commands = flake8 bleach/ @@ -67,6 +67,6 @@ commands = basepython = python changedir = docs deps = - -rrequirements.txt + -rrequirements-dev.txt commands = sphinx-build -b html -d {envtmpdir}/doctrees . {envtmpdir}/html From 63076f4420498571027bb853703f06b3bfd469ff Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 7 Jun 2018 14:33:19 -0400 Subject: [PATCH 159/314] Fix lint and docs tox environments --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index c58bd532..a5538a7b 100644 --- a/tox.ini +++ b/tox.ini @@ -57,14 +57,14 @@ commands = python setup.py build [testenv:lint] -basepython = python +basepython = python3.6 deps = -rrequirements-dev.txt commands = flake8 bleach/ [testenv:docs] -basepython = python +basepython = python3.6 changedir = docs deps = -rrequirements-dev.txt From 9959a1a57c1574806e24ea29209af882c5bdbd95 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 7 Jun 2018 14:35:53 -0400 Subject: [PATCH 160/314] Update CHANGES re: Python 3.3 support --- CHANGES | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 5a9d5f84..5a01cd8a 100644 --- a/CHANGES +++ b/CHANGES @@ -10,7 +10,7 @@ None **Backwards incompatible changes** -None +* Dropped 
support for Python 3.3. (#328) **Features** From b8aae5660693f4d30d76a0b8e7525af1adcbc3cc Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 7 Jun 2018 15:13:27 -0400 Subject: [PATCH 161/314] Fix requirements file name in travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b6eea407..dfecccf7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,7 @@ env: install: # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5 - pip install -U pip setuptools>=18.5 - - pip install -r requirements.txt + - pip install -r requirements-dev.txt # stomp on html5lib install with the specified one - pip install html5lib==$HTML5LIB script: From 9319ec77a06c582bd5e7726c0b3c69139ad67732 Mon Sep 17 00:00:00 2001 From: Antoine Leclair Date: Fri, 29 Jun 2018 13:12:31 -0400 Subject: [PATCH 162/314] Fix error when parsing invalid URI --- CONTRIBUTORS | 1 + bleach/sanitizer.py | 11 ++++++++--- tests/test_clean.py | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 94276246..5783ab17 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -25,6 +25,7 @@ Contributors: - Alireza Savand - Andreas Malecki - Andy Freeland +- Antoine Leclair - Anton Kovalyov - Chris Beaven - Dan Gayle diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 7e5d0361..31f12400 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -576,9 +576,14 @@ def sanitize_uri_value(self, value, allowed_protocols): # against new_value = new_value.lower() - # Drop attributes with uri values that have protocols that aren't - # allowed - parsed = urlparse(new_value) + try: + # Drop attributes with uri values that have protocols that aren't + # allowed + parsed = urlparse(new_value) + except ValueError: + # URI is impossible to parse, therefore it's not allowed + return None + if parsed.scheme: # If urlparse found a scheme, check that if parsed.scheme in allowed_protocols: diff --git 
a/tests/test_clean.py b/tests/test_clean.py index 9547d631..951d5b2a 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -58,6 +58,9 @@ def test_html_is_lowercased(): 'foo' ) +def test_invalid_uri_does_not_raise_error(): + assert clean('text') == 'text' + @pytest.mark.parametrize('data, should_strip, expected', [ # Regular comment From 8f88b41810ef82f5a1204e45ad8d6c9329b0c0b1 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 19 Jul 2018 20:56:38 -0400 Subject: [PATCH 163/314] Sync travis and tox environments This makes sure travis and tox are testing Bleach with the same configurations. --- .travis.yml | 22 +++++++++++++--------- tox.ini | 2 ++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index dfecccf7..cd05d9aa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,17 +1,21 @@ +# Note: If you update this, make sure to update tox.ini, too. sudo: false language: python cache: directories: - "~/.cache/pip" python: -- "2.7" -- "3.4" -- "3.5" -- "3.6" -- "pypy" + - "2.7" + - "3.4" + - "3.5" + - "3.6" + - "pypy" env: -- HTML5LIB=0.99999999 # 8 -- HTML5LIB=0.999999999 # 9 + - HTML5LIB=0.99999999 # 8 + - HTML5LIB=0.999999999 # 9 + - HTML5LIB=1.0b9 + - HTML5LIB=1.0b10 + - HTML5LIB=1.0.1 install: # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5 - pip install -U pip setuptools>=18.5 @@ -19,8 +23,8 @@ install: # stomp on html5lib install with the specified one - pip install html5lib==$HTML5LIB script: -- py.test -- flake8 bleach/ + - py.test + - flake8 bleach/ deploy: provider: pypi user: jezdez diff --git a/tox.ini b/tox.ini index a5538a7b..d5539644 100644 --- a/tox.ini +++ b/tox.ini @@ -3,6 +3,8 @@ # test suite on all supported python versions. To use it, "pip install tox" # and then run "tox" from this directory. +# Note: If you update this, make sure to update .travis.yml, too. 
+ [tox] envlist = py{27,34,35,36}-html5lib{99999999,999999999,10b9,10b10,101} From 9960da4ddd777627fc39d8c1f4a36923102af06d Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 16 Aug 2018 16:30:33 -0400 Subject: [PATCH 164/314] Update for v2.1.4 release --- CHANGES | 4 ++-- bleach/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 5a01cd8a..fd17745b 100644 --- a/CHANGES +++ b/CHANGES @@ -1,8 +1,8 @@ Bleach changes ============== -Version 2.1.4 (In development) ------------------------------- +Version 2.1.4 (August 16th, 2018) +--------------------------------- **Security fixes** diff --git a/bleach/__init__.py b/bleach/__init__.py index b81b0bbe..d0d84029 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -33,9 +33,9 @@ # yyyymmdd -__releasedate__ = '20180305' +__releasedate__ = '20180816' # x.y.z or x.y.z.dev0 -- semver -__version__ = '2.1.3' +__version__ = '2.1.4' VERSION = parse_version(__version__) From ff6e5c53d8888570f06d905cf31f2132b3b946a6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 16 Aug 2018 16:47:58 -0400 Subject: [PATCH 165/314] Update for 2.1.5 development --- CHANGES | 20 ++++++++++++++++++++ bleach/__init__.py | 4 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/CHANGES b/CHANGES index fd17745b..ddd3e0a2 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,26 @@ Bleach changes ============== +Version 2.1.5 (in development) +------------------------------ + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +None + + Version 2.1.4 (August 16th, 2018) --------------------------------- diff --git a/bleach/__init__.py b/bleach/__init__.py index d0d84029..367fbf42 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -33,9 +33,9 @@ # yyyymmdd -__releasedate__ = '20180816' +__releasedate__ = '' # x.y.z or x.y.z.dev0 -- semver -__version__ = '2.1.4' +__version__ = '2.1.5.dev0' VERSION = 
parse_version(__version__) From a507a4ed7e37cd594b8af5b4722bd6b058e9c2c2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 27 Aug 2018 09:08:47 -0400 Subject: [PATCH 166/314] Drop easy_install instructions Fixes #373 --- README.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.rst b/README.rst index 5f151dc7..6622ee46 100644 --- a/README.rst +++ b/README.rst @@ -58,10 +58,6 @@ Bleach is available on PyPI_, so you can install it with ``pip``:: $ pip install bleach -Or with ``easy_install``:: - - $ easy_install bleach - Upgrading Bleach ================ From 7970857c78bec0060f527277a91a8ca72aaabe8d Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sun, 26 Aug 2018 15:21:35 -0400 Subject: [PATCH 167/314] vendor html5lib 1.0.1 This vendors html5lib 1.0.1 and in doing that, drops the requirement to install html5lib. Fixes #386 --- .gitignore | 6 +- CHANGES | 3 +- MANIFEST.in | 3 +- bleach/__init__.py | 15 - bleach/_vendor/README.rst | 21 + bleach/_vendor/__init__.py | 0 .../html5lib-1.0.1.dist-info/DESCRIPTION.rst | 489 +++ .../html5lib-1.0.1.dist-info/INSTALLER | 1 + .../html5lib-1.0.1.dist-info/LICENSE.txt | 20 + .../_vendor/html5lib-1.0.1.dist-info/METADATA | 530 +++ .../_vendor/html5lib-1.0.1.dist-info/RECORD | 42 + bleach/_vendor/html5lib-1.0.1.dist-info/WHEEL | 6 + .../html5lib-1.0.1.dist-info/metadata.json | 1 + .../html5lib-1.0.1.dist-info/top_level.txt | 1 + bleach/_vendor/html5lib/__init__.py | 35 + bleach/_vendor/html5lib/_ihatexml.py | 288 ++ bleach/_vendor/html5lib/_inputstream.py | 923 ++++++ bleach/_vendor/html5lib/_tokenizer.py | 1721 ++++++++++ bleach/_vendor/html5lib/_trie/__init__.py | 14 + bleach/_vendor/html5lib/_trie/_base.py | 37 + bleach/_vendor/html5lib/_trie/datrie.py | 44 + bleach/_vendor/html5lib/_trie/py.py | 67 + bleach/_vendor/html5lib/_utils.py | 124 + bleach/_vendor/html5lib/constants.py | 2947 +++++++++++++++++ bleach/_vendor/html5lib/filters/__init__.py | 0 .../filters/alphabeticalattributes.py | 29 + 
bleach/_vendor/html5lib/filters/base.py | 12 + .../html5lib/filters/inject_meta_charset.py | 73 + bleach/_vendor/html5lib/filters/lint.py | 93 + .../_vendor/html5lib/filters/optionaltags.py | 207 ++ bleach/_vendor/html5lib/filters/sanitizer.py | 896 +++++ bleach/_vendor/html5lib/filters/whitespace.py | 38 + bleach/_vendor/html5lib/html5parser.py | 2791 ++++++++++++++++ bleach/_vendor/html5lib/serializer.py | 409 +++ .../_vendor/html5lib/treeadapters/__init__.py | 30 + .../_vendor/html5lib/treeadapters/genshi.py | 54 + bleach/_vendor/html5lib/treeadapters/sax.py | 50 + .../_vendor/html5lib/treebuilders/__init__.py | 88 + bleach/_vendor/html5lib/treebuilders/base.py | 417 +++ bleach/_vendor/html5lib/treebuilders/dom.py | 236 ++ bleach/_vendor/html5lib/treebuilders/etree.py | 340 ++ .../html5lib/treebuilders/etree_lxml.py | 366 ++ .../_vendor/html5lib/treewalkers/__init__.py | 154 + bleach/_vendor/html5lib/treewalkers/base.py | 252 ++ bleach/_vendor/html5lib/treewalkers/dom.py | 43 + bleach/_vendor/html5lib/treewalkers/etree.py | 130 + .../html5lib/treewalkers/etree_lxml.py | 213 ++ bleach/_vendor/html5lib/treewalkers/genshi.py | 69 + bleach/_vendor/pip_install_vendor.sh | 4 + bleach/_vendor/vendor.txt | 1 + bleach/linkifier.py | 8 +- bleach/sanitizer.py | 20 +- setup.cfg | 4 + setup.py | 5 +- tests/test_clean.py | 2 +- tests/test_linkify.py | 5 +- tox.ini | 9 +- 57 files changed, 14336 insertions(+), 50 deletions(-) create mode 100644 bleach/_vendor/README.rst create mode 100644 bleach/_vendor/__init__.py create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/INSTALLER create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/LICENSE.txt create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/METADATA create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/RECORD create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/WHEEL create mode 100644 
bleach/_vendor/html5lib-1.0.1.dist-info/metadata.json create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/top_level.txt create mode 100644 bleach/_vendor/html5lib/__init__.py create mode 100644 bleach/_vendor/html5lib/_ihatexml.py create mode 100644 bleach/_vendor/html5lib/_inputstream.py create mode 100644 bleach/_vendor/html5lib/_tokenizer.py create mode 100644 bleach/_vendor/html5lib/_trie/__init__.py create mode 100644 bleach/_vendor/html5lib/_trie/_base.py create mode 100644 bleach/_vendor/html5lib/_trie/datrie.py create mode 100644 bleach/_vendor/html5lib/_trie/py.py create mode 100644 bleach/_vendor/html5lib/_utils.py create mode 100644 bleach/_vendor/html5lib/constants.py create mode 100644 bleach/_vendor/html5lib/filters/__init__.py create mode 100644 bleach/_vendor/html5lib/filters/alphabeticalattributes.py create mode 100644 bleach/_vendor/html5lib/filters/base.py create mode 100644 bleach/_vendor/html5lib/filters/inject_meta_charset.py create mode 100644 bleach/_vendor/html5lib/filters/lint.py create mode 100644 bleach/_vendor/html5lib/filters/optionaltags.py create mode 100644 bleach/_vendor/html5lib/filters/sanitizer.py create mode 100644 bleach/_vendor/html5lib/filters/whitespace.py create mode 100644 bleach/_vendor/html5lib/html5parser.py create mode 100644 bleach/_vendor/html5lib/serializer.py create mode 100644 bleach/_vendor/html5lib/treeadapters/__init__.py create mode 100644 bleach/_vendor/html5lib/treeadapters/genshi.py create mode 100644 bleach/_vendor/html5lib/treeadapters/sax.py create mode 100644 bleach/_vendor/html5lib/treebuilders/__init__.py create mode 100644 bleach/_vendor/html5lib/treebuilders/base.py create mode 100644 bleach/_vendor/html5lib/treebuilders/dom.py create mode 100644 bleach/_vendor/html5lib/treebuilders/etree.py create mode 100644 bleach/_vendor/html5lib/treebuilders/etree_lxml.py create mode 100644 bleach/_vendor/html5lib/treewalkers/__init__.py create mode 100644 bleach/_vendor/html5lib/treewalkers/base.py 
create mode 100644 bleach/_vendor/html5lib/treewalkers/dom.py create mode 100644 bleach/_vendor/html5lib/treewalkers/etree.py create mode 100644 bleach/_vendor/html5lib/treewalkers/etree_lxml.py create mode 100644 bleach/_vendor/html5lib/treewalkers/genshi.py create mode 100755 bleach/_vendor/pip_install_vendor.sh create mode 100644 bleach/_vendor/vendor.txt diff --git a/.gitignore b/.gitignore index 26bbdf8e..c4abbd13 100644 --- a/.gitignore +++ b/.gitignore @@ -4,10 +4,14 @@ pip-log.txt .coverage dist *.egg-info -.noseids build .tox docs/_build/ .cache/ .eggs/ .*env*/ +.pytest_cache/ +.python-version +*~ +*.swp +__pycache__ diff --git a/CHANGES b/CHANGES index ddd3e0a2..423c1ecb 100644 --- a/CHANGES +++ b/CHANGES @@ -14,7 +14,8 @@ None **Features** -None +* No longer depends on html5lib. html5lib==1.0.1 was vendored into Bleach. + (#386) **Bug fixes** diff --git a/MANIFEST.in b/MANIFEST.in index 5a0f3385..2a85593e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -10,8 +10,7 @@ include README.rst include docs/conf.py include docs/Makefile +recursive-include bleach *.py *.json *.rst *.sh *.txt INSTALLER METADATA RECORD WHEEL recursive-include docs *.rst - recursive-include tests *.py *.test - recursive-include tests_website *.html *.py *.rst diff --git a/bleach/__init__.py b/bleach/__init__.py index 367fbf42..f953fc51 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals -import warnings from pkg_resources import parse_version from bleach.linkifier import ( @@ -18,20 +17,6 @@ ) -import html5lib -try: - _html5lib_version = html5lib.__version__.split('.') - if len(_html5lib_version) < 2: - _html5lib_version = _html5lib_version + ['0'] -except Exception: - _h5ml5lib_version = ['unknown', 'unknown'] - - -# Bleach 3.0.0 won't support html5lib-python < 1.0.0. 
-if _html5lib_version < ['1', '0'] or 'b' in _html5lib_version[1]: - warnings.warn('Support for html5lib-python < 1.0.0 is deprecated.', DeprecationWarning) - - # yyyymmdd __releasedate__ = '' # x.y.z or x.y.z.dev0 -- semver diff --git a/bleach/_vendor/README.rst b/bleach/_vendor/README.rst new file mode 100644 index 00000000..41c1d13e --- /dev/null +++ b/bleach/_vendor/README.rst @@ -0,0 +1,21 @@ +======================= +Vendored library policy +======================= + +To simplify Bleach development, we're now vendoring certain libraries that +we use. + +Vendored libraries must follow these rules: + +1. Vendored libraries must be pure Python--no compiling. +2. Source code for the libary is included in this directory. +3. License must be included in this repo and in the Bleach distribution. +4. Requirements of the library become requirements of Bleach. +5. No modifications to the library may be made. + +Way to vendor a library or update a version: + +1. Update ``vendor.txt`` with the library and version. +2. Remove old files and directories. +3. Run ``pip_install_vendor.sh`` and check everything it produced in including + the ``.dist-info`` directory and contents. diff --git a/bleach/_vendor/__init__.py b/bleach/_vendor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst b/bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst new file mode 100644 index 00000000..c05f8c00 --- /dev/null +++ b/bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst @@ -0,0 +1,489 @@ +html5lib +======== + +.. image:: https://travis-ci.org/html5lib/html5lib-python.png?branch=master + :target: https://travis-ci.org/html5lib/html5lib-python + +html5lib is a pure-python library for parsing HTML. It is designed to +conform to the WHATWG HTML specification, as is implemented by all major +web browsers. + + +Usage +----- + +Simple usage follows this pattern: + +.. 
code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + document = html5lib.parse(f) + +or: + +.. code-block:: python + + import html5lib + document = html5lib.parse("

Hello World!") + +By default, the ``document`` will be an ``xml.etree`` element instance. +Whenever possible, html5lib chooses the accelerated ``ElementTree`` +implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x). + +Two other tree types are supported: ``xml.dom.minidom`` and +``lxml.etree``. To use an alternative format, specify the name of +a treebuilder: + +.. code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + lxml_etree_document = html5lib.parse(f, treebuilder="lxml") + +When using with ``urllib2`` (Python 2), the charset from HTTP should be +pass into html5lib as follows: + +.. code-block:: python + + from contextlib import closing + from urllib2 import urlopen + import html5lib + + with closing(urlopen("http://example.com/")) as f: + document = html5lib.parse(f, transport_encoding=f.info().getparam("charset")) + +When using with ``urllib.request`` (Python 3), the charset from HTTP +should be pass into html5lib as follows: + +.. code-block:: python + + from urllib.request import urlopen + import html5lib + + with urlopen("http://example.com/") as f: + document = html5lib.parse(f, transport_encoding=f.info().get_content_charset()) + +To have more control over the parser, create a parser object explicitly. +For instance, to make the parser raise exceptions on parse errors, use: + +.. code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + parser = html5lib.HTMLParser(strict=True) + document = parser.parse(f) + +When you're instantiating parser objects explicitly, pass a treebuilder +class as the ``tree`` keyword argument to use an alternative document +format: + +.. code-block:: python + + import html5lib + parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom")) + minidom_document = parser.parse("

Hello World!") + +More documentation is available at https://html5lib.readthedocs.io/. + + +Installation +------------ + +html5lib works on CPython 2.7+, CPython 3.3+ and PyPy. To install it, +use: + +.. code-block:: bash + + $ pip install html5lib + + +Optional Dependencies +--------------------- + +The following third-party libraries may be used for additional +functionality: + +- ``datrie`` can be used under CPython to improve parsing performance + (though in almost all cases the improvement is marginal); + +- ``lxml`` is supported as a tree format (for both building and + walking) under CPython (but *not* PyPy where it is known to cause + segfaults); + +- ``genshi`` has a treewalker (but not builder); and + +- ``chardet`` can be used as a fallback when character encoding cannot + be determined. + + +Bugs +---- + +Please report any bugs on the `issue tracker +`_. + + +Tests +----- + +Unit tests require the ``pytest`` and ``mock`` libraries and can be +run using the ``py.test`` command in the root directory. + +Test data are contained in a separate `html5lib-tests +`_ repository and included +as a submodule, thus for git checkouts they must be initialized:: + + $ git submodule init + $ git submodule update + +If you have all compatible Python implementations available on your +system, you can run tests on all of them using the ``tox`` utility, +which can be found on PyPI. + + +Questions? +---------- + +There's a mailing list available for support on Google Groups, +`html5lib-discuss `_, +though you may get a quicker response asking on IRC in `#whatwg on +irc.freenode.net `_. + +Change Log +---------- + +1.0.1 +~~~~~ + +Released on December 7, 2017 + +Breaking changes: + +* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!) +* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!) + +Features: + +* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most, + Will Kahn-Greene!) +* Add iframe seamless boolean attribute. 
(Thank you, Ritwik Gupta!) +* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!) +* Support Python 3.6. (#333) (Thank you, Jon Dufresne!) +* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!) +* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon + Dufresne, John Vandenberg, Geoffrey Sneddon, Will Kahn-Greene!) +* Semver-compliant version number. + +Bug fixes: + +* Add support for setuptools < 18.5 to support environment markers. (Thank you, + John Vandenberg!) +* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!) +* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank + you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!) +* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will + Kahn-Greene!) +* Include license file in generated wheel package. (#350) (Thank you, Jon + Dufresne!) +* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!) +* Allow uppercase hex chararcters in CSS colour check. (#377) (Thank you, + Komal Dembla, Hugo!) + + +1.0 +~~~ + +Released and unreleased on December 7, 2017. Badly packaged release. + + +0.999999999/1.0b10 +~~~~~~~~~~~~~~~~~~ + +Released on July 15, 2016 + +* Fix attribute order going to the tree builder to be document order + instead of reverse document order(!). + + +0.99999999/1.0b9 +~~~~~~~~~~~~~~~~ + +Released on July 14, 2016 + +* **Added ordereddict as a mandatory dependency on Python 2.6.** + +* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all`` + extras that will do the right thing based on the specific + interpreter implementation. + +* Now requires the ``mock`` package for the testsuite. + +* Cease supporting DATrie under PyPy. + +* **Remove PullDOM support, as this hasn't ever been properly + tested, doesn't entirely work, and as far as I can tell is + completely unused by anyone.** + +* Move testsuite to ``py.test``. 
+ +* **Fix #124: move to webencodings for decoding the input byte stream; + this makes html5lib compliant with the Encoding Standard, and + introduces a required dependency on webencodings.** + +* **Cease supporting Python 3.2 (in both CPython and PyPy forms).** + +* **Fix comments containing double-dash with lxml 3.5 and above.** + +* **Use scripting disabled by default (as we don't implement + scripting).** + +* **Fix #11, avoiding the XSS bug potentially caused by serializer + allowing attribute values to be escaped out of in old browser versions, + changing the quote_attr_values option on serializer to take one of + three values, "always" (the old True value), "legacy" (the new option, + and the new default), and "spec" (the old False value, and the old + default).** + +* **Fix #72 by rewriting the sanitizer to apply only to treewalkers + (instead of the tokenizer); as such, this will require amending all + callers of it to use it via the treewalker API.** + +* **Drop support of charade, now that chardet is supported once more.** + +* **Replace the charset keyword argument on parse and related methods + with a set of keyword arguments: override_encoding, transport_encoding, + same_origin_parent_encoding, likely_encoding, and default_encoding.** + +* **Move filters._base, treebuilder._base, and treewalkers._base to .base + to clarify their status as public.** + +* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the + sanitizer.htmlsanitizer module and move that to sanitizer. This means + anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no + code changes.** + +* **Rename treewalkers.lxmletree to .etree_lxml and + treewalkers.genshistream to .genshi to have a consistent API.** + +* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer, + utils) to be underscore prefixed to clarify their status as private. 
+ + +0.9999999/1.0b8 +~~~~~~~~~~~~~~~ + +Released on September 10, 2015 + +* Fix #195: fix the sanitizer to drop broken URLs (it threw an + exception between 0.9999 and 0.999999). + + +0.999999/1.0b7 +~~~~~~~~~~~~~~ + +Released on July 7, 2015 + +* Fix #189: fix the sanitizer to allow relative URLs again (as it did + prior to 0.9999/1.0b5). + + +0.99999/1.0b6 +~~~~~~~~~~~~~ + +Released on April 30, 2015 + +* Fix #188: fix the sanitizer to not throw an exception when sanitizing + bogus data URLs. + + +0.9999/1.0b5 +~~~~~~~~~~~~ + +Released on April 29, 2015 + +* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how + this sounds, this has no known security implications. No known version + of IE (5.5 to current), Firefox (3 to current), Safari (6 to current), + Chrome (1 to current), or Opera (12 to current) will run any script + provided in these attributes. + +* Pass error message to the ParseError exception in strict parsing mode. + +* Allow data URIs in the sanitizer, with a whitelist of content-types. + +* Add support for Python implementations that don't support lone + surrogates (read: Jython). Fixes #2. + +* Remove localization of error messages. This functionality was totally + unused (and untested that everything was localizable), so we may as + well follow numerous browsers in not supporting translating technical + strings. + +* Expose treewalkers.pprint as a public API. + +* Add a documentEncoding property to HTML5Parser, fix #121. + + +0.999 +~~~~~ + +Released on December 23, 2013 + +* Fix #127: add work-around for CPython issue #20007: .read(0) on + http.client.HTTPResponse drops the rest of the content. + +* Fix #115: lxml treewalker can now deal with fragments containing, at + their root level, text nodes with non-ASCII characters on Python 2. 
+ + +0.99 +~~~~ + +Released on September 10, 2013 + +* No library changes from 1.0b3; released as 0.99 as pip has changed + behaviour from 1.4 to avoid installing pre-release versions per + PEP 440. + + +1.0b3 +~~~~~ + +Released on July 24, 2013 + +* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any + implementation using it should be moved to + ``NonRecursiveTreeWalker``, as everything bundled with html5lib has + for years. + +* Fix #67 so that ``BufferedStream`` to correctly returns a bytes + object, thereby fixing any case where html5lib is passed a + non-seekable RawIOBase-like object. + + +1.0b2 +~~~~~ + +Released on June 27, 2013 + +* Removed reordering of attributes within the serializer. There is now + an ``alphabetical_attributes`` option which preserves the previous + behaviour through a new filter. This allows attribute order to be + preserved through html5lib if the tree builder preserves order. + +* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by + ``treeadapters.sax.to_sax`` which is generic and supports any + treewalker; it also resolves all known bugs with ``dom2sax``. + +* Fix treewalker assertions on hitting bytes strings on + Python 2. Previous to 1.0b1, treewalkers coped with mixed + bytes/unicode data on Python 2; this reintroduces this prior + behaviour on Python 2. Behaviour is unchanged on Python 3. + + +1.0b1 +~~~~~ + +Released on May 17, 2013 + +* Implementation updated to implement the `HTML specification + `_ as of 5th May + 2013 (`SVN `_ revision r7867). + +* Python 3.2+ supported in a single codebase using the ``six`` library. + +* Removed support for Python 2.5 and older. + +* Removed the deprecated Beautiful Soup 3 treebuilder. + ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that + since it doesn't support namespaces, foreign content like SVG and + MathML is parsed incorrectly. + +* Removed ``simpletree`` from the package. 
The default tree builder is + now ``etree`` (using the ``xml.etree.cElementTree`` implementation if + available, and ``xml.etree.ElementTree`` otherwise). + +* Removed the ``XHTMLSerializer`` as it never actually guaranteed its + output was well-formed XML, and hence provided little of use. + +* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no + longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will + return the default DOM treebuilder, which uses ``xml.dom.minidom``. + +* Optional heuristic character encoding detection now based on + ``charade`` for Python 2.6 - 3.3 compatibility. + +* Optional ``Genshi`` treewalker support fixed. + +* Many bugfixes, including: + + * #33: null in attribute value breaks XML AttValue; + + * #4: nested, indirect descendant,