From d0b3af80c723a0407e6dad8e6b2070829aef8f07 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon test test test test link text link text multiply nested text multiply nested text multiply nested text bar bar foo foo foo foo foo foo foo foo tags should not be linkified when
skip_pre=True
Fixes #150
---
bleach/__init__.py | 2 +-
bleach/tests/test_links.py | 7 +++++++
2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/bleach/__init__.py b/bleach/__init__.py
index 3092cb7f..ac163d12 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -315,7 +315,7 @@ def linkify_nodes(tree, parse_text=True):
if node.tag == ETREE_TAG('pre') and skip_pre:
linkify_nodes(node, False)
elif not (node in _seen):
- linkify_nodes(node, True)
+ linkify_nodes(node, parse_text)
current_child += 1
diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py
index 62da8d19..2958f5e6 100644
--- a/bleach/tests/test_links.py
+++ b/bleach/tests/test_links.py
@@ -314,6 +314,13 @@ def test_skip_pre():
eq_(nofollowed, linkify(already_linked))
eq_(nofollowed, linkify(already_linked, skip_pre=True))
+ eq_(
+ linkify('
http://example.com',
+ skip_pre=True),
+ ('http://example.com
'
+ 'http://example.com')
+ )
+
def test_libgl():
"""libgl.so.1 should not be linkified."""
From c28b9e37ed659a588e49bf7bf1881ec4e6d7bc25 Mon Sep 17 00:00:00 2001
From: Jannis Leidel http://example.com
test',
+ assert (
bleach.clean('a
test',
- tags=TAGS, attributes=ATTRS))
+ tags=TAGS, attributes=ATTRS) ==
+ 'a
test'
+ )
def test_named_arguments():
@@ -65,73 +73,104 @@ def test_named_arguments():
s = ('xx.com',
'xx.com')
- eq_('xx.com', bleach.clean(s[0]))
- in_(s, bleach.clean(s[0], attributes=ATTRS))
+ assert bleach.clean(s[0]) == 'xx.com'
+ # FIXME: This might not be needed if attribute order is stable now.
+ assert bleach.clean(s[0], attributes=ATTRS) in s
def test_disallowed_html():
- eq_('a <script>safe()</script> test',
- bleach.clean('a test'))
- eq_('a <style>body{}</style> test',
- bleach.clean('a test'))
+ assert (
+ bleach.clean('a test') ==
+ 'a <script>safe()</script> test'
+ )
+ assert (
+ bleach.clean('a test') ==
+ 'a <style>body{}</style> test'
+ )
def test_bad_href():
- eq_('no link',
- bleach.clean('no link'))
+ assert (
+ bleach.clean('no link') ==
+ 'no link'
+ )
def test_bare_entities():
- eq_('an & entity', bleach.clean('an & entity'))
- eq_('an < entity', bleach.clean('an < entity'))
- eq_('tag < and entity',
- bleach.clean('tag < and entity'))
- eq_('&', bleach.clean('&'))
+ assert (
+ bleach.clean('an & entity') ==
+ 'an & entity'
+ )
+ assert (
+ bleach.clean('an < entity') ==
+ 'an < entity'
+ )
+
+ assert (
+ bleach.clean('tag < and entity') ==
+ 'tag < and entity'
+ )
+
+ assert (
+ bleach.clean('&') ==
+ '&'
+ )
def test_escaped_entities():
s = '<em>strong</em>'
- eq_(s, bleach.clean(s))
+ assert bleach.clean(s) == s
def test_serializer():
s = ''
- eq_(s, bleach.clean(s, tags=['table']))
- eq_('test
', bleach.linkify('
test
'))
- eq_('test
') == 'test'
+ assert bleach.clean('
'
- 'html tags', strip=True))
+ assert (
+ bleach.clean('a test with html tags', strip=True) ==
+ 'a test with html tags'
+ )
+ assert (
+ bleach.clean('a test with
html tags', strip=True) ==
+ 'a test with html tags'
+ )
s = ''
- eq_('
')
clean = ('both can have
',
'both can have
')
- in_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR))
+ assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean
def test_sarcasm():
"""Jokes should crash.
http://xx.com'
'')
- eq_(linked, linkify(simple, skip_pre=True))
- eq_(all_linked, linkify(simple))
+ assert linkify(simple, skip_pre=True) == linked
+ assert linkify(simple) == all_linked
already_linked = 'xx' nofollowed = '
xx' - eq_(nofollowed, linkify(already_linked)) - eq_(nofollowed, linkify(already_linked, skip_pre=True)) - - eq_( - linkify('
http://example.comhttp://example.com',
- skip_pre=True),
- ('http://example.com'
- 'http://example.com')
+ assert linkify(already_linked) == nofollowed
+ assert linkify(already_linked, skip_pre=True) == nofollowed
+
+ assert (
+ linkify('http://example.comhttp://example.com', skip_pre=True) ==
+ (
+ 'http://example.com'
+ 'http://example.com'
+ )
)
def test_libgl():
"""libgl.so.1 should not be linkified."""
- eq_('libgl.so.1', linkify('libgl.so.1'))
+ s = 'libgl.so.1'
+ assert linkify(s) == s
-def test_end_of_sentence():
+@pytest.mark.parametrize('url,periods', [
+ ('example.com', '.'),
+ ('example.com', '...'),
+ ('ex.com/foo', '.'),
+ ('ex.com/foo', '....'),
+])
+def test_end_of_sentence(url, periods):
"""example.com. should match."""
out = '{0!s}{1!s}'
intxt = '{0!s}{1!s}'
- def check(u, p):
- eq_(out.format(u, p),
- linkify(intxt.format(u, p)))
-
- tests = (
- ('example.com', '.'),
- ('example.com', '...'),
- ('ex.com/foo', '.'),
- ('ex.com/foo', '....'),
- )
-
- for u, p in tests:
- yield check, u, p
+ assert linkify(intxt.format(url, periods)) == out.format(url, periods)
def test_end_of_clause():
"""example.com/foo, shouldn't include the ,"""
- eq_('ex.com/foo, bar',
- linkify('ex.com/foo, bar'))
+ assert (
+ linkify('ex.com/foo, bar') ==
+ 'ex.com/foo, bar'
+ )
def test_sarcasm():
"""Jokes should crash.something
' callbacks = [lambda *a: None] - eq_(expect, - linkify('', callbacks=callbacks)) + assert ( + linkify('', callbacks=callbacks) == + 'something
' + ) From 530fcd283c9eab23a72739ae60e37acf16f23eec Mon Sep 17 00:00:00 2001 From: Will Kahn-GreeneHello world
'.format(style) + style = [ + 'margin-top: 0px;', + 'margin-right: 0px;', + 'margin-bottom: 1.286em;', + 'margin-left: 0px;', + 'padding-top: 15px;', + 'padding-right: 15px;', + 'padding-bottom: 15px;', + 'padding-left: 15px;', + 'border-top-width: 1px;', + 'border-right-width: 1px;', + 'border-bottom-width: 1px;', + 'border-left-width: 1px;', + 'border-top-style: dotted;', + 'border-right-style: dotted;', + 'border-bottom-style: dotted;', + 'border-left-style: dotted;', + 'border-top-color: rgb(203, 200, 185);', + 'border-right-color: rgb(203, 200, 185);', + 'border-bottom-color: rgb(203, 200, 185);', + 'border-left-color: rgb(203, 200, 185);', + 'background-image: initial;', + 'background-attachment: initial;', + 'background-origin: initial;', + 'background-clip: initial;', + 'background-color: rgb(246, 246, 242);', + 'overflow-x: auto;', + 'overflow-y: auto;', + # FIXME(willkg): This fails the first regxp gauntlet in sanitize_css. + # 'font: italic small-caps bolder condensed 16px/3 cursive;', + 'background-position: initial initial;', + 'background-repeat: initial initial;' + ] + html = 'Hello world
' % ' '.join(style) styles = [ 'border', 'float', 'overflow', 'min-height', 'vertical-align', 'white-space', @@ -120,12 +136,18 @@ def test_style_hang(): 'font', 'font-size', 'font-weight', 'text-align', 'text-transform', ] - expected = ("""""" - """Hello world
""") + expected = ( + 'Hello world
' + ) assert clean(html, styles=styles) == expected diff --git a/tests/test_links.py b/tests/test_links.py index ac38ee70..6b7a77eb 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -3,7 +3,7 @@ except ImportError: from urllib import quote_plus -from html5lib.tokenizer import HTMLTokenizer +# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC @@ -406,6 +406,7 @@ def test_end_of_clause(): ) +@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.+ This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. + The textarea below contains sample-payload - you can also add your own. Watch it sanitize in the textarea and iframe below. +
+clean on change
+This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. - The textarea below contains sample-payload - you can also add your own. Watch it sanitize in the textarea and iframe below. + Enter a sample payload in the textarea below and watch it sanitize in the textarea and iframe below.
clean on change
+ + +clean when dirty HTML changes
- This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. - Enter a sample payload in the textarea below and watch it sanitize in the textarea and iframe below. -
-clean when dirty HTML changes
-+ This is the demo for Bleach, a whitelist-based HTML sanitizing library that escapes or strips markup and attributes. + Enter a sample payload in the textarea below and watch it sanitize in the textarea and iframe below. +
+ +clean when dirty HTML changes
- + - - - + + +test
', tags=['p']) == 'test
' - - -def test_no_href_links(): - s = 'x' - assert bleach.linkify(s) == s - - -def test_weird_strings(): - s = '3' - assert bleach.clean(s) == '' +class TestClean: + def test_empty(self): + assert bleach.clean('') == '' + + def test_nbsp(self): + if six.PY3: + expected = '\xa0test string\xa0' + else: + expected = six.u('\\xa0test string\\xa0') + + assert bleach.clean(' test string ') == expected + + def test_comments_only(self): + comment = '' + open_comment = ''.format(open_comment) + ) + + def test_with_comments(self): + html = 'Just text' + assert 'Just text', bleach.clean(html) == 'Just text' + assert bleach.clean(html, strip_comments=False) == html + + def test_no_html(self): + assert bleach.clean('no html string') == 'no html string' + + def test_allowed_html(self): + assert ( + bleach.clean('an allowed tag') == + 'an allowed tag' + ) + assert ( + bleach.clean('another good tag') == + 'another good tag' + ) + + def test_bad_html(self): + assert ( + bleach.clean('a fixed tag') == + 'a fixed tag' + ) + + def test_function_arguments(self): + TAGS = ['span', 'br'] + ATTRS = {'span': ['style']} + + assert ( + bleach.clean('alink text
' + ) + s = 'multiply nested text
' + assert ( + bleach.clean(s, tags=['p'], strip=True) == + 'multiply nested text
' + ) + + s = ('') + assert ( + bleach.clean(s, tags=['p', 'a'], strip=True) == + '' + ) + + def test_allowed_styles(self): + ATTR = ['style'] + STYLE = ['color'] + blank = '' + s = '' + assert bleach.clean('', attributes=ATTR) == blank + assert bleach.clean(s, attributes=ATTR, styles=STYLE) == s + assert ( + bleach.clean('', attributes=ATTR, styles=STYLE) == + s + ) + + def test_lowercase_html(self): + """We should output lowercase HTML.""" + dirty = 'BAR' + clean = 'BAR' + assert bleach.clean(dirty, attributes=['class']) == clean + + def test_wildcard_attributes(self): + ATTR = { + '*': ['id'], + 'img': ['src'], + } + TAG = ['img', 'em'] + dirty = ('both can have ' + 'link text
' - ) - s = 'multiply nested text
' - assert ( - bleach.clean(s, tags=['p'], strip=True) == - 'multiply nested text
' - ) - - s = ('') - assert ( - bleach.clean(s, tags=['p', 'a'], strip=True) == - '' - ) - - -def test_allowed_styles(): - ATTR = ['style'] - STYLE = ['color'] - blank = '' - s = '' - assert bleach.clean('', attributes=ATTR) == blank - assert bleach.clean(s, attributes=ATTR, styles=STYLE) == s - assert ( - bleach.clean('', attributes=ATTR, styles=STYLE) == - s - ) - - def test_idempotent(): """Make sure that applying the filter twice doesn't change anything.""" dirty = 'invalid & < extra http://link.com' @@ -203,138 +309,8 @@ def test_idempotent(): ) -def test_rel_already_there(): - """Make sure rel attribute is updated not replaced""" - linked = ('Click ' - 'here.') - - link_good = 'Click here.' - - assert bleach.linkify(linked) == link_good - assert bleach.linkify(link_good) == link_good - - -def test_lowercase_html(): - """We should output lowercase HTML.""" - dirty = 'BAR' - clean = 'BAR' - assert bleach.clean(dirty, attributes=['class']) == clean - - -def test_wildcard_attributes(): - ATTR = { - '*': ['id'], - 'img': ['src'], - } - TAG = ['img', 'em'] - dirty = ('both can have ' - 'test
', tags=['p']) == 'test
' From b46c7ae058c1ad5f2d351f47e6e57a4dfa1591c3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greeneblah blah blah
', + ... tags=['p'], + ... attributes=['style'], + ... styles=['color'], + ... ) + u'blah blah blah
' + + +As a dict +--------- -The ``attributes`` kwarg is a whitelist of attributes. It can be a list, in -which case the attributes are allowed for any tag, or a dictionary, in which -case the keys are tag names (or a wildcard: ``*`` for all tags) and the values -are lists of allowed attributes. +The ``attributes`` value can be a dict, in which case the keys are tag names (or +a wildcard: ``*`` for all tags) and the values are lists of allowed attributes. For example: @@ -80,23 +108,19 @@ In this case, ``class`` is allowed on any allowed element (from the ``tags`` argument), ```` tags are allowed to have ``href`` and ``rel`` attributes, and so on. -The default value is also a conservative dict found in -``bleach.ALLOWED_ATTRIBUTES``. - -Callable Filters ----------------- +Using functions +--------------- -You can also use a callable (instead of a list) in the ``attributes`` kwarg. If -the callable returns ``True``, the attribute is allowed. Otherwise, it is -stripped. For example: +You can also use callables. If the callable returns ``True``, the attribute is +allowed. Otherwise, it is stripped. For example: .. doctest:: >>> from urlparse import urlparse >>> import bleach - >>> def filter_src(name, value): + >>> def allow_src(name, value): ... if name in ('alt', 'height', 'width'): ... return True ... if name == 'src': @@ -108,7 +132,7 @@ stripped. For example: ... u'
'
+ >>> bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter])
+ u'this is cute!
'
+ assert (
+ bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter]) ==
+ 'this is cute! `` sections.
-By default, ``linkify()`` will perform some sanitization, only allowing a set
-of "safe" tags. Because it uses the HTML5 parsing algorithm, it will always
-handle things like unclosed tags.
+By default, ``linkify()`` will perform some sanitization, only allowing a set of
+"safe" tags. Because it uses the HTML5 parsing algorithm, it will always handle
+things like unclosed tags.
.. note::
+
You may pass a ``string`` or ``unicode`` object, but Bleach will always
return ``unicode``.
+.. autofunction:: bleach.linkify
+
-Callbacks
-=========
+Callbacks for adjusting attributes (``callbacks``)
+==================================================
The second argument to ``linkify()`` is a list or other iterable of callback
functions. These callbacks can modify links that exist and links that are being
@@ -36,20 +35,23 @@ Each callback will get the following arguments::
def my_callback(attrs, new=False):
-The ``attrs`` argument is a dict of attributes of the ```` tag. The ``new``
-argument is a boolean indicating if the link is new (e.g. an email address or
-URL found in the text) or already existed (e.g. an ```` tag found in the
-text). The ``attrs`` dict also contains a ``_text`` key, which is the innerText
-of the ```` tag.
+The ``attrs`` argument is a dict of attributes of the ```` tag. Keys of the
+``attrs`` dict are namespaced attr names. For example ``(None, 'href')``. The
+``attrs`` dict also contains a ``_text`` key, which is the innerText of the
+```` tag.
-The callback must return a dict of attributes (including ``_text``) or
-``None``. The new dict of attributes will be passed to the next callback in the
-list. If any callback returns ``None``, the link will not be created and the
-original text left in place, or will be removed, and its original innerText
-left in place.
+The ``new`` argument is a boolean indicating if the link is new (e.g. an email
+address or URL found in the text) or already existed (e.g. an ```` tag found
+in the text).
-The default value is simply to add ``rel="nofollow"``. See ``bleach.callbacks``
-for some included callback functions.
+The callback must return a dict of attributes (including ``_text``) or ``None``.
+The new dict of attributes will be passed to the next callback in the list.
+
+If any callback returns ``None``, new links will not be created and existing
+links will be removed leaving the innerText left in its place.
+
+The default callback adds ``rel="nofollow"``. See ``bleach.callbacks`` for some
+included callback functions.
Setting Attributes
@@ -59,22 +61,24 @@ For example, to set ``rel="nofollow"`` on all links found in the text, a simple
(and included) callback might be::
def set_nofollow(attrs, new=False):
- attrs['rel'] = 'nofollow'
+ attrs[(None, 'rel')] = 'nofollow'
return attrs
-This would overwrite the value of the ``rel`` attribute if it was set.
-You could also make external links open in a new tab, or set a class::
+This would set the value of the ``rel`` attribute, stomping on a previous value
+if there was one.
+
+You could also make external links open in a new tab or set a class::
from urlparse import urlparse
def set_target(attrs, new=False):
- p = urlparse(attrs['href'])
+ p = urlparse(attrs[(None, 'href')])
if p.netloc not in ['my-domain.com', 'other-domain.com']:
- attrs['target'] = '_blank'
- attrs['class'] = 'external'
+ attrs[(None, 'target')] = '_blank'
+ attrs[(None, 'class')] = 'external'
else:
- attrs.pop('target', None)
+ attrs.pop((None, 'target'), None)
return attrs
@@ -89,18 +93,20 @@ sanitizing attributes.)
def allowed_attributes(attrs, new=False):
"""Only allow href, target, rel and title."""
- allowed = ['href', 'target', 'rel', 'title']
+ allowed = [(None, 'href'), (None, 'target'),
+ (None, 'rel'), (None, 'title')]
return dict((k, v) for k, v in attrs.items() if k in allowed)
+
Or you could remove a specific attribute, if it exists::
def remove_title1(attrs, new=False):
- attrs.pop('title', None)
+ attrs.pop((None, 'title'), None)
return attrs
def remove_title2(attrs, new=False):
- if 'title' in attrs:
- del attrs['title']
+ if (None, 'title') in attrs:
+ del attrs[(None, 'title')]
return attrs
@@ -117,6 +123,7 @@ limit the length of text inside an ```` tag.
"""Shorten overly-long URLs in the text."""
if not new: # Only looking at newly-created links.
return attrs
+
# _text will be the same as the URL for new links.
text = attrs['_text']
if len(text) > 25:
@@ -130,10 +137,10 @@ limit the length of text inside an ```` tag.
def outgoing_bouncer(attrs, new=False):
"""Send outgoing links through a bouncer."""
- p = urlparse(attrs['href'])
+ p = urlparse((None, attrs['href']))
if p.netloc not in ['my-domain.com', 'www.my-domain.com', '']:
bouncer = 'http://outgoing.my-domain.com/?destination=%s'
- attrs['href'] = bouncer % quote(attrs['href'])
+ attrs[(None, 'href')] = bouncer % quote(attrs['href'])
return attrs
@@ -151,7 +158,7 @@ write the following callback::
return attrs
# If the TLD is '.py', make sure it starts with http: or https:
- href = attrs['href']
+ href = attrs[(None, 'href')]
if href.endswith('.py') and not href.startswith(('http:', 'https:')):
# This looks like a Python file, not a URL. Don't make a link.
return None
@@ -168,13 +175,13 @@ If you want to remove certain links, even if they are written in the text with
def remove_mailto(attrs, new=False):
"""Remove any mailto: links."""
- if attrs['href'].startswith('mailto:'):
+ if attrs[(None, 'href')].startswith('mailto:'):
return None
return attrs
-``skip_pre``
-============
+Skipping links in pre blocks (``skip_pre``)
+===========================================
```` tags are often special, literal sections. If you don't want to create
any new links within a ```` section, pass ``skip_pre=True``.
@@ -184,8 +191,8 @@ any new links within a ```` section, pass ``skip_pre=True``.
tags will still be passed through all the callbacks.
-``parse_email``
-===============
+Linkifying email addresses (``parse_email``)
+============================================
By default, ``linkify()`` does not create ``mailto:`` links for email
addresses, but if you pass ``parse_email=True``, it will. ``mailto:`` links
@@ -194,4 +201,50 @@ they are newly created or already in the text, so be careful when writing
callbacks that may need to behave differently if the protocol is ``mailto:``.
+Using ``bleach.linkifier.LinkifyFilter``
+========================================
+
+``bleach.linkify`` works by paring an HTML fragment and then running it through
+the ``bleach.linkifier.LinkifyFilter`` when walking the tree and serializing it
+back into text.
+
+You can use this filter wherever you can use an html5lib Filter. For example, you
+could use it with ``bleach.Cleaner`` to clean and linkify in one step.
+
+For example, using all the defaults:
+
+.. doctest::
+
+ >>> from functools import partial
+
+ >>> from bleach import Cleaner
+ >>> from bleach.linkifier import LinkifyFilter
+
+ >>> cleaner = Cleaner(tags=['pre'])
+ >>> cleaner.clean('http://example.com
')
+ u'http://example.com
'
+
+ >>> cleaner = Cleaner(tags=['pre'], filters=[LinkifyFilter])
+ >>> cleaner.clean('http://example.com
')
+ u'http://example.com
'
+
+
+And passing parameters to ``LinkifyFilter``:
+
+.. doctest::
+
+ >>> from functools import partial
+
+ >>> from bleach import Cleaner
+ >>> from bleach.linkifier import LinkifyFilter
+
+ >>> cleaner = Cleaner(
+ ... tags=['pre'],
+ ... filters=[partial(LinkifyFilter, skip_pre=True)]
+ ... )
+ ...
+ >>> cleaner.clean('http://example.com
')
+ u'http://example.com
'
+
+
.. _Crate: https://crate.io/
From ddc39ec4a30c5a378976ca97664939c67420ebe5 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Sat, 4 Mar 2017 09:44:23 -0500
Subject: [PATCH 074/314] Minor code cleanup and comments
---
bleach/linkifier.py | 159 ++++++++++++++++++++++++++++----------------
bleach/sanitizer.py | 8 ++-
2 files changed, 110 insertions(+), 57 deletions(-)
diff --git a/bleach/linkifier.py b/bleach/linkifier.py
index b4ba2ea8..c6a8486a 100644
--- a/bleach/linkifier.py
+++ b/bleach/linkifier.py
@@ -54,6 +54,18 @@
class LinkifyFilter(Filter):
+ """html5lib filter that linkifies text
+
+ This will do the following:
+
+ * convert email addresses into links
+ * convert urls into links
+ * edit existing links by running them through callbacks--the default is to
+ add a ``rel="nofollow"``
+
+ This filter can be used anywhere html5lib filters can be used.
+
+ """
def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False):
super(LinkifyFilter, self).__init__(source)
@@ -62,6 +74,13 @@ def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False):
self.parse_email = parse_email
def apply_callbacks(self, attrs, is_new):
+ """Given an attrs dict and an is_new bool, runs through callbacks
+
+ Callbacks can return an adjusted attrs dict or None. In the case of
+ None, we stop going through callbacks and return that and the link gets
+ dropped.
+
+ """
for cb in self.callbacks:
attrs = cb(attrs, is_new)
if attrs is None:
@@ -70,6 +89,23 @@ def apply_callbacks(self, attrs, is_new):
def extract_character_data(self, token_list):
"""Extracts and squashes character sequences in a token stream"""
+ # FIXME(willkg): This is a terrible idea. What it does is drop all the
+ # tags from the token list and merge the Characters and SpaceCharacters
+ # tokens into a single text.
+ #
+ # So something like this::
+ #
+ # "" "" "some text" "" ""
+ #
+ # gets converted to "some text".
+ #
+ # This gets used to figure out the ``_text`` fauxttribute value for
+ # linkify callables.
+ #
+ # I'm not really sure how else to support that ``_text`` fauxttribute and
+ # maintain some modicum of backwards compatability with previous versions
+ # of Bleach.
+
out = []
for token in token_list:
token_type = token['type']
@@ -86,6 +122,7 @@ def handle_email_addresses(self, src_iter):
new_tokens = []
end = 0
+ # For each email address we find in the text
for match in email_re.finditer(text):
if match.start() > end:
new_tokens.append(
@@ -101,13 +138,13 @@ def handle_email_addresses(self, src_iter):
attrs = self.apply_callbacks(attrs, True)
if attrs is None:
- # Just add the text
+ # Just add the text--but not as a link
new_tokens.append(
{u'type': u'Characters', u'data': match.group(0)}
)
else:
- # Add a "a" tag
+ # Add an "a" tag for the new link
_text = attrs.pop(u'_text', '')
attrs = alphabetize_attributes(attrs)
new_tokens.extend([
@@ -118,6 +155,8 @@ def handle_email_addresses(self, src_iter):
end = match.end()
if new_tokens:
+ # Yield the adjusted set of tokens and then continue
+ # through the loop
if end < len(text):
new_tokens.append({u'type': u'Characters', u'data': text[end:]})
@@ -128,46 +167,58 @@ def handle_email_addresses(self, src_iter):
yield token
- def strip_wrapping_parentheses(self, fragment):
- """Strips wrapping parentheses"""
+ def strip_parentheses(self, fragment):
+ """Strips parentheses from before and after url"""
openp = closep = 0
# Count consecutive opening parentheses at the beginning of the
# fragment (string)
- for char in fragment:
- if char == '(':
- openp += 1
- else:
- break
-
- if openp:
- newer_frag = ''
- # Cut the consecutive opening brackets from the fragment
- fragment = fragment[openp:]
-
- # Reverse the fragment for easier detection of parentheses
- # inside the URL
- reverse_fragment = fragment[::-1]
- skip = False
- for char in reverse_fragment:
- if char == ')' and closep < openp and not skip:
- # Remove the closing parentheses if it has a matching
- # opening parentheses (they are balanced).
- closep += 1
- continue
-
- elif char != ')':
- # Do not remove ')' from the URL itself.
- skip = True
-
- newer_frag += char
-
- # Reverse fragment back
- fragment = newer_frag[::-1]
+ if fragment.startswith(u'('):
+ for char in fragment:
+ if char == '(':
+ openp += 1
+ else:
+ break
+
+ if openp:
+ newer_frag = ''
+
+ # Cut the consecutive opening brackets from the fragment
+ fragment = fragment[openp:]
+
+ # Reverse the fragment for easier detection of parentheses
+ # inside the URL
+ reverse_fragment = fragment[::-1]
+ skip = False
+ for char in reverse_fragment:
+ if char == ')' and closep < openp and not skip:
+ # Remove the closing parentheses if it has a matching
+ # opening parentheses (they are balanced).
+ closep += 1
+ continue
+
+ elif char != ')':
+ # Do not remove ')' from the URL itself.
+ skip = True
+
+ newer_frag += char
+
+ # Reverse fragment back
+ fragment = newer_frag[::-1]
+
+ # Sometimes we pick up ) at the end of a url, but the url is in a
+ # parenthesized phrase like:
+ #
+ # "i looked at the site (at http://example.com)"
+ if fragment.endswith(u')') and u'(' not in fragment:
+ new_fragment = fragment.rstrip(u')')
+ closep += (len(fragment) - len(new_fragment))
+ fragment = new_fragment
return fragment, u'(' * openp, u')' * closep
def strip_punctuation(self, fragment):
+ """Strips punctuation at the end of a url match"""
match = re.search(punct_re, fragment)
if match:
return fragment[0:match.start()], match.group(0)
@@ -192,19 +243,12 @@ def handle_links(self, src_iter):
prefix = suffix = ''
# Sometimes we pick up ( and ), so drop them from the url
- if url.startswith('('):
- url, prefix, suffix = self.strip_wrapping_parentheses(url)
-
- if url.endswith(u')') and u'(' not in url:
- new_url = url.rstrip(u')')
- suffix = url[len(new_url):] + suffix
- url = new_url
+ url, prefix, suffix = self.strip_parentheses(url)
# Sometimes we pick up . and , at the end of the url that's
# part of the sentence and not the url so drop it
url, punct_suffix = self.strip_punctuation(url)
- if punct_suffix:
- suffix = suffix + punct_suffix
+ suffix = suffix + punct_suffix
# If there's no protocol, add one
if re.search(proto_re, url):
@@ -218,19 +262,20 @@ def handle_links(self, src_iter):
}
attrs = self.apply_callbacks(attrs, True)
- if prefix:
- new_tokens.append(
- {u'type': u'Characters', u'data': prefix}
- )
-
if attrs is None:
# Just add the text
new_tokens.append(
- {u'type': u'Characters', u'data': url}
+ {u'type': u'Characters', u'data': prefix + url + suffix}
)
else:
- # Add an "a" tag!
+ # Add the "a" tag!
+
+ if prefix:
+ new_tokens.append(
+ {u'type': u'Characters', u'data': prefix}
+ )
+
_text = attrs.pop(u'_text', '')
attrs = alphabetize_attributes(attrs)
@@ -240,14 +285,16 @@ def handle_links(self, src_iter):
{u'type': u'EndTag', u'name': 'a'},
])
- if suffix:
- new_tokens.append(
- {u'type': u'Characters', u'data': suffix}
- )
+ if suffix:
+ new_tokens.append(
+ {u'type': u'Characters', u'data': suffix}
+ )
end = match.end()
if new_tokens:
+ # Yield the adjusted set of tokens and then continue
+ # through the loop
if end < len(text):
new_tokens.append({u'type': u'Characters', u'data': text[end:]})
@@ -334,9 +381,9 @@ def __iter__(self):
# yet
continue
- elif in_pre:
+ elif in_pre and self.skip_pre:
# NOTE(willkg): We put this clause here since in_a and
- # switching in and out of is_a takes precedence.
+ # switching in and out of in_a takes precedence.
if token['type'] == 'EndTag' and token['name'] == 'pre':
in_pre = False
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 610dd903..18ce49f4 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -9,6 +9,11 @@
class BleachSanitizerFilter(sanitizer.Filter):
+ """html5lib Filter that sanitizes text
+
+ This filter can be used anywhere html5lib filters can be used.
+
+ """
def __init__(self, source, allowed_attributes_map,
strip_disallowed_elements=False, strip_html_comments=True,
**kwargs):
@@ -60,6 +65,7 @@ def sanitize_token(self, token):
return token
def allow_token(self, token):
+ """Handles the case where we're allowing the tag"""
if 'data' in token:
allowed_attributes = self.allowed_attributes_map.get(token['name'], [])
if not callable(allowed_attributes):
@@ -131,7 +137,7 @@ def allow_token(self, token):
return token
def sanitize_css(self, style):
- """html5lib sanitizer filter replacement to fix issues"""
+ """Sanitizes css in style tags"""
# disallow urls
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
From 6968e5d8eec7be235ea167d97c73ea2937ad0a59 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Sat, 4 Mar 2017 09:52:37 -0500
Subject: [PATCH 075/314] Add tests for alphabetize_attributes
---
tests/test_utils.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)
create mode 100644 tests/test_utils.py
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 00000000..076617df
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,44 @@
+from collections import OrderedDict
+
+from bleach.utils import alphabetize_attributes
+
+
+class TestAlphabeticalAttributes:
+ def test_empty_cases(self):
+ assert alphabetize_attributes(None) is None
+
+ assert alphabetize_attributes({}) == {}
+
+ def test_ordering(self):
+ assert (
+ alphabetize_attributes({
+ (None, 'a'): 1,
+ (None, 'b'): 2
+ }) ==
+ OrderedDict([
+ ((None, 'a'), 1),
+ ((None, 'b'), 2)
+ ])
+ )
+ assert (
+ alphabetize_attributes({
+ (None, 'b'): 1,
+ (None, 'a'): 2}
+ ) ==
+ OrderedDict([
+ ((None, 'a'), 2),
+ ((None, 'b'), 1)
+ ])
+ )
+
+ def test_different_namespaces(self):
+ assert (
+ alphabetize_attributes({
+ ('xlink', 'href'): 'abc',
+ (None, 'alt'): '123'
+ }) ==
+ OrderedDict([
+ ((None, 'alt'), '123'),
+ (('xlink', 'href'), 'abc')
+ ])
+ )
From a1a85e9226e2be45a4eded6680b5ede4b2fa1e4c Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Sat, 4 Mar 2017 10:37:37 -0500
Subject: [PATCH 076/314] Fix handling for over-eager url matching
---
bleach/linkifier.py | 100 +++++++++++++++++++-------------------------
1 file changed, 42 insertions(+), 58 deletions(-)
diff --git a/bleach/linkifier.py b/bleach/linkifier.py
index c6a8486a..04ab8275 100644
--- a/bleach/linkifier.py
+++ b/bleach/linkifier.py
@@ -40,8 +40,6 @@
proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
-punct_re = re.compile(r'([\.,]+)$')
-
email_re = re.compile(
r"""(?
Date: Sat, 4 Mar 2017 10:41:20 -0500
Subject: [PATCH 077/314] Add tests from #78
---
tests/test_links.py | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/tests/test_links.py b/tests/test_links.py
index 1712d199..8e166543 100644
--- a/tests/test_links.py
+++ b/tests/test_links.py
@@ -467,6 +467,14 @@ def test_sarcasm():
'(http://en.wikipedia.org/wiki/)Test_(assessment',
('(', 'en.wikipedia.org/wiki/)Test_(assessment',
'http://en.wikipedia.org/wiki/)Test_(assessment', '')
+ ),
+ (
+ 'hello (http://www.mu.de/blah.html) world',
+ ('hello (', 'www.mu.de/blah.html', 'http://www.mu.de/blah.html', ') world')
+ ),
+ (
+ 'hello (http://www.mu.de/blah.html). world',
+ ('hello (', 'www.mu.de/blah.html', 'http://www.mu.de/blah.html', '). world')
)
])
def test_wrapping_parentheses(data, expected_data):
From bb44c173700853c0da33a1cbb43632e95f54e885 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Sat, 4 Mar 2017 10:53:43 -0500
Subject: [PATCH 078/314] Move "a" tag handling to a method
---
bleach/linkifier.py | 92 +++++++++++++++++++++++++--------------------
1 file changed, 51 insertions(+), 41 deletions(-)
diff --git a/bleach/linkifier.py b/bleach/linkifier.py
index 04ab8275..a3e46009 100644
--- a/bleach/linkifier.py
+++ b/bleach/linkifier.py
@@ -254,7 +254,6 @@ def handle_links(self, src_iter):
else:
# Add the "a" tag!
-
if prefix:
new_tokens.append(
{u'type': u'Characters', u'data': prefix}
@@ -289,6 +288,49 @@ def handle_links(self, src_iter):
yield token
+ def handle_a_tag(self, token_buffer):
+ """Handle the "a" tag
+
+ This could adjust the link or drop it altogether depending on what the
+ callbacks return.
+
+ This yields the new set of tokens.
+
+ """
+ a_token = token_buffer[0]
+ if a_token['data']:
+ attrs = a_token['data']
+ else:
+ attrs = {}
+ text = self.extract_character_data(token_buffer)
+ attrs['_text'] = text
+
+ attrs = self.apply_callbacks(attrs, False)
+
+ if attrs is None:
+ # We're dropping the "a" tag and everything else and replacing
+ # it with character data. So emit that token.
+ yield {'type': 'Characters', 'data': text}
+
+ else:
+ new_text = attrs.pop('_text', '')
+ a_token['data'] = alphabetize_attributes(attrs)
+
+ if text == new_text:
+ # The callbacks didn't change the text, so we yield the new "a"
+ # token, then whatever else was there, then the end "a" token
+ yield a_token
+ for mem in token_buffer[1:]:
+ yield mem
+
+ else:
+ # If the callbacks changed the text, then we're going to drop
+ # all the tokens between the start and end "a" tags and replace
+ # it with the new text
+ yield a_token
+ yield {'type': 'Characters', 'data': force_unicode(new_text)}
+ yield token_buffer[-1]
+
def __iter__(self):
in_a = False
in_pre = False
@@ -300,47 +342,15 @@ def __iter__(self):
# Handle the case where we're in an "a" tag--we want to buffer tokens
# until we hit an end "a" tag.
if token['type'] == 'EndTag' and token['name'] == 'a':
- # We're no longer in an "a" tag, so we get all the things we
- # need to apply callbacks and then figure out what to do with
- # this "a" tag.
- in_a = False
- a_token = token_buffer[0]
- if a_token['data']:
- attrs = a_token['data']
- else:
- attrs = {}
-
- text = self.extract_character_data(token_buffer)
- attrs['_text'] = text
-
- attrs = self.apply_callbacks(attrs, False)
- if attrs is None:
- # We're dropping the "a" tag and everything else and replacing
- # it with character data. So emit that token.
- yield {'type': 'Characters', 'data': text}
-
- else:
- new_text = attrs.pop('_text', '')
- # FIXME(willkg): add nofollow here
- a_token['data'] = alphabetize_attributes(attrs)
-
- if text == new_text:
- # The callbacks didn't change the text, so we yield the
- # new "a" token, then whatever else was there, then the
- # end "a" token
- yield a_token
- for mem in token_buffer[1:]:
- yield mem
- yield token
-
- else:
- # If the callbacks changed the text, then we're going
- # to drop all the tokens between the start and end "a"
- # tags and replace it with the new text
- yield a_token
- yield {'type': 'Characters', 'data': force_unicode(new_text)}
- yield token
+ # Add the end tag to the token buffer and then handle them
+ # and yield anything returned
+ token_buffer.append(token)
+ for new_token in self.handle_a_tag(token_buffer):
+ yield new_token
+ # Clear "a" related state and continue since we've yielded all
+ # the tokens we're going to yield
+ in_a = False
token_buffer = []
continue
From 460aa6d3a95f5cc89f9589e8398491f3a5e2180d Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 6 Mar 2017 13:55:05 -0500
Subject: [PATCH 079/314] Restructure linkify; clean up __init__; update docs
This is a ton of changes all in one commit. Sorry.
* Restructures linkify so it mirrors clean. This has the nicety in that
the two are parallel and can be used the same.
* Rework how url_re and email_re work so that it's possible to override
them and/or provide your own list of allowed protocols and TLDs
* Overhaul the docs including converting linkify examples to doctest
* Update CHANGES
---
CHANGES | 23 ++--
bleach/__init__.py | 245 +++++++++---------------------------------
bleach/linkifier.py | 100 ++++++++++++++---
bleach/sanitizer.py | 142 ++++++++++++++++++++++++
docs/clean.rst | 38 ++++---
docs/linkify.rst | 250 +++++++++++++++++++++++++++++--------------
tests/test_basics.py | 8 +-
tests/test_links.py | 44 ++++++--
8 files changed, 518 insertions(+), 332 deletions(-)
diff --git a/CHANGES b/CHANGES
index d7d9b0d2..9afe859f 100644
--- a/CHANGES
+++ b/CHANGES
@@ -17,7 +17,7 @@ Version 2.0 (in development)
* ``bleach.clean`` and friends were rewritten
- ``clean`` is now implemented as an html5lib Filter and happens at a different
+ ``clean`` was reimplemented as an html5lib filter and happens at a different
step in the HTML parsing -> traversing -> serializing process. Because of
that, there are some differences in clean's output as compared with previous
versions.
@@ -43,11 +43,14 @@ Version 2.0 (in development)
Now it's more like this::
def check_protocol(attrs, is_new):
- if not attrs.get((None, 'href'), '').startswith(('http:', 'https:')):
+ if not attrs.get((None, u'href'), u'').startswith(('http:', 'https:')):
# ^^^^^^^^^^^^^^
return None
return attrs
+ Further, you need to make sure you're always using unicode values. If you
+ don't then html5lib will raise an assertion error that the value is not
+ unicode.
**Changes**
@@ -55,17 +58,19 @@ Version 2.0 (in development)
* Supports html5lib >= 0.99999999 (8 9s).
-* There's a ``bleach.Cleaner`` class that you can instantiate with your
- favorite clean settings and reuse it.
+* There's a ``bleach.sanitizer.Cleaner`` class that you can instantiate with your
+ favorite clean settings for easy reuse.
-* There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib Filter.
+* There's a ``bleach.linkifier.Linker`` class that you can instantiate with your
+ favorite linkify settings for easy reuse.
-* You can pass ``bleach.linkifier.LinkifyFilter`` as a Filter to
- ``bleach.Cleaner`` allowing you to clean and linkify in one pass.
+* There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib filter that
+ you can pass as a filter to ``bleach.Cleaner`` allowing you to clean and
+ linkify in one pass.
-* Lots of bug fixes.
+* Tons of bug fixes.
-* Test cleanup.
+* Cleaned up tests.
* Documentation fixes.
diff --git a/bleach/__init__.py b/bleach/__init__.py
index 0155a127..07b5075c 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -1,169 +1,28 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import logging
-import re
-
-import html5lib
-from html5lib.filters import sanitizer
-from html5lib.filters.sanitizer import allowed_protocols
-from html5lib.serializer import HTMLSerializer
-
-from bleach import callbacks as linkify_callbacks
-from bleach.encoding import force_unicode
-from bleach.linkifier import LinkifyFilter
-from bleach.sanitizer import BleachSanitizerFilter
-from bleach.version import __version__, VERSION # flake8: noqa
-
-__all__ = ['Cleaner', 'clean', 'linkify']
-
-log = logging.getLogger(__name__)
-log.addHandler(logging.NullHandler())
-
-ALLOWED_TAGS = [
- 'a',
- 'abbr',
- 'acronym',
- 'b',
- 'blockquote',
- 'code',
- 'em',
- 'i',
- 'li',
- 'ol',
- 'strong',
- 'ul',
-]
-
-ALLOWED_ATTRIBUTES = {
- 'a': ['href', 'title'],
- 'abbr': ['title'],
- 'acronym': ['title'],
-}
-
-ALLOWED_STYLES = []
-
-ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
-
-ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
-# a simple routine that returns the tag name with the namespace prefix
-# as returned by etree's Element.tag attribute
-
-DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
-
-
-class Cleaner(object):
- """Cleaner for cleaning HTML fragments of malicious content
-
- This cleaner is a security-focused function whose sole purpose is to remove
- malicious content from a string such that it can be displayed as content in
- a web page.
-
- This cleaner is not designed to use to transform content to be used in
- non-web-page contexts.
-
- To use::
-
- from bleach import Cleaner
-
- cleaner = Cleaner()
-
- for text in all_the_yucky_things:
- sanitized = cleaner.clean(text)
-
- """
-
- def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
- styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
- strip_comments=True, filters=None):
- """Initializes a Cleaner
-
- :arg tags: whitelist of allowed tags; defaults to
- ``bleach.ALLOWED_TAGS``
-
- :arg attributes: whitelist of allowed attributes; defaults to
- ``bleach.ALLOWED_ATTRIBUTES``
-
- :arg styles: whitelist of allowed css; defaults to
- ``bleach.ALLOWED_STYLES``
-
- :arg protocols: whitelist of allowed protocols for links; defaults
- to ``bleach.ALLOWED_PROTOCOLS``
-
- :arg strip: whether or not to strip disallowed elements
-
- :arg strip_comments: whether or not to strip HTML comments
-
- :arg filters: list of html5lib Filter classes to pass streamed content through
-
- See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
- .. Warning::
-
- Using filters changes the output of ``bleach.Cleaner.clean``.
- Make sure the way the filters change the output are secure.
-
- """
- self.tags = tags
- self.attributes = attributes
- self.styles = styles
- self.protocols = protocols
- self.strip = strip
- self.strip_comments = strip_comments
- self.filters = filters or []
-
- self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
- self.walker = html5lib.getTreeWalker('etree')
- self.serializer = HTMLSerializer(
- quote_attr_values='always',
- omit_optional_tags=False,
-
- # Bleach has its own sanitizer, so don't use the html5lib one
- sanitize=False,
-
- # Bleach sanitizer alphabetizes already, so don't use the html5lib one
- alphabetical_attributes=False,
- )
-
- def clean(self, text):
- """Cleans text and returns sanitized result as unicode
-
- :arg str text: text to be cleaned
-
- :returns: sanitized text as unicode
-
- """
- if not text:
- return u''
-
- text = force_unicode(text)
-
- dom = self.parser.parseFragment(text)
- filtered = BleachSanitizerFilter(
- source=self.walker(dom),
-
- # Bleach-sanitizer-specific things
- allowed_attributes_map=self.attributes,
- strip_disallowed_elements=self.strip,
- strip_html_comments=self.strip_comments,
-
- # html5lib-sanitizer things
- allowed_elements=self.tags,
- allowed_css_properties=self.styles,
- allowed_protocols=self.protocols,
- allowed_svg_properties=[],
- )
-
- # Apply any filters after the BleachSanitizerFilter
- for filter_class in self.filters:
- filtered = filter_class(source=filtered)
+from bleach.linkifier import (
+ DEFAULT_CALLBACKS,
+ Linker,
+ LinkifyFilter,
+)
+from bleach.sanitizer import (
+ ALLOWED_ATTRIBUTES,
+ ALLOWED_PROTOCOLS,
+ ALLOWED_STYLES,
+ ALLOWED_TAGS,
+ BleachSanitizerFilter,
+ Cleaner,
+)
+from bleach.version import __version__, VERSION # flake8: noqa
- return self.serializer.render(filtered)
+__all__ = ['clean', 'linkify']
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
- strip_comments=True, filters=None):
+ strip_comments=True):
"""Clean an HTML fragment of malicious content and return it
This function is a security-focused function whose sole purpose is to
@@ -182,36 +41,27 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
.. Note::
- If you're cleaning a lot of text and passing the same argument
- values, consider caching a ``Cleaner`` instance.
+ If you're cleaning a lot of text and passing the same argument values or
+ you want more configurability, consider using a
+ :py:class:`bleach.sanitizer.Cleaner` instance.
- :arg text: the text to clean
+ :arg str text: the text to clean
- :arg tags: whitelist of allowed tags; defaults to
+ :arg list tags: whitelist of allowed tags; defaults to
``bleach.ALLOWED_TAGS``
- :arg attributes: whitelist of allowed attributes; defaults to
+ :arg dict attributes: whitelist of allowed attributes; defaults to
``bleach.ALLOWED_ATTRIBUTES``
- :arg styles: whitelist of allowed css; defaults to
+ :arg list styles: whitelist of allowed css; defaults to
``bleach.ALLOWED_STYLES``
- :arg protocols: whitelist of allowed protocols for links; defaults
+ :arg list protocols: whitelist of allowed protocols for links; defaults
to ``bleach.ALLOWED_PROTOCOLS``
- :arg strip: whether or not to strip disallowed elements
-
- :arg strip_comments: whether or not to strip HTML comments
-
- :arg filters: list of html5lib Filter classes to pass streamed content through
-
- See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
+ :arg bool strip: whether or not to strip disallowed elements
- .. Warning::
-
- Using filters changes the output of
- ``bleach.Cleaner.clean``. Make sure the way the filters
- change the output are secure.
+ :arg bool strip_comments: whether or not to strip HTML comments
:returns: cleaned text as unicode
@@ -223,7 +73,6 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
protocols=protocols,
strip=strip,
strip_comments=strip_comments,
- filters=filters,
)
return cleaner.clean(text)
@@ -231,40 +80,42 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False):
"""Convert URL-like strings in an HTML fragment to links
- ``linkify()`` converts strings that look like URLs, domain names and email
+ This function converts strings that look like URLs, domain names and email
addresses in text that may be an HTML fragment to links, while preserving:
1. links already in the string
2. urls found in attributes
3. email addresses
- ``linkify()`` does a best-effort approach and tries to recover from bad
+ linkify does a best-effort approach and tries to recover from bad
situations due to crazy text.
- """
- parser = html5lib.HTMLParser(namespaceHTMLElements=False)
- walker = html5lib.getTreeWalker('etree')
- serializer = HTMLSerializer(
- quote_attr_values='always',
- omit_optional_tags=False,
+ .. Note::
- # Bleach has its own sanitizer, so don't use the html5lib one
- sanitize=False,
+ If you're linking a lot of text and passing the same argument values or
+ you want more configurability, consider using a
+ :py:class:`bleach.linkifier.Linker` instance.
- # Bleach sanitizer alphabetizes already, so don't use the html5lib one
- alphabetical_attributes=False,
- )
+ .. Note::
+
+ If you have text that you want to clean and then linkify, consider using
+ the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
+ pass. That way you're not parsing the HTML twice.
+
+ :arg str text: the text to linkify
- text = force_unicode(text)
+ :arg list callbacks: list of callbacks to run when adjusting tag attributes
- if not text:
- return u''
+ :arg bool skip_pre: whether or not to skip linkifying text in a ``pre`` tag
- dom = parser.parseFragment(text)
- filtered = LinkifyFilter(
- source=walker(dom),
+ :arg bool parse_email: whether or not to linkify email addresses
+
+ :returns: linkified text as unicode
+
+ """
+ linker = Linker(
callbacks=callbacks,
skip_pre=skip_pre,
parse_email=parse_email
)
- return serializer.render(filtered)
+ return linker.linkify(text)
diff --git a/bleach/linkifier.py b/bleach/linkifier.py
index a3e46009..1396f056 100644
--- a/bleach/linkifier.py
+++ b/bleach/linkifier.py
@@ -1,14 +1,19 @@
from __future__ import unicode_literals
import re
+import html5lib
from html5lib.filters.base import Filter
+from html5lib.filters.sanitizer import allowed_protocols
+from html5lib.serializer import HTMLSerializer
-from bleach import allowed_protocols
+from bleach import callbacks as linkify_callbacks
from bleach.encoding import force_unicode
from bleach.utils import alphabetize_attributes
-# FIXME(willkg): Move this to a constants module.
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+
+
TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
@@ -27,20 +32,37 @@
TLDS.reverse()
-url_re = re.compile(
- r"""\(* # Match any opening parentheses.
- \b(?"]*)?
- # /path/zz (excluding "unsafe" chars from RFC 1738,
- # except for # and ~, which happen in practice)
- """.format('|'.join(allowed_protocols), '|'.join(TLDS)),
- re.IGNORECASE | re.VERBOSE | re.UNICODE)
+def build_url_re(tlds=TLDS, protocols=allowed_protocols):
+ """Builds the url regex used by linkifier
+
+ If you want a different set of tlds or allowed protocols, pass those in
+ and stomp on the existing ``url_re``::
+
+ from bleach import linkifier
+
+ my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
+
+ linker = LinkifyFilter(url_re=my_url_re)
+
+ """
+ return re.compile(
+ r"""\(* # Match any opening parentheses.
+ \b(?"]*)?
+ # /path/zz (excluding "unsafe" chars from RFC 1738,
+ # except for # and ~, which happen in practice)
+ """.format('|'.join(protocols), '|'.join(tlds)),
+ re.IGNORECASE | re.VERBOSE | re.UNICODE)
+
+
+URL_RE = build_url_re()
+
+PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
-proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
-email_re = re.compile(
+EMAIL_RE = re.compile(
r"""(? end:
new_tokens.append(
{u'type': u'Characters', u'data': text[end:match.start()]}
@@ -221,7 +287,7 @@ def handle_links(self, src_iter):
new_tokens = []
end = 0
- for match in url_re.finditer(text):
+ for match in self.url_re.finditer(text):
if match.start() > end:
new_tokens.append(
{u'type': u'Characters', u'data': text[end:match.start()]}
@@ -235,7 +301,7 @@ def handle_links(self, src_iter):
url, prefix, suffix = self.strip_non_url_bits(url)
# If there's no protocol, add one
- if re.search(proto_re, url):
+ if PROTO_RE.search(url):
href = url
else:
href = u'http://%s' % url
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 18ce49f4..fcbcd915 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -2,12 +2,154 @@
import re
from xml.sax.saxutils import unescape
+import html5lib
from html5lib.constants import namespaces
from html5lib.filters import sanitizer
+from html5lib.serializer import HTMLSerializer
+from bleach.encoding import force_unicode
from bleach.utils import alphabetize_attributes
+ALLOWED_TAGS = [
+ 'a',
+ 'abbr',
+ 'acronym',
+ 'b',
+ 'blockquote',
+ 'code',
+ 'em',
+ 'i',
+ 'li',
+ 'ol',
+ 'strong',
+ 'ul',
+]
+
+ALLOWED_ATTRIBUTES = {
+ 'a': ['href', 'title'],
+ 'abbr': ['title'],
+ 'acronym': ['title'],
+}
+
+ALLOWED_STYLES = []
+
+ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
+
+ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
+# a simple routine that returns the tag name with the namespace prefix
+# as returned by etree's Element.tag attribute
+
+
+class Cleaner(object):
+ """Cleaner for cleaning HTML fragments of malicious content
+
+ This cleaner is a security-focused function whose sole purpose is to remove
+ malicious content from a string such that it can be displayed as content in
+ a web page.
+
+ This cleaner is not designed to use to transform content to be used in
+ non-web-page contexts.
+
+ To use::
+
+ from bleach import Cleaner
+
+ cleaner = Cleaner()
+
+ for text in all_the_yucky_things:
+ sanitized = cleaner.clean(text)
+
+ """
+
+ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+ styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
+ strip_comments=True, filters=None):
+ """Initializes a Cleaner
+
+ :arg tags: whitelist of allowed tags; defaults to
+ ``bleach.ALLOWED_TAGS``
+
+ :arg attributes: whitelist of allowed attributes; defaults to
+ ``bleach.ALLOWED_ATTRIBUTES``
+
+ :arg styles: whitelist of allowed css; defaults to
+ ``bleach.ALLOWED_STYLES``
+
+ :arg protocols: whitelist of allowed protocols for links; defaults
+ to ``bleach.ALLOWED_PROTOCOLS``
+
+ :arg strip: whether or not to strip disallowed elements
+
+ :arg strip_comments: whether or not to strip HTML comments
+
+ :arg filters: list of html5lib Filter classes to pass streamed content through
+
+ See http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
+
+ .. Warning::
+
+ Using filters changes the output of ``bleach.Cleaner.clean``.
+ Make sure the way the filters change the output are secure.
+
+ """
+ self.tags = tags
+ self.attributes = attributes
+ self.styles = styles
+ self.protocols = protocols
+ self.strip = strip
+ self.strip_comments = strip_comments
+ self.filters = filters or []
+
+ self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
+ self.walker = html5lib.getTreeWalker('etree')
+ self.serializer = HTMLSerializer(
+ quote_attr_values='always',
+ omit_optional_tags=False,
+
+ # Bleach has its own sanitizer, so don't use the html5lib one
+ sanitize=False,
+
+ # Bleach sanitizer alphabetizes already, so don't use the html5lib one
+ alphabetical_attributes=False,
+ )
+
+ def clean(self, text):
+ """Cleans text and returns sanitized result as unicode
+
+ :arg str text: text to be cleaned
+
+ :returns: sanitized text as unicode
+
+ """
+ if not text:
+ return u''
+
+ text = force_unicode(text)
+
+ dom = self.parser.parseFragment(text)
+ filtered = BleachSanitizerFilter(
+ source=self.walker(dom),
+
+ # Bleach-sanitizer-specific things
+ allowed_attributes_map=self.attributes,
+ strip_disallowed_elements=self.strip,
+ strip_html_comments=self.strip_comments,
+
+ # html5lib-sanitizer things
+ allowed_elements=self.tags,
+ allowed_css_properties=self.styles,
+ allowed_protocols=self.protocols,
+ allowed_svg_properties=[],
+ )
+
+ # Apply any filters after the BleachSanitizerFilter
+ for filter_class in self.filters:
+ filtered = filter_class(source=filtered)
+
+ return self.serializer.render(filtered)
+
+
class BleachSanitizerFilter(sanitizer.Filter):
"""html5lib Filter that sanitizes text
diff --git a/docs/clean.rst b/docs/clean.rst
index e281e2ca..161e4357 100644
--- a/docs/clean.rst
+++ b/docs/clean.rst
@@ -214,6 +214,7 @@ whitelist and invalid markup. For example:
>>> bleach.clean('is not allowed')
u'<span>is not allowed</span>'
+
>>> bleach.clean('is not allowed', tags=['b'])
u'<span>is not allowed</span>'
@@ -227,6 +228,7 @@ If you would rather Bleach stripped this markup entirely, you can pass
>>> bleach.clean('is not allowed', strip=True)
u'is not allowed'
+
>>> bleach.clean('is not allowed', tags=['b'], strip=True)
u'is not allowed'
@@ -250,10 +252,20 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set
u'my html'
+Using ``bleach.sanitizer.Cleaner``
+==================================
+
+If you're cleaning a lot of text or you need better control of things, you
+should create a :py:class:`bleach.sanitizer.Cleaner` instance.
+
+.. autoclass:: bleach.sanitizer.Cleaner
+ :members:
+
+
html5lib Filters (``filters``)
-==============================
+------------------------------
-Bleach sanitizing is implemented as an html5lib Filter. The consequence of this
+Bleach sanitizing is implemented as an html5lib filter. The consequence of this
is that we can pass the streamed content through additional specified filters
after the :py:class:`bleach.sanitizer.BleachSanitizingFilter` filter has run.
@@ -267,7 +279,7 @@ Trivial Filter example:
.. doctest::
- >>> import bleach
+ >>> from bleach.sanitizer import Cleaner
>>> from html5lib.filters.base import Filter
>>> class MooFilter(Filter):
@@ -283,8 +295,9 @@ Trivial Filter example:
... }
...
>>> TAGS = ['img']
+ >>> cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter])
>>> dirty = 'this is cute!
'
- >>> bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter])
+ >>> cleaner.clean(dirty)
u'this is cute!
'
@@ -294,20 +307,11 @@ Trivial Filter example:
filter is applying maintain the safety guarantees of the output.
-Using ``bleach.Cleaner``
-========================
-
-If you're cleaning a lot of text, you might want to create a
-:py:class:`bleach.Cleaner` instance.
-
-.. autoclass:: bleach.Cleaner
- :members:
-
-
Using ``bleach.sanitizer.BleachSanitizerFilter``
================================================
-``bleach.clean`` creates a ``bleach.Cleaner`` which creates a
+``bleach.clean`` creates a ``bleach.sanitizer.Cleaner`` which creates a
``bleach.sanitizer.BleachSanitizerFilter`` which does the sanitizing work.
-``BleachSanitizerFilter`` is an html5lib Filter and can be used anywhere you can
-use an html5lib Filter.
+
+``BleachSanitizerFilter`` is an html5lib filter and can be used anywhere you can
+use an html5lib filter.
diff --git a/docs/linkify.rst b/docs/linkify.rst
index 6fe032ed..a468830c 100644
--- a/docs/linkify.rst
+++ b/docs/linkify.rst
@@ -5,22 +5,19 @@
Linkifying text fragments
=========================
-``linkify()`` searches text for links, URLs, and email addresses and lets you
-control how and when those links are rendered.
+:py:func:`bleach.linkify` searches text for links, URLs, and email addresses and
+lets you control how and when those links are rendered.
-``linkify()`` works by building a document tree, so it's guaranteed never to do
-weird things to URLs in attribute values, can modify the value of attributes on
-```` tags, and can even do things like skip ```` sections.
-
-By default, ``linkify()`` will perform some sanitization, only allowing a set of
-"safe" tags. Because it uses the HTML5 parsing algorithm, it will always handle
-things like unclosed tags.
+It works by building a document tree, so it's guaranteed never to do weird
+things to URLs in attribute values, can modify the value of attributes on
+```` tags and can even do things like skip ```` sections.
.. note::
You may pass a ``string`` or ``unicode`` object, but Bleach will always
return ``unicode``.
+
.. autofunction:: bleach.linkify
@@ -57,29 +54,44 @@ included callback functions.
Setting Attributes
------------------
-For example, to set ``rel="nofollow"`` on all links found in the text, a simple
-(and included) callback might be::
+For example, you could add a ``title`` attribute to all links:
+
+.. doctest::
+
+ >>> from bleach.linkifier import Linker
- def set_nofollow(attrs, new=False):
- attrs[(None, 'rel')] = 'nofollow'
- return attrs
+ >>> def set_title(attrs, new=False):
+ ... attrs[(None, u'title')] = u'link in user text'
+ ... return attrs
+ ...
+ >>> linker = Linker(callbacks=[set_title])
+ >>> linker.linkify('abc http://example.com def')
+ u'abc http://example.com def'
This would set the value of the ``rel`` attribute, stomping on a previous value
if there was one.
-You could also make external links open in a new tab or set a class::
+Here's another example that makes external links open in a new tab and look like
+an external link:
- from urlparse import urlparse
+.. doctest::
- def set_target(attrs, new=False):
- p = urlparse(attrs[(None, 'href')])
- if p.netloc not in ['my-domain.com', 'other-domain.com']:
- attrs[(None, 'target')] = '_blank'
- attrs[(None, 'class')] = 'external'
- else:
- attrs.pop((None, 'target'), None)
- return attrs
+ >>> from urlparse import urlparse
+ >>> from bleach.linkifier import Linker
+
+ >>> def set_target(attrs, new=False):
+ ... p = urlparse(attrs[(None, u'href')])
+ ... if p.netloc not in ['my-domain.com', 'other-domain.com']:
+ ... attrs[(None, u'target')] = u'_blank'
+ ... attrs[(None, u'class')] = u'external'
+ ... else:
+ ... attrs.pop((None, u'target'), None)
+ ... return attrs
+ ...
+ >>> linker = Linker(callbacks=[set_target])
+ >>> linker.linkify('abc http://example.com def')
+ u'abc http://example.com def'
Removing Attributes
@@ -89,25 +101,42 @@ You can easily remove attributes you don't want to allow, even on existing
links (```` tags) in the text. (See also :ref:`clean() ` for
sanitizing attributes.)
-::
+.. doctest::
- def allowed_attributes(attrs, new=False):
- """Only allow href, target, rel and title."""
- allowed = [(None, 'href'), (None, 'target'),
- (None, 'rel'), (None, 'title')]
- return dict((k, v) for k, v in attrs.items() if k in allowed)
+ >>> from bleach.linkifier import Linker
+
+ >>> def allowed_attrs(attrs, new=False):
+ ... """Only allow href, target, rel and title."""
+ ... allowed = [
+ ... (None, u'href'),
+ ... (None, u'target'),
+ ... (None, u'rel'),
+ ... (None, u'title'),
+ ... u'_text',
+ ... ]
+ ... return dict((k, v) for k, v in attrs.items() if k in allowed)
+ ...
+ >>> linker = Linker(callbacks=[allowed_attrs])
+ >>> linker.linkify('link')
+ u'link'
-Or you could remove a specific attribute, if it exists::
+Or you could remove a specific attribute, if it exists:
- def remove_title1(attrs, new=False):
- attrs.pop((None, 'title'), None)
- return attrs
+.. doctest::
+
+ >>> from bleach.linkifier import Linker
- def remove_title2(attrs, new=False):
- if (None, 'title') in attrs:
- del attrs[(None, 'title')]
- return attrs
+ >>> def remove_title(attrs, new=False):
+ ... attrs.pop((None, u'title'), None)
+ ... return attrs
+ ...
+ >>> linker = Linker(callbacks=[remove_title])
+ >>> linker.linkify('link')
+ u'link'
+
+ >>> linker.linkify('link')
+ u'link'
Altering Attributes
@@ -117,31 +146,50 @@ You can alter and overwrite attributes, including the link text, via the
``_text`` key, to, for example, pass outgoing links through a warning page, or
limit the length of text inside an ```` tag.
-::
+Example of shortening link text:
- def shorten_url(attrs, new=False):
- """Shorten overly-long URLs in the text."""
- if not new: # Only looking at newly-created links.
- return attrs
+.. doctest::
+
+ >>> from bleach.linkifier import Linker
+
+ >>> def shorten_url(attrs, new=False):
+ ... """Shorten overly-long URLs in the text."""
+ ... # Only adjust newly-created links
+ ... if not new:
+ ... return attrs
+ ... # _text will be the same as the URL for new links
+ ... text = attrs[u'_text']
+ ... if len(text) > 25:
+ ... attrs[u'_text'] = text[0:22] + u'...'
+ ... return attrs
+ ...
+ >>> linker = Linker(callbacks=[shorten_url])
+ >>> linker.linkify('http://example.com/longlonglonglonglongurl')
+ u'http://example.com/lon...'
- # _text will be the same as the URL for new links.
- text = attrs['_text']
- if len(text) > 25:
- attrs['_text'] = text[0:22] + '...'
- return attrs
-::
+Example of switching all links to go through a bouncer first:
- from urllib2 import quote
- from urlparse import urlparse
+.. doctest::
- def outgoing_bouncer(attrs, new=False):
- """Send outgoing links through a bouncer."""
- p = urlparse((None, attrs['href']))
- if p.netloc not in ['my-domain.com', 'www.my-domain.com', '']:
- bouncer = 'http://outgoing.my-domain.com/?destination=%s'
- attrs[(None, 'href')] = bouncer % quote(attrs['href'])
- return attrs
+ >>> from six.moves.urllib.parse import quote, urlparse
+ >>> from bleach.linkifier import Linker
+
+ >>> def outgoing_bouncer(attrs, new=False):
+ ... """Send outgoing links through a bouncer."""
+ ... href_key = (None, u'href')
+ ... p = urlparse(attrs.get(href_key, None))
+ ... if p.netloc not in ['example.com', 'www.example.com', '']:
+ ... bouncer = 'http://bn.ce/?destination=%s'
+ ... attrs[href_key] = bouncer % quote(attrs[href_key])
+ ... return attrs
+ ...
+ >>> linker = Linker(callbacks=[outgoing_bouncer])
+ >>> linker.linkify('http://example.com')
+ u'http://example.com'
+
+ >>> linker.linkify('http://foo.com')
+ u'http://foo.com'
Preventing Links
@@ -151,33 +199,53 @@ A slightly more complex example is inspired by Crate_, where strings like
``models.py`` are often found, and linkified. ``.py`` is the ccTLD for
Paraguay, so ``example.py`` may be a legitimate URL, but in the case of a site
dedicated to Python packages, odds are it is not. In this case, Crate_ could
-write the following callback::
+write the following callback:
- def dont_linkify_python(attrs, new=False):
- if not new: # This is an existing tag, leave it be.
- return attrs
+.. doctest::
- # If the TLD is '.py', make sure it starts with http: or https:
- href = attrs[(None, 'href')]
- if href.endswith('.py') and not href.startswith(('http:', 'https:')):
- # This looks like a Python file, not a URL. Don't make a link.
- return None
+ >>> from bleach.linkifier import Linker
+
+ >>> def dont_linkify_python(attrs, new=False):
+ ... # This is an existing link, so leave it be
+ ... if not new:
+ ... return attrs
+ ... # If the TLD is '.py', make sure it starts with http: or https:.
+ ... # Use _text because that's the original text
+ ... link_text = attrs[u'_text']
+ ... if link_text.endswith('.py') and not link_text.startswith(('http:', 'https:')):
+ ... # This looks like a Python file, not a URL. Don't make a link.
+ ... return None
+ ... # Everything checks out, keep going to the next callback.
+ ... return attrs
+ ...
+ >>> linker = Linker(callbacks=[dont_linkify_python])
+ >>> linker.linkify('abc http://example.com def')
+ u'abc http://example.com def'
- # Everything checks out, keep going to the next callback.
- return attrs
+ >>> linker.linkify('abc models.py def')
+ u'abc models.py def'
Removing Links
--------------
If you want to remove certain links, even if they are written in the text with
-```` tags, you can still return ``None``::
+```` tags, have the callback return ``None``.
- def remove_mailto(attrs, new=False):
- """Remove any mailto: links."""
- if attrs[(None, 'href')].startswith('mailto:'):
- return None
- return attrs
+For example, this removes any ``mailto:`` links:
+
+.. doctest::
+
+ >>> from bleach.linkifier import Linker
+
+ >>> def remove_mailto(attrs, new=False):
+ ... if attrs[(None, u'href')].startswith(u'mailto:'):
+ ... return None
+ ... return attrs
+ ...
+ >>> linker = Linker(callbacks=[remove_mailto])
+ >>> linker.linkify('mail janet!')
+ u'mail janet!'
Skipping links in pre blocks (``skip_pre``)
@@ -194,11 +262,31 @@ any new links within a ```` section, pass ``skip_pre=True``.
Linkifying email addresses (``parse_email``)
============================================
-By default, ``linkify()`` does not create ``mailto:`` links for email
-addresses, but if you pass ``parse_email=True``, it will. ``mailto:`` links
-will go through exactly the same set of callbacks as all other links, whether
-they are newly created or already in the text, so be careful when writing
-callbacks that may need to behave differently if the protocol is ``mailto:``.
+By default, :py:func:`bleach.linkify` does not create ``mailto:`` links for
+email addresses, but if you pass ``parse_email=True``, it will. ``mailto:``
+links will go through exactly the same set of callbacks as all other links,
+whether they are newly created or already in the text, so be careful when
+writing callbacks that may need to behave differently if the protocol is
+``mailto:``.
+
+
+Using ``bleach.linkifier.Linker``
+=================================
+
+If you're linking a lot of text and passing the same argument values or you want
+more configurability, consider using a :py:class:`bleach.linkifier.Linker`
+instance.
+
+.. doctest::
+
+ >>> from bleach.linkifier import Linker
+
+ >>> linker = Linker(skip_pre=True)
+ >>> linker.linkify('a b c http://example.com d e f')
+ u'a b c http://example.com d e f'
+
+
+.. autoclass:: bleach.linkifier.Linker
Using ``bleach.linkifier.LinkifyFilter``
@@ -235,7 +323,7 @@ And passing parameters to ``LinkifyFilter``:
>>> from functools import partial
- >>> from bleach import Cleaner
+ >>> from bleach.sanitizer import Cleaner
>>> from bleach.linkifier import LinkifyFilter
>>> cleaner = Cleaner(
diff --git a/tests/test_basics.py b/tests/test_basics.py
index e3f5d2da..bff29c0f 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -3,6 +3,7 @@
import six
import bleach
+from bleach.sanitizer import Cleaner
class TestClean:
@@ -291,8 +292,11 @@ def __iter__(self):
}
TAGS = ['img']
dirty = 'this is cute!
'
+
+ cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter])
+
assert (
- bleach.clean(dirty, tags=TAGS, attributes=ATTRS, filters=[MooFilter]) ==
+ cleaner.clean(dirty) ==
'this is cute!
'
)
@@ -302,7 +306,7 @@ def test_basics(self):
TAGS = ['span', 'br']
ATTRS = {'span': ['style']}
- cleaner = bleach.Cleaner(tags=TAGS, attributes=ATTRS)
+ cleaner = Cleaner(tags=TAGS, attributes=ATTRS)
assert (
cleaner.clean('a
test') ==
diff --git a/tests/test_links.py b/tests/test_links.py
index 8e166543..e602abd4 100644
--- a/tests/test_links.py
+++ b/tests/test_links.py
@@ -1,3 +1,4 @@
+import re
try:
from urllib.parse import quote_plus
except ImportError:
@@ -6,13 +7,7 @@
import pytest
from bleach import linkify, DEFAULT_CALLBACKS as DC
-from bleach.linkifier import url_re
-
-
-def test_url_re():
- text = 'just what i am looking for...it'
- match = url_re.search(text)
- assert not match, 'matched {0!s}'.format(text[slice(*match.span())])
+from bleach.linkifier import Linker
def test_empty():
@@ -540,8 +535,7 @@ def test_link_emails_and_urls():
def test_links_case_insensitive():
"""Protocols and domain names are case insensitive."""
- expect = (''
- 'HTTP://EXAMPLE.COM')
+ expect = 'HTTP://EXAMPLE.COM'
assert linkify('HTTP://EXAMPLE.COM') == expect
@@ -599,3 +593,35 @@ def test_hang():
linkify("an@email.com", parse_email=True) ==
'an@email.com '
)
+
+
+def test_url_re_arg():
+ """Verifies that a specified url_re is used"""
+ fred_re = re.compile(r"""(fred\.com)""")
+
+ linker = Linker(url_re=fred_re)
+ assert (
+ linker.linkify('a b c fred.com d e f') ==
+ 'a b c fred.com d e f'
+ )
+
+ assert (
+ linker.linkify('a b c http://example.com d e f') ==
+ 'a b c http://example.com d e f'
+ )
+
+
+def test_email_re_arg():
+ """Verifies that a specified email_re is used"""
+ fred_re = re.compile(r"""(fred@example\.com)""")
+
+ linker = Linker(parse_email=True, email_re=fred_re)
+ assert (
+ linker.linkify('a b c fred@example.com d e f') ==
+ 'a b c fred@example.com d e f'
+ )
+
+ assert (
+ linker.linkify('a b c jim@example.com d e f') ==
+ 'a b c jim@example.com d e f'
+ )
From 975091d0ba9c9ed4000d0c457eddbd03178ab44e Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 6 Mar 2017 14:12:49 -0500
Subject: [PATCH 080/314] Minor fixes
---
CHANGES | 6 +++---
bleach/sanitizer.py | 6 +-----
2 files changed, 4 insertions(+), 8 deletions(-)
diff --git a/CHANGES b/CHANGES
index 9afe859f..3ff2b789 100644
--- a/CHANGES
+++ b/CHANGES
@@ -44,7 +44,7 @@ Version 2.0 (in development)
def check_protocol(attrs, is_new):
if not attrs.get((None, u'href'), u'').startswith(('http:', 'https:')):
- # ^^^^^^^^^^^^^^
+ # ^^^^^^^^^^^^^^^
return None
return attrs
@@ -65,8 +65,8 @@ Version 2.0 (in development)
favorite linkify settings for easy reuse.
* There's a ``bleach.linkifier.LinkifyFilter`` which is an htm5lib filter that
- you can pass as a filter to ``bleach.Cleaner`` allowing you to clean and
- linkify in one pass.
+ you can pass as a filter to ``bleach.sanitizer.Cleaner`` allowing you to clean
+ and linkify in one pass.
* Tons of bug fixes.
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index fcbcd915..06c90665 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -36,10 +36,6 @@
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
-ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
-# a simple routine that returns the tag name with the namespace prefix
-# as returned by etree's Element.tag attribute
-
class Cleaner(object):
"""Cleaner for cleaning HTML fragments of malicious content
@@ -53,7 +49,7 @@ class Cleaner(object):
To use::
- from bleach import Cleaner
+ from bleach.sanitizer import Cleaner
cleaner = Cleaner()
From ef442862f6b0ee64e88570705bf4d287f80669c8 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 6 Mar 2017 16:33:23 -0500
Subject: [PATCH 081/314] More test cleanup
* move linkify tests to test_links.py
* remove tests that are related to the previous implementation
---
tests/test_basics.py | 48 ++++++++------------------------------------
tests/test_links.py | 35 +++++++++++++++++++-------------
2 files changed, 29 insertions(+), 54 deletions(-)
diff --git a/tests/test_basics.py b/tests/test_basics.py
index bff29c0f..031ab66d 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -140,8 +140,7 @@ def test_stripping(self):
'multiply nested text
'
)
- s = ('')
+ s = ''
assert (
bleach.clean(s, tags=['p', 'a'], strip=True) ==
''
@@ -301,6 +300,13 @@ def __iter__(self):
)
+def test_clean_idempotent():
+ """Make sure that applying the filter twice doesn't change anything."""
+ dirty = 'invalid & < extra http://link.com'
+
+ assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty)
+
+
class TestCleaner:
def test_basics(self):
TAGS = ['span', 'br']
@@ -312,41 +318,3 @@ def test_basics(self):
cleaner.clean('a
test') ==
'a
test'
)
-
-
-class TestLinkify:
- def test_no_href_links(self):
- s = 'x'
- assert bleach.linkify(s) == s
-
- def test_rel_already_there(self):
- """Make sure rel attribute is updated not replaced"""
- linked = ('Click '
- 'here.')
-
- link_good = 'Click here.'
-
- assert bleach.linkify(linked) == link_good
- assert bleach.linkify(link_good) == link_good
-
-
-def test_idempotent():
- """Make sure that applying the filter twice doesn't change anything."""
- dirty = 'invalid & < extra http://link.com'
-
- clean = bleach.clean(dirty)
- assert bleach.clean(clean) == clean
-
- linked = bleach.linkify(dirty)
- assert (
- bleach.linkify(linked) ==
- 'invalid & < extra http://link.com'
- )
-
-
-def test_serializer():
- s = '
'
- assert bleach.clean(s, tags=['table']) == s
- assert bleach.linkify('test
') == 'test
'
- assert bleach.clean('test
', tags=['p']) == 'test
'
diff --git a/tests/test_links.py b/tests/test_links.py
index e602abd4..28b6ad6d 100644
--- a/tests/test_links.py
+++ b/tests/test_links.py
@@ -515,12 +515,6 @@ def test_ignore_bad_protocols():
)
-def test_max_recursion_depth():
- """If we hit the max recursion depth, just return the string."""
- test = '' * 2000 + 'foo' + '' * 2000
- assert linkify(test) == test
-
-
def test_link_emails_and_urls():
"""parse_email=True shouldn't prevent URLs from getting linkified."""
assert (
@@ -551,14 +545,6 @@ def test_elements_inside_links():
)
-def test_remove_first_childlink():
- callbacks = [lambda *a: None]
- assert (
- linkify('', callbacks=callbacks) ==
- 'something
'
- )
-
-
def test_drop_link_tags():
"""Verify that dropping link tags *just* drops the tag and not the content"""
html = (
@@ -625,3 +611,24 @@ def test_email_re_arg():
linker.linkify('a b c jim@example.com d e f') ==
'a b c jim@example.com d e f'
)
+
+
+def test_linkify_idempotent():
+ dirty = 'invalid & < extra http://link.com'
+ assert linkify(linkify(dirty)) == linkify(dirty)
+
+
+class TestLinkify:
+ def test_no_href_links(self):
+ s = 'x'
+ assert linkify(s) == s
+
+ def test_rel_already_there(self):
+ """Make sure rel attribute is updated not replaced"""
+ linked = ('Click '
+ 'here.')
+
+ link_good = 'Click here.'
+
+ assert linkify(linked) == link_good
+ assert linkify(link_good) == link_good
From a08454cdfea4bd758deab28e19084ccaa7388e1c Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 6 Mar 2017 20:17:10 -0500
Subject: [PATCH 082/314] Rework attributes value and filters
This reworks how attributes argument works. Callables now take three arguments:
tag, attribute name and attribute value. Callables can be passed in as the
attributes argument value or as a value for any of the tags in the dict.
This also reworks the implementation so the complexity of the different shapes
is shuffled away out of ``allow_token`` which simplifies it a bit.
---
CHANGES | 10 +++++
bleach/sanitizer.py | 65 ++++++++++++++++++++++--------
docs/clean.rst | 90 ++++++++++++++++++++++++++----------------
tests/test_basics.py | 59 +++++++++++++++++++++++----
tests/test_security.py | 2 +-
5 files changed, 166 insertions(+), 60 deletions(-)
diff --git a/CHANGES b/CHANGES
index 3ff2b789..79f56a9e 100644
--- a/CHANGES
+++ b/CHANGES
@@ -25,6 +25,12 @@ Version 2.0 (in development)
Amongst other things, this version will add end tags even if the tag in
question is to be escaped.
+* ``bleach.clean`` and friends attribute callables now take three arguments:
+ tag, attribute name and attribute value. Previously they only took attribute
+ name and attribute value.
+
+ All attribute callables will need to be updated.
+
* ``bleach.linkify`` was rewritten
``linkify`` was reimplemented as an html5lib Filter. As such, it no longer
@@ -52,6 +58,8 @@ Version 2.0 (in development)
don't then html5lib will raise an assertion error that the value is not
unicode.
+ All linkify filters will need to be updated.
+
**Changes**
* Supports Python 3.6.
@@ -68,6 +76,8 @@ Version 2.0 (in development)
you can pass as a filter to ``bleach.sanitizer.Cleaner`` allowing you to clean
and linkify in one pass.
+* ``bleach.clean`` and friends can now take a callable as an attributes arg value.
+
* Tons of bug fixes.
* Cleaned up tests.
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 06c90665..1223e79b 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -128,7 +128,7 @@ def clean(self, text):
source=self.walker(dom),
# Bleach-sanitizer-specific things
- allowed_attributes_map=self.attributes,
+ attributes=self.attributes,
strip_disallowed_elements=self.strip,
strip_html_comments=self.strip_comments,
@@ -146,22 +146,58 @@ def clean(self, text):
return self.serializer.render(filtered)
+def attribute_filter_factory(attributes):
+ """Generates attribute filter function for the given attributes value
+
+ The attributes value can take one of several shapes. This returns a filter
+ function appropriate to the attributes value. One nice thing about this is
+ that there's less if/then shenanigans in the ``allow_token`` method.
+
+ """
+ if callable(attributes):
+ return attributes
+
+ if isinstance(attributes, dict):
+ def _attr_filter(tag, attr, value):
+ if tag in attributes:
+ attr_val = attributes[tag]
+ if callable(attr_val):
+ return attr_val(tag, attr, value)
+
+ if attr in attr_val:
+ return True
+
+ if '*' in attributes:
+ attr_val = attributes['*']
+ if callable(attr_val):
+ return attr_val(tag, attr, value)
+
+ return attr in attr_val
+
+ return False
+
+ return _attr_filter
+
+ if isinstance(attributes, list):
+ def _attr_filter(tag, attr, value):
+ return attr in attributes
+
+ return _attr_filter
+
+ raise ValueError('attributes needs to be a callable, a list or a dict')
+
+
class BleachSanitizerFilter(sanitizer.Filter):
"""html5lib Filter that sanitizes text
This filter can be used anywhere html5lib filters can be used.
"""
- def __init__(self, source, allowed_attributes_map,
+ def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
strip_disallowed_elements=False, strip_html_comments=True,
**kwargs):
- if isinstance(allowed_attributes_map, dict):
- self.wildcard_attributes = allowed_attributes_map.get('*', [])
- self.allowed_attributes_map = allowed_attributes_map
- else:
- self.wildcard_attributes = allowed_attributes_map
- self.allowed_attributes_map = {}
+ self.attr_filter = attribute_filter_factory(attributes)
self.strip_disallowed_elements = strip_disallowed_elements
self.strip_html_comments = strip_html_comments
@@ -205,10 +241,6 @@ def sanitize_token(self, token):
def allow_token(self, token):
"""Handles the case where we're allowing the tag"""
if 'data' in token:
- allowed_attributes = self.allowed_attributes_map.get(token['name'], [])
- if not callable(allowed_attributes):
- allowed_attributes += self.wildcard_attributes
-
# Loop through all the attributes and drop the ones that are not
# allowed, are unsafe or break other rules. Additionally, fix
# attribute values that need fixing.
@@ -220,11 +252,10 @@ def allow_token(self, token):
namespace, name = namespaced_name
# Drop attributes that are not explicitly allowed
- if callable(allowed_attributes):
- if not allowed_attributes(name, val):
- continue
-
- elif name not in allowed_attributes:
+ #
+ # NOTE(willkg): We pass in the attribute name--not a namespaced
+ # name.
+ if not self.attr_filter(token['name'], name, val):
continue
# Look at attributes that have uri values
diff --git a/docs/clean.rst b/docs/clean.rst
index 161e4357..b02c4525 100644
--- a/docs/clean.rst
+++ b/docs/clean.rst
@@ -55,8 +55,8 @@ The default value is also a conservative dict found in
As a list
---------
-The ``attributes`` value can be a list, in which case the attributes are allowed
-for any tag.
+The ``attributes`` value can be a list which specifies the list of attributes
+allowed for any tag.
For example:
@@ -76,10 +76,12 @@ For example:
As a dict
---------
-The ``attributes`` value can be a dict, in which case the keys are tag names (or
-a wildcard: ``*`` for all tags) and the values are lists of allowed attributes.
+The ``attributes`` value can be a dict which maps tags to what attributes they can have.
-For example:
+You can also specify ``*``, which will match any tag.
+
+For example, this allows "href" and "rel" for "a" tags, "alt" for the "img" tag
+and "class" for any tag (including "a" and "img"):
.. doctest::
@@ -99,48 +101,66 @@ For example:
u'
'
-In this case, ``class`` is allowed on any allowed element (from the ``tags``
-argument), ```` tags are allowed to have ``href`` and ``rel`` attributes,
-and so on.
-
-
Using functions
---------------
-You can also use callables. If the callable returns ``True``, the attribute is
-allowed. Otherwise, it is stripped. For example:
+You can also use callables that take the tag, attribute name and attribute value
+and returns ``True`` to keep the attribute or ``False`` to drop it.
+
+You can pass a callable as the attributes argument value and it'll run for
+every tag/attr.
+
+For example:
+
+.. doctest::
+
+ >>> import bleach
+
+ >>> def allow_h(tag, name, value):
+ ... return name[0] == 'h'
+
+ >>> bleach.clean(
+ ... u'link',
+ ... tags=['a'],
+ ... attributes=allow_h,
+ ... )
+ u'link'
+
+
+You can also pass a callable as a value in an attributes dict and it'll run for
+attributes for specified tags:
.. doctest::
- >>> from urlparse import urlparse
- >>> import bleach
+ >>> from urlparse import urlparse
+ >>> import bleach
- >>> def allow_src(name, value):
- ... if name in ('alt', 'height', 'width'):
- ... return True
- ... if name == 'src':
- ... p = urlparse(value)
- ... return (not p.netloc) or p.netloc == 'mydomain.com'
- ... return False
+ >>> def allow_src(tag, name, value):
+ ... if name in ('alt', 'height', 'width'):
+ ... return True
+ ... if name == 'src':
+ ... p = urlparse(value)
+ ... return (not p.netloc) or p.netloc == 'mydomain.com'
+ ... return False
- >>> bleach.clean(
- ... u'
',
- ... tags=['img'],
- ... attributes={
- ... 'img': allow_src
- ... }
- ... )
- u'
'
+ >>> bleach.clean(
+ ... u'
',
+ ... tags=['img'],
+ ... attributes={
+ ... 'img': allow_src
+ ... }
+ ... )
+ u'
'
Allowed styles (``styles``)
===========================
-If you allow the ``style`` attribute, you will also need to whitelist styles
-users are allowed to set, for example ``color`` and ``background-color``.
+If you allow the ``style`` attribute, you will also need to specify the allowed
+styles users are allowed to set, for example ``color`` and ``background-color``.
-The default value is an empty list, i.e., the ``style`` attribute will be
-allowed but no values will be.
+The default value is an empty list. In other words, the ``style`` attribute will
+be allowed but no style declaration names will be allowed.
For example, to allow users to set the color and font-weight of text:
@@ -205,8 +225,8 @@ Default protocols are in ``bleach.ALLOWED_PROTOCOLS``.
Stripping markup (``strip``)
============================
-By default, Bleach *escapes* tags that aren't specified in the tags
-whitelist and invalid markup. For example:
+By default, Bleach *escapes* tags that aren't specified in the allowed tags list
+and invalid markup. For example:
.. doctest::
diff --git a/tests/test_basics.py b/tests/test_basics.py
index 031ab66d..5b59ebf9 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -164,23 +164,46 @@ def test_lowercase_html(self):
clean = 'BAR'
assert bleach.clean(dirty, attributes=['class']) == clean
- def test_wildcard_attributes(self):
+ def test_attributes_callable(self):
+ """Verify attributes can take a callable"""
+ ATTRS = lambda tag, name, val: name == 'title'
+ TAGS = ['a']
+
+ assert (
+ bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
+ u'example'
+ )
+
+ def test_attributes_wildcard(self):
+ """Verify attributes[*] works"""
ATTRS = {
'*': ['id'],
'img': ['src'],
}
- TAG = ['img', 'em']
+ TAGS = ['img', 'em']
dirty = ('both can have '
'
')
assert (
- bleach.clean(dirty, tags=TAG, attributes=ATTRS) ==
+ bleach.clean(dirty, tags=TAGS, attributes=ATTRS) ==
'both can have
'
)
- def test_callable_attributes(self):
- """Verify callable attributes work and get correct arg values"""
- def img_test(attr, val):
- return attr == 'src' and val.startswith('https')
+ def test_attributes_wildcard_callable(self):
+ """Verify attributes[*] callable works"""
+ ATTRS = {
+ '*': lambda tag, name, val: name == 'title'
+ }
+ TAGS = ['a']
+
+ assert (
+ bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
+ u'example'
+ )
+
+ def test_attributes_tag_callable(self):
+ """Verify attributes[tag] callable works"""
+ def img_test(tag, name, val):
+ return name == 'src' and val.startswith('https')
ATTRS = {
'img': img_test,
@@ -198,6 +221,28 @@ def img_test(attr, val):
u'foo
baz'
)
+ def test_attributes_tag_list(self):
+ """Verify attributes[tag] list works"""
+ ATTRS = {
+ 'a': ['title']
+ }
+ TAGS = ['a']
+
+ assert (
+ bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
+ u'example'
+ )
+
+ def test_attributes_list(self):
+ """Verify attributes list works"""
+ ATTRS = ['title']
+ TAGS = ['a']
+
+ assert (
+ bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
+ u'example'
+ )
+
def test_svg_attr_val_allows_ref(self):
"""Unescape values in svg attrs that allow url references"""
# Local IRI, so keep it
diff --git a/tests/test_security.py b/tests/test_security.py
index 2aac0200..da0fe92f 100644
--- a/tests/test_security.py
+++ b/tests/test_security.py
@@ -75,7 +75,7 @@ def test_invalid_href_attr():
def test_invalid_filter_attr():
IMG = ['img', ]
IMG_ATTR = {
- 'img': lambda attr, val: attr == 'src' and val == "http://example.com/"
+ 'img': lambda tag, name, val: name == 'src' and val == "http://example.com/"
}
assert (
From 2cedde71bfa263ccf0ce76f630468912aa3f212f Mon Sep 17 00:00:00 2001
From: "Alexandr N. Zamaraev"
Date: Tue, 21 Feb 2017 23:49:19 +0700
Subject: [PATCH 083/314] Correct duplicates in email_re
see #247
---
bleach/linkifier.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/bleach/linkifier.py b/bleach/linkifier.py
index 1396f056..92351be4 100644
--- a/bleach/linkifier.py
+++ b/bleach/linkifier.py
@@ -64,10 +64,10 @@ def build_url_re(tlds=TLDS, protocols=allowed_protocols):
EMAIL_RE = re.compile(
r"""(?
Date: Wed, 22 Feb 2017 12:06:41 +0700
Subject: [PATCH 084/314] Add test incorrect email
---
tests/test_links.py | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/tests/test_links.py b/tests/test_links.py
index 28b6ad6d..99b30b89 100644
--- a/tests/test_links.py
+++ b/tests/test_links.py
@@ -109,6 +109,13 @@ def ft(attrs, new=False):
True,
'mailto james@example.com.au.'
),
+ # Incorrect email
+ (
+ '"\\\n"@opa.ru',
+ True,
+ '"\\\n"@opa.ru'
+ ),
+
])
def test_email_link(data, parse_email, expected):
assert linkify(data, parse_email=parse_email) == expected
From 9a617a52d6b5e81bd7ca8407f1e0810fc412cc2a Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 6 Mar 2017 21:50:30 -0500
Subject: [PATCH 085/314] Change skip_pre to the more general skip_tags
This changes skip_pre to a more general skip_tags that lets you skip linkifying
in a specified list of tags--not just pre.
---
CHANGES | 17 +++++++++
README.rst | 2 +-
bleach/__init__.py | 18 +++++----
bleach/linkifier.py | 89 ++++++++++++++++++++++++++++++++++++---------
bleach/sanitizer.py | 30 ++++++++++++---
docs/goals.rst | 22 +++++------
docs/linkify.rst | 14 +++----
setup.py | 4 +-
tests/test_links.py | 10 ++---
9 files changed, 149 insertions(+), 57 deletions(-)
diff --git a/CHANGES b/CHANGES
index 79f56a9e..050f4fc1 100644
--- a/CHANGES
+++ b/CHANGES
@@ -60,6 +60,23 @@ Version 2.0 (in development)
All linkify filters will need to be updated.
+* ``bleach.linkify`` and friends had a ``skip_pre`` argument--that's been
+ replaced with a more general ``skip_tags`` argument.
+
+ Before, you might do::
+
+ bleach.linkify(some_text, skip_pre=True)
+
+ The equivalent with Bleach 2.0 is::
+
+ bleach.linkify(some_text, skip_tags=['pre'])
+
+ You can skip other tags, too, like ``style`` or ``script`` or other places
+ where you don't want linkification happening.
+
+ All uses of linkify that use ``skip_pre`` will need to be updated.
+
+
**Changes**
* Supports Python 3.6.
diff --git a/README.rst b/README.rst
index 403ff9b6..08dd886a 100644
--- a/README.rst
+++ b/README.rst
@@ -8,7 +8,7 @@ Bleach
.. image:: https://badge.fury.io/py/bleach.svg
:target: http://badge.fury.io/py/bleach
-Bleach is a whitelist-based HTML sanitizing library that escapes or strips
+Bleach is an allowed-list-based HTML sanitizing library that escapes or strips
markup and attributes.
Bleach can also linkify text safely, applying filters that Django's ``urlize``
diff --git a/bleach/__init__.py b/bleach/__init__.py
index 07b5075c..a231f136 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -47,16 +47,16 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
:arg str text: the text to clean
- :arg list tags: whitelist of allowed tags; defaults to
+ :arg list tags: allowed list of tags; defaults to
``bleach.ALLOWED_TAGS``
- :arg dict attributes: whitelist of allowed attributes; defaults to
- ``bleach.ALLOWED_ATTRIBUTES``
+ :arg dict attributes: allowed attributes; can be a callable, list or dict;
+ defaults to ``bleach.ALLOWED_ATTRIBUTES``
- :arg list styles: whitelist of allowed css; defaults to
+ :arg list styles: allowed list of css styles; defaults to
``bleach.ALLOWED_STYLES``
- :arg list protocols: whitelist of allowed protocols for links; defaults
+ :arg list protocols: allowed list of protocols for links; defaults
to ``bleach.ALLOWED_PROTOCOLS``
:arg bool strip: whether or not to strip disallowed elements
@@ -77,7 +77,7 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
return cleaner.clean(text)
-def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False):
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
"""Convert URL-like strings in an HTML fragment to links
This function converts strings that look like URLs, domain names and email
@@ -106,7 +106,9 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False
:arg list callbacks: list of callbacks to run when adjusting tag attributes
- :arg bool skip_pre: whether or not to skip linkifying text in a ``pre`` tag
+ :arg list skip_tags: list of tags that you don't want to linkify the
+ contents of; for example, you could set this to ``['pre']`` to skip
+ linkifying contents of ``pre`` tags
:arg bool parse_email: whether or not to linkify email addresses
@@ -115,7 +117,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False
"""
linker = Linker(
callbacks=callbacks,
- skip_pre=skip_pre,
+ skip_tags=skip_tags,
parse_email=parse_email
)
return linker.linkify(text)
diff --git a/bleach/linkifier.py b/bleach/linkifier.py
index 92351be4..6103e81e 100644
--- a/bleach/linkifier.py
+++ b/bleach/linkifier.py
@@ -74,10 +74,40 @@ def build_url_re(tlds=TLDS, protocols=allowed_protocols):
class Linker(object):
- def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parse_email=False,
+ """Convert URL-like strings in an HTML fragment to links
+
+ This function converts strings that look like URLs, domain names and email
+ addresses in text that may be an HTML fragment to links, while preserving:
+
+ 1. links already in the string
+ 2. urls found in attributes
+ 3. email addresses
+
+ linkify does a best-effort approach and tries to recover from bad
+ situations due to crazy text.
+
+ """
+ def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
url_re=URL_RE, email_re=EMAIL_RE):
+ """Creates a Linker instance
+
+ :arg list callbacks: list of callbacks to run when adjusting tag attributes
+
+ :arg list skip_tags: list of tags that you don't want to linkify the
+ contents of; for example, you could set this to ``['pre']`` to skip
+ linkifying contents of ``pre`` tags
+
+ :arg bool parse_email: whether or not to linkify email addresses
+
+ :arg re url_re: url matching regex
+
+ :arg email_re: email matching regex
+
+ :returns: linkified text as unicode
+
+ """
self.callbacks = callbacks
- self.skip_pre = skip_pre
+ self.skip_tags = skip_tags
self.parse_email = parse_email
self.url_re = url_re
self.email_re = email_re
@@ -105,7 +135,7 @@ def linkify(self, text):
filtered = LinkifyFilter(
source=self.walker(dom),
callbacks=self.callbacks,
- skip_pre=self.skip_pre,
+ skip_tags=self.skip_tags,
parse_email=self.parse_email,
url_re=self.url_re,
email_re=self.email_re,
@@ -126,12 +156,31 @@ class LinkifyFilter(Filter):
This filter can be used anywhere html5lib filters can be used.
"""
- def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False,
+ def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
url_re=URL_RE, email_re=EMAIL_RE):
+ """Creates a LinkifyFilter instance
+
+ :arg TreeWalker source: stream
+
+ :arg list callbacks: list of callbacks to run when adjusting tag attributes
+
+ :arg list skip_tags: list of tags that you don't want to linkify the
+ contents of; for example, you could set this to ``['pre']`` to skip
+ linkifying contents of ``pre`` tags
+
+ :arg bool parse_email: whether or not to linkify email addresses
+
+ :arg re url_re: url matching regex
+
+ :arg email_re: email matching regex
+
+ :returns: linkified text as unicode
+
+ """
super(LinkifyFilter, self).__init__(source)
self.callbacks = callbacks or []
- self.skip_pre = skip_pre
+ self.skip_tags = skip_tags or []
self.parse_email = parse_email
self.url_re = url_re
@@ -140,9 +189,15 @@ def __init__(self, source, callbacks=None, skip_pre=False, parse_email=False,
def apply_callbacks(self, attrs, is_new):
"""Given an attrs dict and an is_new bool, runs through callbacks
- Callbacks can return an adjusted attrs dict or None. In the case of
- None, we stop going through callbacks and return that and the link gets
- dropped.
+ Callbacks can return an adjusted attrs dict or ``None``. In the case of
+ ``None``, we stop going through callbacks and return that and the link
+ gets dropped.
+
+ :arg dict attrs: map of ``(namespace, name)`` -> ``value``
+
+ :arg bool is_new: whether or not this link was added by linkify
+
+ :returns: adjusted attrs dict or ``None``
"""
for cb in self.callbacks:
@@ -399,7 +454,7 @@ def handle_a_tag(self, token_buffer):
def __iter__(self):
in_a = False
- in_pre = False
+ in_skip_tag = None
token_buffer = []
@@ -425,10 +480,10 @@ def __iter__(self):
continue
elif token['type'] in ['StartTag', 'EmptyTag']:
- if token['name'] == 'pre' and self.skip_pre:
- # The "pre" tag starts a "special mode" where we don't linkify
- # anything.
- in_pre = True
+ if token['name'] in self.skip_tags:
+ # Skip tags start a "special mode" where we don't linkify
+ # anything until the end tag.
+ in_skip_tag = token['name']
elif token['name'] == 'a':
# The "a" tag is special--we switch to a slurp mode and
@@ -441,13 +496,13 @@ def __iter__(self):
# yet
continue
- elif in_pre and self.skip_pre:
+ elif in_skip_tag and self.skip_tags:
# NOTE(willkg): We put this clause here since in_a and
# switching in and out of in_a takes precedence.
- if token['type'] == 'EndTag' and token['name'] == 'pre':
- in_pre = False
+ if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
+ in_skip_tag = None
- elif not in_a and not in_pre and token['type'] == 'Characters':
+ elif not in_a and not in_skip_tag and token['type'] == 'Characters':
new_stream = iter([token])
if self.parse_email:
new_stream = self.handle_email_addresses(new_stream)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 1223e79b..b5c2fe95 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -63,16 +63,16 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
strip_comments=True, filters=None):
"""Initializes a Cleaner
- :arg tags: whitelist of allowed tags; defaults to
+ :arg list tags: allowed list of tags; defaults to
``bleach.ALLOWED_TAGS``
- :arg attributes: whitelist of allowed attributes; defaults to
- ``bleach.ALLOWED_ATTRIBUTES``
+ :arg dict attributes: allowed attributes; can be a callable, list or dict;
+ defaults to ``bleach.ALLOWED_ATTRIBUTES``
- :arg styles: whitelist of allowed css; defaults to
+ :arg list styles: allowed list of css styles; defaults to
``bleach.ALLOWED_STYLES``
- :arg protocols: whitelist of allowed protocols for links; defaults
+ :arg list protocols: allowed list of protocols for links; defaults
to ``bleach.ALLOWED_PROTOCOLS``
:arg strip: whether or not to strip disallowed elements
@@ -196,7 +196,27 @@ class BleachSanitizerFilter(sanitizer.Filter):
def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
strip_disallowed_elements=False, strip_html_comments=True,
**kwargs):
+ """Creates a BleachSanitizerFilter instance
+ :arg Treewalker source: stream
+
+ :arg list tags: allowed list of tags; defaults to
+ ``bleach.ALLOWED_TAGS``
+
+ :arg dict attributes: allowed attributes; can be a callable, list or dict;
+ defaults to ``bleach.ALLOWED_ATTRIBUTES``
+
+ :arg list styles: allowed list of css styles; defaults to
+ ``bleach.ALLOWED_STYLES``
+
+ :arg list protocols: allowed list of protocols for links; defaults
+ to ``bleach.ALLOWED_PROTOCOLS``
+
+ :arg strip_disallowed_elements: whether or not to strip disallowed elements
+
+ :arg strip_html_comments: whether or not to strip HTML comments
+
+ """
self.attr_filter = attribute_filter_factory(attributes)
self.strip_disallowed_elements = strip_disallowed_elements
diff --git a/docs/goals.rst b/docs/goals.rst
index 632c222c..015bc563 100644
--- a/docs/goals.rst
+++ b/docs/goals.rst
@@ -13,15 +13,15 @@ Goals
=====
-Always take a whitelist-based approach
---------------------------------------
+Always take an allowed-list-based approach
+------------------------------------------
-Bleach should always take a whitelist-based approach to allowing any kind of
-content or markup. Blacklisting is error-prone and not future proof.
+Bleach should always take an allowed-list-based approach to markup filtering.
+Specifying disallowed lists is error-prone and not future proof.
For example, you should have to opt-in to allowing the ``onclick`` attribute,
-not blacklist all the other ``on*`` attributes. Future versions of HTML may add
-new event handlers, like ``ontouch``, that old blacklists would not prevent.
+not opt-out of all the other ``on*`` attributes. Future versions of HTML may add
+new event handlers, like ``ontouch``, that old disallow lists would not prevent.
Main goal is to sanitize input of malicious content
@@ -39,8 +39,8 @@ Examples might include:
These examples, and others, are traditionally prone to security issues like XSS
or other script injection, or annoying issues like unclosed tags and invalid
-markup. Bleach will take a proactive, whitelist-only approach to allowing HTML
-content, and will use the HTML5 parsing algorithm to handle invalid markup.
+markup. Bleach will take a proactive, allowed-list-only approach to allowing
+HTML content, and will use the HTML5 parsing algorithm to handle invalid markup.
See the :ref:`chapter on clean() ` for more info.
@@ -52,7 +52,7 @@ The secondary goal of Bleach is to provide a mechanism for finding or altering
links (```` tags with ``href`` attributes, or things that look like URLs or
email addresses) in text.
-While Bleach itself will always operate on a whitelist-based security model,
+While Bleach itself will always operate on a allowed-list-based security model,
the :ref:`linkify() method ` is flexible enough to allow the
creation, alteration, and removal of links based on an extremely wide range of
use cases.
@@ -69,8 +69,8 @@ Sanitize complete HTML documents
--------------------------------
Once you're creating whole documents, you have to allow so many tags that a
-blacklist approach (e.g. forbidding `` test') ==
- 'a <script>safe()</script> test'
- )
- assert (
- bleach.clean('a test') ==
- 'a <style>body{}</style> test'
- )
-
- def test_bad_href(self):
- assert (
- bleach.clean('no link') ==
- 'no link'
- )
-
- def test_bare_entities(self):
- assert (
- bleach.clean('an & entity') ==
- 'an & entity'
- )
- assert (
- bleach.clean('an < entity') ==
- 'an < entity'
- )
-
- assert (
- bleach.clean('tag < and entity') ==
- 'tag < and entity'
- )
-
- assert (
- bleach.clean('&') ==
- '&'
- )
-
- def test_escaped_entities(self):
- s = '<em>strong</em>'
- assert bleach.clean(s) == s
-
- def test_weird_strings(self):
- s = '3'
- assert bleach.clean(s) == ''
-
- def test_stripping(self):
- assert (
- bleach.clean('a test with html tags', strip=True) ==
- 'a test with html tags'
- )
- assert (
- bleach.clean('a test with
html tags',
- strip=True) ==
- 'a test with html tags'
- )
-
- s = ''
- assert (
- bleach.clean(s, tags=['p'], strip=True) ==
- 'link text
'
- )
- s = 'multiply nested text
'
- assert (
- bleach.clean(s, tags=['p'], strip=True) ==
- 'multiply nested text
'
- )
-
- s = ''
- assert (
- bleach.clean(s, tags=['p', 'a'], strip=True) ==
- ''
- )
-
- def test_allowed_styles(self):
- ATTRS = ['style']
- STYLE = ['color']
- blank = ''
- s = ''
- assert bleach.clean('', attributes=ATTRS) == blank
- assert bleach.clean(s, attributes=ATTRS, styles=STYLE) == s
- assert (
- bleach.clean('', attributes=ATTRS, styles=STYLE) ==
- s
- )
-
- def test_lowercase_html(self):
- """We should output lowercase HTML."""
- dirty = 'BAR'
- clean = 'BAR'
- assert bleach.clean(dirty, attributes=['class']) == clean
-
- def test_attributes_callable(self):
- """Verify attributes can take a callable"""
- ATTRS = lambda tag, name, val: name == 'title'
- TAGS = ['a']
-
- assert (
- bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
- u'example'
- )
-
- def test_attributes_wildcard(self):
- """Verify attributes[*] works"""
- ATTRS = {
- '*': ['id'],
- 'img': ['src'],
- }
- TAGS = ['img', 'em']
- dirty = ('both can have '
- '
')
- assert (
- bleach.clean(dirty, tags=TAGS, attributes=ATTRS) ==
- 'both can have
'
- )
-
- def test_attributes_wildcard_callable(self):
- """Verify attributes[*] callable works"""
- ATTRS = {
- '*': lambda tag, name, val: name == 'title'
- }
- TAGS = ['a']
-
- assert (
- bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
- u'example'
- )
-
- def test_attributes_tag_callable(self):
- """Verify attributes[tag] callable works"""
- def img_test(tag, name, val):
- return name == 'src' and val.startswith('https')
-
- ATTRS = {
- 'img': img_test,
- }
- TAGS = ['img']
-
- assert (
- bleach.clean('foo
baz', tags=TAGS,
- attributes=ATTRS) ==
- u'foo
baz'
- )
- assert (
- bleach.clean('foo
baz', tags=TAGS,
- attributes=ATTRS) ==
- u'foo
baz'
- )
-
- def test_attributes_tag_list(self):
- """Verify attributes[tag] list works"""
- ATTRS = {
- 'a': ['title']
- }
- TAGS = ['a']
-
- assert (
- bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
- u'example'
- )
-
- def test_attributes_list(self):
- """Verify attributes list works"""
- ATTRS = ['title']
- TAGS = ['a']
-
- assert (
- bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
- u'example'
- )
-
- def test_svg_attr_val_allows_ref(self):
- """Unescape values in svg attrs that allow url references"""
- # Local IRI, so keep it
- text = ''
- TAGS = ['svg', 'rect']
- ATTRS = {
- 'rect': ['fill'],
- }
- assert (
- bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
- ''
- )
-
- # Non-local IRI, so drop it
- text = ''
- TAGS = ['svg', 'rect']
- ATTRS = {
- 'rect': ['fill'],
- }
- assert (
- bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
- ''
- )
-
- @pytest.mark.parametrize('text, expected', [
- (
- '',
- ''
- ),
- (
- '',
- # NOTE(willkg): Bug in html5lib serializer drops the xlink part
- ''
- ),
- ])
- def test_svg_allow_local_href(self, text, expected):
- """Keep local hrefs for svg elements"""
- TAGS = ['svg', 'pattern']
- ATTRS = {
- 'pattern': ['id', 'href'],
- }
- assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected
-
- @pytest.mark.parametrize('text, expected', [
- (
- '',
- ''
- ),
- (
- '',
- ''
- ),
- ])
- def test_svg_allow_local_href_nonlocal(self, text, expected):
- """Drop non-local hrefs for svg elements"""
- TAGS = ['svg', 'pattern']
- ATTRS = {
- 'pattern': ['id', 'href'],
- }
- assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected
-
- @pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API')
- def test_sarcasm(self):
- """Jokes should crash. """
- dirty = 'Yeah right '
- clean = 'Yeah right <sarcasm/>'
- assert bleach.clean(dirty) == clean
-
- def test_user_defined_protocols_valid(self):
- valid_href = 'allowed href'
- assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href
-
- def test_user_defined_protocols_invalid(self):
- invalid_href = 'invalid href'
- cleaned_href = 'invalid href'
- assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href
-
- def test_filters(self):
- # Create a Filter that changes all the attr values to "moo"
- class MooFilter(Filter):
- def __iter__(self):
- for token in Filter.__iter__(self):
- if token['type'] in ['StartTag', 'EmptyTag'] and token['data']:
- for attr, value in token['data'].items():
- token['data'][attr] = 'moo'
-
- yield token
-
- ATTRS = {
- 'img': ['rel', 'src']
- }
- TAGS = ['img']
- dirty = 'this is cute!
'
-
- cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter])
-
- assert (
- cleaner.clean(dirty) ==
- 'this is cute!
'
- )
-
-
-def test_clean_idempotent():
- """Make sure that applying the filter twice doesn't change anything."""
- dirty = 'invalid & < extra http://link.com'
-
- assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty)
-
-
-class TestCleaner:
- def test_basics(self):
- TAGS = ['span', 'br']
- ATTRS = {'span': ['style']}
-
- cleaner = Cleaner(tags=TAGS, attributes=ATTRS)
-
- assert (
- cleaner.clean('a
test') ==
- 'a
test'
- )
diff --git a/tests/test_clean.py b/tests/test_clean.py
new file mode 100644
index 00000000..a6a37557
--- /dev/null
+++ b/tests/test_clean.py
@@ -0,0 +1,404 @@
+from html5lib.filters.base import Filter
+import pytest
+import six
+
+import bleach
+from bleach.sanitizer import Cleaner
+
+
+def test_empty():
+ assert bleach.clean('') == ''
+
+
+def test_nbsp():
+ if six.PY3:
+ expected = '\xa0test string\xa0'
+ else:
+ expected = six.u('\\xa0test string\\xa0')
+
+ assert bleach.clean(' test string ') == expected
+
+
+def test_comments_only():
+ comment = ''
+ assert bleach.clean(comment) == ''
+ assert bleach.clean(comment, strip_comments=False) == comment
+
+ open_comment = ''.format(open_comment)
+ )
+
+
+def test_with_comments():
+ text = 'Just text'
+ assert bleach.clean(text) == 'Just text'
+ assert bleach.clean(text, strip_comments=False) == text
+
+
+def test_no_html():
+ assert bleach.clean('no html string') == 'no html string'
+
+
+def test_allowed_html():
+ assert (
+ bleach.clean('an allowed tag') ==
+ 'an allowed tag'
+ )
+ assert (
+ bleach.clean('another good tag') ==
+ 'another good tag'
+ )
+
+
+def test_bad_html():
+ assert (
+ bleach.clean('a fixed tag') ==
+ 'a fixed tag'
+ )
+
+
+def test_function_arguments():
+ TAGS = ['span', 'br']
+ ATTRS = {'span': ['style']}
+
+ text = 'a
test'
+ assert (
+ bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ 'a
test'
+ )
+
+
+def test_named_arguments():
+ ATTRS = {'a': ['rel', 'href']}
+
+ text = 'xx.com'
+ assert bleach.clean(text) == 'xx.com'
+ assert (
+ bleach.clean(text, attributes=ATTRS) ==
+ 'xx.com'
+ )
+
+
+def test_disallowed_html():
+ assert (
+ bleach.clean('a test') ==
+ 'a <script>safe()</script> test'
+ )
+ assert (
+ bleach.clean('a test') ==
+ 'a <style>body{}</style> test'
+ )
+
+
+def test_bad_href():
+ assert (
+ bleach.clean('no link') ==
+ 'no link'
+ )
+
+
+def test_bare_entities():
+ assert (
+ bleach.clean('an & entity') ==
+ 'an & entity'
+ )
+ assert (
+ bleach.clean('an < entity') ==
+ 'an < entity'
+ )
+
+ assert (
+ bleach.clean('tag < and entity') ==
+ 'tag < and entity'
+ )
+
+ assert (
+ bleach.clean('&') ==
+ '&'
+ )
+
+
+def test_escaped_entities():
+ s = '<em>strong</em>'
+ assert bleach.clean(s) == s
+
+
+def test_weird_strings():
+ s = '3'
+ assert bleach.clean(s) == ''
+
+
+def test_stripping():
+ text = 'a test with html tags'
+ assert (
+ bleach.clean(text, strip=True) ==
+ 'a test with html tags'
+ )
+
+ text = 'a test with
html tags'
+ assert (
+ bleach.clean(text, strip=True) ==
+ 'a test with html tags'
+ )
+
+ text = ''
+ assert (
+ bleach.clean(text, tags=['p'], strip=True) ==
+ 'link text
'
+ )
+ text = 'multiply nested text
'
+ assert (
+ bleach.clean(text, tags=['p'], strip=True) ==
+ 'multiply nested text
'
+ )
+
+ text = ''
+ assert (
+ bleach.clean(text, tags=['p', 'a'], strip=True) ==
+ ''
+ )
+
+
+def test_allowed_styles():
+ ATTRS = ['style']
+ STYLE = ['color']
+
+ assert (
+ bleach.clean('', attributes=ATTRS) ==
+ ''
+ )
+
+ text = ''
+ assert bleach.clean(text, attributes=ATTRS, styles=STYLE) == text
+
+ text = ''
+ assert (
+ bleach.clean(text, attributes=ATTRS, styles=STYLE) ==
+ ''
+ )
+
+
+def test_lowercase_html():
+ """We should output lowercase HTML."""
+ assert (
+ bleach.clean('BAR', attributes=['class']) ==
+ 'BAR'
+ )
+
+
+def test_attributes_callable():
+ """Verify attributes can take a callable"""
+ ATTRS = lambda tag, name, val: name == 'title'
+ TAGS = ['a']
+
+ text = u'example'
+ assert (
+ bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ u'example'
+ )
+
+
+def test_attributes_wildcard():
+ """Verify attributes[*] works"""
+ ATTRS = {
+ '*': ['id'],
+ 'img': ['src'],
+ }
+ TAGS = ['img', 'em']
+
+ text = 'both can have
'
+ assert (
+ bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ 'both can have
'
+ )
+
+
+def test_attributes_wildcard_callable():
+ """Verify attributes[*] callable works"""
+ ATTRS = {
+ '*': lambda tag, name, val: name == 'title'
+ }
+ TAGS = ['a']
+
+ assert (
+ bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
+ u'example'
+ )
+
+
+def test_attributes_tag_callable():
+ """Verify attributes[tag] callable works"""
+ def img_test(tag, name, val):
+ return name == 'src' and val.startswith('https')
+
+ ATTRS = {
+ 'img': img_test,
+ }
+ TAGS = ['img']
+
+ text = 'foo
baz'
+ assert (
+ bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ u'foo
baz'
+ )
+ text = 'foo
baz'
+ assert (
+ bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ u'foo
baz'
+ )
+
+
+def test_attributes_tag_list():
+ """Verify attributes[tag] list works"""
+ ATTRS = {
+ 'a': ['title']
+ }
+ TAGS = ['a']
+
+ assert (
+ bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
+ u'example'
+ )
+
+
+def test_attributes_list():
+ """Verify attributes list works"""
+ ATTRS = ['title']
+ TAGS = ['a']
+
+ text = u'example'
+ assert (
+ bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ u'example'
+ )
+
+
+def test_svg_attr_val_allows_ref():
+ """Unescape values in svg attrs that allow url references"""
+ # Local IRI, so keep it
+ TAGS = ['svg', 'rect']
+ ATTRS = {
+ 'rect': ['fill'],
+ }
+
+ text = ''
+ assert (
+ bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ ''
+ )
+
+ # Non-local IRI, so drop it
+ TAGS = ['svg', 'rect']
+ ATTRS = {
+ 'rect': ['fill'],
+ }
+ text = ''
+ assert (
+ bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ ''
+ )
+
+
+@pytest.mark.parametrize('text, expected', [
+ (
+ '',
+ ''
+ ),
+ (
+ '',
+ # NOTE(willkg): Bug in html5lib serializer drops the xlink part
+ ''
+ ),
+])
+def test_svg_allow_local_href(text, expected):
+ """Keep local hrefs for svg elements"""
+ TAGS = ['svg', 'pattern']
+ ATTRS = {
+ 'pattern': ['id', 'href'],
+ }
+ assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected
+
+
+@pytest.mark.parametrize('text, expected', [
+ (
+ '',
+ ''
+ ),
+ (
+ '',
+ ''
+ ),
+])
+def test_svg_allow_local_href_nonlocal(text, expected):
+ """Drop non-local hrefs for svg elements"""
+ TAGS = ['svg', 'pattern']
+ ATTRS = {
+ 'pattern': ['id', 'href'],
+ }
+ assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected
+
+
+@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API')
+def test_sarcasm():
+ """Jokes should crash. """
+ dirty = 'Yeah right '
+ clean = 'Yeah right <sarcasm/>'
+ assert bleach.clean(dirty) == clean
+
+
+def test_user_defined_protocols_valid():
+ valid_href = 'allowed href'
+ assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href
+
+
+def test_user_defined_protocols_invalid():
+ invalid_href = 'invalid href'
+ cleaned_href = 'invalid href'
+ assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href
+
+
+def test_filters():
+ # Create a Filter that changes all the attr values to "moo"
+ class MooFilter(Filter):
+ def __iter__(self):
+ for token in Filter.__iter__(self):
+ if token['type'] in ['StartTag', 'EmptyTag'] and token['data']:
+ for attr, value in token['data'].items():
+ token['data'][attr] = 'moo'
+
+ yield token
+
+ ATTRS = {
+ 'img': ['rel', 'src']
+ }
+ TAGS = ['img']
+
+ cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter])
+
+ dirty = 'this is cute!
'
+ assert (
+ cleaner.clean(dirty) ==
+ 'this is cute!
'
+ )
+
+
+def test_clean_idempotent():
+ """Make sure that applying the filter twice doesn't change anything."""
+ dirty = 'invalid & < extra http://link.com'
+ assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty)
+
+
+class TestCleaner:
+ def test_basics(self):
+ TAGS = ['span', 'br']
+ ATTRS = {'span': ['style']}
+
+ cleaner = Cleaner(tags=TAGS, attributes=ATTRS)
+
+ assert (
+ cleaner.clean('a
test') ==
+ 'a
test'
+ )
diff --git a/tests/test_links.py b/tests/test_linkify.py
similarity index 100%
rename from tests/test_links.py
rename to tests/test_linkify.py
From 205edc0094c3a5ad217d164048d57a22a69fed93 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Wed, 20 Sep 2017 09:00:30 -0400
Subject: [PATCH 117/314] Add code of conduct blurb, move some docs around
Fixes #313
---
CODE_OF_CONDUCT.rst | 9 +++++++++
README.rst | 41 ++++++++++++++++++++++++++---------------
docs/dev.rst | 6 ++++++
3 files changed, 41 insertions(+), 15 deletions(-)
create mode 100644 CODE_OF_CONDUCT.rst
diff --git a/CODE_OF_CONDUCT.rst b/CODE_OF_CONDUCT.rst
new file mode 100644
index 00000000..da20d8db
--- /dev/null
+++ b/CODE_OF_CONDUCT.rst
@@ -0,0 +1,9 @@
+Code of conduct
+===============
+
+This project and repository are governed by Mozilla's code of conduct and
+etiquette guidelines. For more details please see the `Mozilla Community
+Participation Guidelines
+ `_ and
+`Developer Etiquette Guidelines
+`_.
diff --git a/README.rst b/README.rst
index b728c292..863772e8 100644
--- a/README.rst
+++ b/README.rst
@@ -51,21 +51,6 @@ please read our wiki page at
``_.
-Security
-========
-
-Bleach is a security-related library.
-
-We have a responsible security vulnerability reporting process. Please use
-that if you're reporting a security issue.
-
-Security issues are fixed in private. After we land such a fix, we'll do a
-release.
-
-For every release, we mark security issues we've fixed in the ``CHANGES`` in
-the **Security issues** section. We include relevant CVE links.
-
-
Installing Bleach
=================
@@ -104,6 +89,32 @@ The simplest way to use Bleach is:
u'an http://example.com url
+Security
+========
+
+Bleach is a security-related library.
+
+We have a responsible security vulnerability reporting process. Please use
+that if you're reporting a security issue.
+
+Security issues are fixed in private. After we land such a fix, we'll do a
+release.
+
+For every release, we mark security issues we've fixed in the ``CHANGES`` in
+the **Security issues** section. We include relevant CVE links.
+
+
+Code of conduct
+===============
+
+This project and repository are governed by Mozilla's code of conduct and
+etiquette guidelines. For more details please see the `Mozilla Community
+Participation Guidelines
+ `_ and
+`Developer Etiquette Guidelines
+`_.
+
+
.. _html5lib: https://github.com/html5lib/html5lib-python
.. _GitHub: https://github.com/mozilla/bleach
.. _ReadTheDocs: https://bleach.readthedocs.io/
diff --git a/docs/dev.rst b/docs/dev.rst
index cfa0a8c7..98707048 100644
--- a/docs/dev.rst
+++ b/docs/dev.rst
@@ -19,6 +19,12 @@ To install Bleach to make changes to it:
$ pip install -e .
+.. include:: ../CONTRIBUTING.rst
+
+
+.. include:: ../CODE_OF_CONDUCT.rst
+
+
Docs
====
From 4c80d008059257a17af3982c1aba4a3b7879370b Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Wed, 20 Sep 2017 09:07:20 -0400
Subject: [PATCH 118/314] Change "Security issues" to "Security fixes"
This is clearer regarding the intent of that block.
---
CHANGES | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/CHANGES b/CHANGES
index 59db3338..5ea7aff9 100644
--- a/CHANGES
+++ b/CHANGES
@@ -4,7 +4,7 @@ Bleach Changes
Version 2.1 (in development)
----------------------------
-**Security issues**
+**Security fixes**
**Backwards incompatible changes**
@@ -40,7 +40,7 @@ Version 2.1 (in development)
Version 2.0 (March 8th, 2017)
-----------------------------
-**Security issues**
+**Security fixes**
* None
@@ -150,7 +150,7 @@ Version 2.0 (March 8th, 2017)
Version 1.5 (November 4th, 2016)
--------------------------------
-**Security issues**
+**Security fixes**
* None
@@ -192,7 +192,7 @@ Version 1.5 (November 4th, 2016)
Version 1.4.3 (May 23rd, 2016)
------------------------------
-**Security issues**
+**Security fixes**
* None
From 2a9854d9484797beeed1673454980404483774b3 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Wed, 20 Sep 2017 11:08:11 -0400
Subject: [PATCH 119/314] Fix test_websites to work with Python 3
---
tests_website/data_to_json.py | 2 +-
tests_website/open_test_page.py | 2 ++
tests_website/server.py | 16 ++++++++++------
3 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/tests_website/data_to_json.py b/tests_website/data_to_json.py
index ffd346f5..debe5a9d 100755
--- a/tests_website/data_to_json.py
+++ b/tests_website/data_to_json.py
@@ -50,4 +50,4 @@ def main():
if __name__ == '__main__':
- main()
\ No newline at end of file
+ main()
diff --git a/tests_website/open_test_page.py b/tests_website/open_test_page.py
index b812de92..79f4adf2 100755
--- a/tests_website/open_test_page.py
+++ b/tests_website/open_test_page.py
@@ -2,6 +2,7 @@
import webbrowser
+
TEST_BROWSERS = set([
# 'mozilla',
'firefox',
@@ -29,6 +30,7 @@
])
REGISTERED_BROWSERS = set(webbrowser._browsers.keys())
+
if __name__ == '__main__':
for b in TEST_BROWSERS & REGISTERED_BROWSERS:
webbrowser.get(b).open_new_tab('http://localhost:8080')
diff --git a/tests_website/server.py b/tests_website/server.py
index 83fcf84a..8a8c6438 100755
--- a/tests_website/server.py
+++ b/tests_website/server.py
@@ -9,17 +9,19 @@
python server.py
"""
-import SimpleHTTPServer
-import SocketServer
-import json
+# import SimpleHTTPServer
+# import SocketServer
+
+import six
+
import bleach
PORT = 8080
-class BleachCleanHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
+class BleachCleanHandler(six.moves.SimpleHTTPServer.SimpleHTTPRequestHandler):
def do_POST(self):
content_len = int(self.headers.getheader('content-length', 0))
body = self.rfile.read(content_len)
@@ -36,7 +38,9 @@ def do_POST(self):
if __name__ == '__main__':
- SocketServer.TCPServer.allow_reuse_address = True # Prevent 'cannot bind to address' errors on restart
- httpd = SocketServer.TCPServer(('127.0.0.1', PORT), BleachCleanHandler)
+ # Prevent 'cannot bind to address' errors on restart
+ six.moves.socketserver.TCPServer.allow_reuse_address = True
+
+ httpd = six.moves.socketserver.TCPServer(('127.0.0.1', PORT), BleachCleanHandler)
print("listening on localhost port %d" % PORT)
httpd.serve_forever()
From daec5ef18487fa31779165cb104a22b5931b4c3b Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Wed, 20 Sep 2017 11:31:06 -0400
Subject: [PATCH 120/314] More Python 3 fixes for tests_websites
---
tests_website/server.py | 24 +++++++++++++++---------
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/tests_website/server.py b/tests_website/server.py
index 8a8c6438..edc791a4 100755
--- a/tests_website/server.py
+++ b/tests_website/server.py
@@ -1,20 +1,17 @@
#!/usr/bin/env python
"""
-Simple Test/Demo Server for running bleach.clean output
-on various desktops.
+Simple Test/Demo Server for running bleach.clean output on various
+desktops.
Usage:
-python server.py
-"""
+ python server.py
-# import SimpleHTTPServer
-# import SocketServer
+"""
import six
-
import bleach
@@ -23,17 +20,26 @@
class BleachCleanHandler(six.moves.SimpleHTTPServer.SimpleHTTPRequestHandler):
def do_POST(self):
- content_len = int(self.headers.getheader('content-length', 0))
+ if six.PY2:
+ content_len = int(self.headers.getheader('content-length', 0))
+ else:
+ content_len = int(self.headers.get('content-length', 0))
body = self.rfile.read(content_len)
print("read %s bytes: %s" % (content_len, body))
+
+ if six.PY3:
+ body = body.decode('utf-8')
+ print('input: %r' % body)
cleaned = bleach.clean(body)
- print("cleaned %s" % cleaned)
self.send_response(200)
self.send_header('Content-Length', len(cleaned))
self.send_header('Content-Type', 'text/plain;charset=UTF-8')
self.end_headers()
+ if six.PY3:
+ cleaned = bytes(cleaned, encoding='utf-8')
+ print("cleaned: %r" % cleaned)
self.wfile.write(cleaned)
From 67afdf8ae7d323305ea104c0efb6bcb37547edc2 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Thu, 27 Jul 2017 13:07:08 -0400
Subject: [PATCH 121/314] Prevent HTMLTokenizer from unescaping entities
This overrides the HTMLTokenizer's .consumeEntity() method such that it doesn't
convert character entities.
This also fixes some other escaping/unescaping oddities so that the output of
bleach.clean() is more correct in regards to intended behavior.
One thing this breaks is the idempotent property for bleach.clean()--it's no
longer idempotent. Since it escapes text more correctly now and that's not an
idempotent transform, this is no longer idempotent.
For example, bleach.clean() can't differentiate between a user talking about
code and saying this:
I like my html wrapped in <b>!
and this:
I like my html escaped like this <b>!
I'm not sure why we thought bleach.clean() could ever be correct and idempotent.
Seems like that was an error.
---
CHANGES | 7 ++++-
bleach/sanitizer.py | 66 ++++++++++++++++++++++++++++++++++++++++--
tests/data/13.test.out | 2 +-
tests/data/14.test.out | 2 +-
tests/data/15.test.out | 2 +-
tests/data/16.test.out | 2 +-
tests/data/17.test.out | 2 +-
tests/data/18.test.out | 2 +-
tests/data/19.test.out | 3 +-
tests/test_security.py | 15 ++++++++--
10 files changed, 90 insertions(+), 13 deletions(-)
diff --git a/CHANGES b/CHANGES
index 5ea7aff9..ae1d52f3 100644
--- a/CHANGES
+++ b/CHANGES
@@ -17,6 +17,12 @@ Version 2.1 (in development)
* clean, linkify: accept only unicode or utf-8-encoded str (#176)
+* ``bleach.clean()`` no longer unescapes entities including ones that are missing
+ a ``;`` at the end which can happen in urls and other places. (#143)
+
+* ``bleach.clean()`` is no longer idempotent. If you run ``bleach.clean()`` on
+ text multiple times, it'll escape things again and again.
+
**Features**
**Bug fixes**
@@ -36,7 +42,6 @@ Version 2.1 (in development)
* add test website and scripts to test ``bleach.clean()`` output in browser;
thank you, Greg Guthe!
-
Version 2.0 (March 8th, 2017)
-----------------------------
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 26cfad2a..f9fb4287 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -4,9 +4,15 @@
from xml.sax.saxutils import unescape
import html5lib
-from html5lib.constants import namespaces
+from html5lib.constants import (
+ ReparseException,
+ namespaces,
+ prefixes,
+ tokenTypes,
+)
from html5lib.filters import sanitizer
from html5lib.serializer import HTMLSerializer
+from html5lib._tokenizer import HTMLTokenizer
from bleach.utils import alphabetize_attributes, force_unicode
@@ -44,6 +50,33 @@
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
+class BleachHTMLTokenizer(HTMLTokenizer):
+ def consumeEntity(self, allowedChar=None, fromAttribute=False):
+ # We don't want to consume and convert entities. Instead we put the
+ # '&' in output.
+ if fromAttribute:
+ self.currentToken['data'][-1][1] += '&'
+
+ else:
+ self.tokenQueue.append({"type": tokenTypes['Characters'], "data": '&'})
+
+
+class BleachHTMLParser(html5lib.HTMLParser):
+ def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
+ # Override HTMLParser so we can swap out the tokenizer.
+ self.innerHTMLMode = innerHTML
+ self.container = container
+ self.scripting = scripting
+ self.tokenizer = BleachHTMLTokenizer(stream, parser=self, **kwargs)
+ self.reset()
+
+ try:
+ self.mainLoop()
+ except ReparseException:
+ self.reset()
+ self.mainLoop()
+
+
class Cleaner(object):
"""Cleaner for cleaning HTML fragments of malicious content
@@ -104,7 +137,7 @@ def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
self.strip_comments = strip_comments
self.filters = filters or []
- self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
+ self.parser = BleachHTMLParser(namespaceHTMLElements=False)
self.walker = html5lib.getTreeWalker('etree')
self.serializer = HTMLSerializer(
quote_attr_values='always',
@@ -338,6 +371,35 @@ def allow_token(self, token):
return token
+ def disallowed_token(self, token):
+ token_type = token["type"]
+ if token_type == "EndTag":
+ token["data"] = "%s>" % token["name"]
+
+ elif token["data"]:
+ assert token_type in ("StartTag", "EmptyTag")
+ attrs = []
+ for (ns, name), v in token["data"].items():
+ attrs.append(' %s="%s"' % (
+ name if ns is None else "%s:%s" % (prefixes[ns], name),
+ # Note: HTMLSerializer escapes attribute values already, so
+ # if we do it here (like HTMLSerializer does), then we end
+ # up double-escaping.
+ v)
+ )
+ token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
+
+ else:
+ token["data"] = "<%s>" % token["name"]
+
+ if token.get("selfClosing"):
+ token["data"] = token["data"][:-1] + "/>"
+
+ token["type"] = "Characters"
+
+ del token["name"]
+ return token
+
def sanitize_css(self, style):
"""Sanitizes css in style tags"""
# disallow urls
diff --git a/tests/data/13.test.out b/tests/data/13.test.out
index 1c866507..0053081c 100644
--- a/tests/data/13.test.out
+++ b/tests/data/13.test.out
@@ -1 +1 @@
-<img src="JaVaScRiPt:alert("XSS<WBR">")>
\ No newline at end of file
+<img src="JaVaScRiPt:alert("XSS<WBR">")>
diff --git a/tests/data/14.test.out b/tests/data/14.test.out
index 8e5ff754..04091589 100644
--- a/tests/data/14.test.out
+++ b/tests/data/14.test.out
@@ -1 +1 @@
-<imgsrc=java&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=java&<wbr>
\ No newline at end of file
+<imgsrc=java&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=java&<wbr>
diff --git a/tests/data/15.test.out b/tests/data/15.test.out
index 8b90245f..a7dc6e69 100644
--- a/tests/data/15.test.out
+++ b/tests/data/15.test.out
@@ -1 +1 @@
-le&<wbr></wbr>#114;t('XS<wbr></wbr>;S')>
\ No newline at end of file
+le&<wbr></wbr>#114;t('XS<wbr></wbr>;S')>
diff --git a/tests/data/16.test.out b/tests/data/16.test.out
index 1ecb332b..c8e31d88 100644
--- a/tests/data/16.test.out
+++ b/tests/data/16.test.out
@@ -1 +1 @@
-<imgsrc=ja&<wbr>#0000118as&<wbr></wbr>#0000099ri&<wbr></wbr>#0000112t:&<wbr></wbr>#0000097le&<wbr></wbr>#0000114t(&<wbr></wbr>#0000039XS&<wbr></wbr>#0000083')></imgsrc=ja&<wbr>
\ No newline at end of file
+<imgsrc=ja&<wbr>#0000118as&<wbr></wbr>#0000099ri&<wbr></wbr>#0000112t:&<wbr></wbr>#0000097le&<wbr></wbr>#0000114t(&<wbr></wbr>#0000039XS&<wbr></wbr>#0000083')></imgsrc=ja&<wbr>
diff --git a/tests/data/17.test.out b/tests/data/17.test.out
index ae928a99..8d47f574 100644
--- a/tests/data/17.test.out
+++ b/tests/data/17.test.out
@@ -1 +1 @@
-<imgsrc=javas&<wbr>#x63ript:&<wbr></wbr>#x61lert(&<wbr></wbr>#x27XSS')></imgsrc=javas&<wbr>
\ No newline at end of file
+<imgsrc=javas&<wbr>#x63ript:&<wbr></wbr>#x61lert(&<wbr></wbr>#x27XSS')></imgsrc=javas&<wbr>
diff --git a/tests/data/18.test.out b/tests/data/18.test.out
index 8046c715..e4fe2cf3 100644
--- a/tests/data/18.test.out
+++ b/tests/data/18.test.out
@@ -1 +1 @@
-<img src="jav ascript:alert(<WBR>'XSS');">
\ No newline at end of file
+<img src="jav	ascript:alert(<WBR>'XSS');">
\ No newline at end of file
diff --git a/tests/data/19.test.out b/tests/data/19.test.out
index 8eb8794c..4daa11ad 100644
--- a/tests/data/19.test.out
+++ b/tests/data/19.test.out
@@ -1,2 +1 @@
-<img src="jav
-ascript:alert(<WBR>'XSS');">
\ No newline at end of file
+<img src="jav
ascript:alert(<WBR>'XSS');">
\ No newline at end of file
diff --git a/tests/test_security.py b/tests/test_security.py
index 0eeb09c6..28e3cf2a 100644
--- a/tests/test_security.py
+++ b/tests/test_security.py
@@ -8,6 +8,17 @@
from bleach import clean
+def test_escaped_entities():
+ # html5lib unescapes character entities, so these would become ' and "
+ # which makes it possible to break out of html attributes.
+ #
+ # Verify that bleach.clean() doesn't unescape entities.
+ assert (
+ clean(''"') ==
+ ''"'
+ )
+
+
def test_nested_script_tag():
assert (
clean('</script>') ==
@@ -105,7 +116,7 @@ def test_invalid_tag_char():
def test_unclosed_tag():
assert (
clean('&
+--
+>"><script>alert("XSS")</script>&
diff --git a/tests/data/10.test b/tests/data/10.test
index 268771bc..a6db9f98 100644
--- a/tests/data/10.test
+++ b/tests/data/10.test
@@ -1 +1,3 @@
+--
+<img src="javascript:alert('XSS');">
diff --git a/tests/data/11.test b/tests/data/11.test
index 16a49c70..37cbbfaf 100644
--- a/tests/data/11.test
+++ b/tests/data/11.test
@@ -1 +1,3 @@
+--
+<img src="javascript:alert('XSS')">
diff --git a/tests/data/12.test b/tests/data/12.test
index d4b96e6f..04c7ea8a 100644
--- a/tests/data/12.test
+++ b/tests/data/12.test
@@ -1 +1,3 @@
+--
+<img src="JaVaScRiPt:alert('XSS')">
diff --git a/tests/data/13.test b/tests/data/13.test
index 07279a83..36d4aaee 100644
--- a/tests/data/13.test
+++ b/tests/data/13.test
@@ -1 +1,3 @@
")>
+--
+<img src="JaVaScRiPt:alert("XSS<WBR">")>
diff --git a/tests/data/14.test b/tests/data/14.test
index b704c0b4..f154c73e 100644
--- a/tests/data/14.test
+++ b/tests/data/14.test
@@ -1 +1,3 @@
#115;cript:a
+--
+<imgsrc=java&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=java&<wbr>
diff --git a/tests/data/15.test b/tests/data/15.test
index b6a2de6b..c48c3e41 100644
--- a/tests/data/15.test
+++ b/tests/data/15.test
@@ -1 +1,3 @@
lert('XS;S')>
+--
+le&<wbr></wbr>#114;t('XS<wbr></wbr>;S')>
diff --git a/tests/data/16.test b/tests/data/16.test
index d66b5921..938240be 100644
--- a/tests/data/16.test
+++ b/tests/data/16.test
@@ -1 +1,3 @@
#0000118ascript:alert('XSS')>
+--
+<imgsrc=ja&<wbr>#0000118as&<wbr></wbr>#0000099ri&<wbr></wbr>#0000112t:&<wbr></wbr>#0000097le&<wbr></wbr>#0000114t(&<wbr></wbr>#0000039XS&<wbr></wbr>#0000083')></imgsrc=ja&<wbr>
diff --git a/tests/data/17.test b/tests/data/17.test
index 6e71b152..166e8845 100644
--- a/tests/data/17.test
+++ b/tests/data/17.test
@@ -1 +1,3 @@
#x63ript:alert('XSS')>
+--
+<imgsrc=javas&<wbr>#x63ript:&<wbr></wbr>#x61lert(&<wbr></wbr>#x27XSS')></imgsrc=javas&<wbr>
diff --git a/tests/data/18.test b/tests/data/18.test
index 1c173723..635461f8 100644
--- a/tests/data/18.test
+++ b/tests/data/18.test
@@ -1 +1,3 @@
+--
+<img src="jav	ascript:alert(<WBR>'XSS');">
diff --git a/tests/data/19.test b/tests/data/19.test
index e6e79742..1a1ebe41 100644
--- a/tests/data/19.test
+++ b/tests/data/19.test
@@ -1 +1,3 @@
+--
+<img src="jav
ascript:alert(<WBR>'XSS');">
diff --git a/tests/data/2.test b/tests/data/2.test
index 21b93db3..aefcbe26 100644
--- a/tests/data/2.test
+++ b/tests/data/2.test
@@ -1 +1,3 @@
">
+--
+"><style>@import"javascript:alert('XSS')";</style>
diff --git a/tests/data/20.test b/tests/data/20.test
index 614b544f..ceae0bd8 100644
--- a/tests/data/20.test
+++ b/tests/data/20.test
@@ -1 +1,3 @@
+--
+<img src="jav
ascript:alert(<WBR>'XSS');">
diff --git a/tests/data/3.test b/tests/data/3.test
index 8dc3a4ee..67f3591b 100644
--- a/tests/data/3.test
+++ b/tests/data/3.test
@@ -1 +1,3 @@
>"'>
+--
+>"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)></img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)>
diff --git a/tests/data/4.test b/tests/data/4.test
index c4cf51cd..10438d81 100644
--- a/tests/data/4.test
+++ b/tests/data/4.test
@@ -1 +1,3 @@
ipt type="text/javascript">alert("foo");script>
+--
+<scr<script>ipt type="text/javascript">alert("foo");script<del></del>></scr<script>
diff --git a/tests/data/5.test b/tests/data/5.test
index 0b03876b..dd45837a 100644
--- a/tests/data/5.test
+++ b/tests/data/5.test
@@ -1 +1,3 @@
>%22%27>
+--
+>%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22></img%20src%3d%22javascript:alert(%27%20xss%27)%22>
diff --git a/tests/data/7.test b/tests/data/7.test
index 827f9b9e..73f5cab1 100644
--- a/tests/data/7.test
+++ b/tests/data/7.test
@@ -1 +1,3 @@
">
+--
+">
diff --git a/tests/data/8.test b/tests/data/8.test
index ddf33a96..f5be4f25 100644
--- a/tests/data/8.test
+++ b/tests/data/8.test
@@ -1 +1,3 @@
>"
+--
+>"
diff --git a/tests/data/9.test b/tests/data/9.test
index 9cf58659..26d27f78 100644
--- a/tests/data/9.test
+++ b/tests/data/9.test
@@ -1 +1,3 @@
'';!--"=&{()}
+--
+'';!--"<xss>=&{()}</xss>
diff --git a/tests/test_security.py b/tests/test_security.py
index 4c710775..9dd49338 100644
--- a/tests/test_security.py
+++ b/tests/test_security.py
@@ -197,12 +197,15 @@ def get_tests():
return testcases
-@pytest.mark.parametrize('fn, text', get_tests())
-def test_regressions(fn, text):
+@pytest.mark.parametrize('fn, test_case', get_tests())
+def test_regressions(fn, test_case):
"""Regression tests for clean so we can see if there are issues"""
- expected = six.text_type(open(fn + '.out', 'r').read())
+ test_data, expected = test_case.split('\n--\n')
# NOTE(willkg): This strips input and expected which makes it easier to
# maintain the files. If there comes a time when the input needs whitespace
# at the beginning or end, then we'll have to figure out something else.
- assert clean(text.strip()) == expected.strip()
+ test_data = test_data.strip()
+ expected = expected.strip()
+
+ assert clean(test_data) == expected
From 588286152b0c24d2d2c9e68d4761c14f00ce88b6 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Sat, 3 Mar 2018 10:57:04 -0500
Subject: [PATCH 145/314] Merge all the clean tests into one file and clean up
* Moves test_security.py tests into test_clean.py
* Removes duplicate tests and unhelpful tests
* Adds additional helpful test cases
* Reworks some tests to be easier to run and read by parametrizing them
* Adds comments and adjusts function names to be more helpful
---
tests/test_clean.py | 549 ++++++++++++++++++++++++++++++-----------
tests/test_security.py | 211 ----------------
2 files changed, 405 insertions(+), 355 deletions(-)
delete mode 100644 tests/test_security.py
diff --git a/tests/test_clean.py b/tests/test_clean.py
index c5f78f73..799ae186 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -1,96 +1,210 @@
+import os
+
from html5lib.filters.base import Filter
import pytest
-import bleach
+from bleach import clean
from bleach.sanitizer import Cleaner
-def test_empty():
- assert bleach.clean('') == ''
+def test_clean_idempotent():
+ """Make sure that applying the filter twice doesn't change anything."""
+ dirty = 'invalid & < extra http://link.com'
+ assert clean(clean(dirty)) == clean(dirty)
-def test_nbsp():
- assert bleach.clean(' test string ') == ' test string '
+def test_only_text_is_cleaned():
+ some_text = 'text'
+ some_type = int
+ no_type = None
+ assert clean(some_text) == some_text
-def test_comments_only():
- comment = ''
- assert bleach.clean(comment) == ''
- assert bleach.clean(comment, strip_comments=False) == comment
+ with pytest.raises(TypeError) as e:
+ clean(some_type)
+ assert "argument cannot be of 'type' type" in str(e)
- open_comment = ''.format(open_comment)
- )
+ with pytest.raises(TypeError) as e:
+ clean(no_type)
+ assert "NoneType" in str(e)
-def test_with_comments():
- text = 'Just text'
- assert bleach.clean(text) == 'Just text'
- assert bleach.clean(text, strip_comments=False) == text
+def test_empty():
+ assert clean('') == ''
-def test_no_html():
- assert bleach.clean('no html string') == 'no html string'
+def test_content_has_no_html():
+ assert clean('no html string') == 'no html string'
-def test_allowed_html():
- assert (
- bleach.clean('an allowed tag') ==
+@pytest.mark.parametrize('data, expected', [
+ (
+ 'an allowed tag',
'an allowed tag'
- )
- assert (
- bleach.clean('another good tag') ==
+ ),
+
+ (
+ 'another good tag',
'another good tag'
)
+])
+def test_content_has_allowed_html(data, expected):
+ assert clean(data) == expected
-def test_bad_html():
+def test_html_is_lowercased():
assert (
- bleach.clean('a fixed tag') ==
- 'a fixed tag'
+ clean('foo') ==
+ 'foo'
)
-def test_function_arguments():
- TAGS = ['span', 'br']
- ATTRS = {'span': ['style']}
+@pytest.mark.parametrize('data, should_strip, expected', [
+ # Regular comment
+ (
+ '',
+ True,
+ ''
+ ),
- text = 'a
test'
- assert (
- bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
- 'a
test'
+ # Open comment with no close comment bit
+ (
+ ''
+ ),
+ (
+ ''
+ ),
+
+ # Comment with text to the right
+ (
+ 'text',
+ True,
+ 'text'
+ ),
+ (
+ 'text',
+ True,
+ 'text'
+ ),
+ (
+ 'text',
+ False,
+ 'text'
+ ),
+ (
+ 'text',
+ False,
+ 'text'
+ ),
+
+ # Comment with text to the left
+ (
+ 'text',
+ True,
+ 'text'
+ ),
+ (
+ 'text',
+ True,
+ 'text'
+ ),
+ (
+ 'text',
+ False,
+ 'text'
+ ),
+ (
+ 'text',
+ False,
+ 'text'
)
+])
+def test_comments(data, should_strip, expected):
+ assert clean(data, strip_comments=should_strip) == expected
-def test_named_arguments():
- ATTRS = {'a': ['rel', 'href']}
+@pytest.mark.parametrize('data, expected', [
+ # Disallowed tag is escaped
+ ('
', '<img src="javascript:alert(\'XSS\');">'),
+
+ # Test with parens
+ ('a test', 'a <script>safe()</script> test'),
+
+ # Test with braces
+ ('a test', 'a <style>body{}</style> test'),
+])
+def test_disallowed_tags(data, expected):
+ assert clean(data) == expected
- text = 'xx.com'
- assert bleach.clean(text) == 'xx.com'
+
+def test_invalid_char_in_tag():
+ # NOTE(willkg): Two possible outcomes because attrs aren't ordered
+ assert (
+ clean('') in
+ [
+ '<script src="http://xx.com/xss.js" xss=""></script>',
+ '<script xss="" src="http://xx.com/xss.js"></script>'
+ ]
+ )
assert (
- bleach.clean(text, attributes=ATTRS) ==
- 'xx.com'
+ clean('') ==
+ '<script src="http://xx.com/xss.js"></script>'
)
-def test_disallowed_html():
+def test_unclosed_tag():
+ assert (
+ clean('a fixed tag') ==
+ 'a fixed tag'
+ )
assert (
- bleach.clean('a test') ==
- 'a <script>safe()</script> test'
+ clean('/script>') ==
+ '<<script>script>evil()<</script>/script>'
+ )
+ assert (
+ clean('<script>evil()< /script>') ==
+ '<<x>script>evil()<</x>/script>'
+ )
+ assert (
+ clean('>') ==
+ '<script<script>>evil()></script<script>'
)
@@ -100,13 +214,14 @@ def test_bad_href():
('tag < and entity', 'tag < and entity'),
])
def test_bare_entities(text, expected):
- assert bleach.clean(text) == expected
+ assert clean(text) == expected
@pytest.mark.parametrize('text, expected', [
# Test character entities
('&', '&'),
(' ', ' '),
+ (' test string ', ' test string '),
('<em>strong</em>', '<em>strong</em>'),
# Test character entity at beginning of string
@@ -154,75 +269,160 @@ def test_bare_entities(text, expected):
# Test non-numeric entities
('', '&#'),
- ('<', '&#<')
+ ('<', '&#<'),
+
+ # html5lib tokenizer unescapes character entities, so these would become '
+ # and " which makes it possible to break out of html attributes.
+ #
+ # Verify that clean() doesn't unescape entities.
+ (''"', ''"'),
])
def test_character_entities(text, expected):
- assert bleach.clean(text) == expected
+ assert clean(text) == expected
-def test_weird_strings():
- s = '3'
- assert bleach.clean(s) == ''
-
-
-def test_stripping():
- text = 'a test with html tags'
- assert (
- bleach.clean(text, strip=True) ==
+@pytest.mark.parametrize('data, kwargs, expected', [
+ # All tags are allowed, so it strips nothing
+ (
+ 'a test with html tags',
+ {'strip': True},
'a test with html tags'
- )
+ ),
- text = 'a test with
html tags'
- assert (
- bleach.clean(text, strip=True) ==
+ # img tag is disallowed, so it's stripped
+ (
+ 'a test with
html tags',
+ {'strip': True},
'a test with html tags'
- )
+ ),
- text = ''
- assert (
- bleach.clean(text, tags=['p'], strip=True) ==
+ # a tag is disallowed, so it's stripped
+ (
+ '',
+ {'tags': ['p'], 'strip': True},
'link text
'
- )
- text = 'multiply nested text
'
- assert (
- bleach.clean(text, tags=['p'], strip=True) ==
+ ),
+
+ # handle nested disallowed tag
+ (
+ 'multiply nested text
',
+ {'tags': ['p'], 'strip': True},
'multiply nested text
'
- )
+ ),
- text = ''
- assert (
- bleach.clean(text, tags=['p', 'a'], strip=True) ==
+ # handle disallowed tag that's deep in the tree
+ (
+ '',
+ {'tags': ['a', 'p'], 'strip': True},
''
- )
+ ),
+])
+def test_stripping_tags(data, kwargs, expected):
+ assert clean(data, **kwargs) == expected
+
+
+@pytest.mark.parametrize('data, expected', [
+ (
+ 'pt>alert(1) ipt>',
+ 'pt>alert(1)ipt>'
+ ),
+ (
+ 'pt>pt>alert(1)',
+ 'pt>pt>alert(1)'
+ ),
+])
+def test_stripping_tags_is_safe(data, expected):
+ """Test stripping tags shouldn't result in malicious content"""
+ assert clean(data, strip=True) == expected
def test_allowed_styles():
+ """Test allowed styles"""
ATTRS = ['style']
STYLE = ['color']
assert (
- bleach.clean('', attributes=ATTRS) ==
+ clean('', attributes=ATTRS) ==
''
)
text = ''
- assert bleach.clean(text, attributes=ATTRS, styles=STYLE) == text
+ assert clean(text, attributes=ATTRS, styles=STYLE) == text
text = ''
assert (
- bleach.clean(text, attributes=ATTRS, styles=STYLE) ==
+ clean(text, attributes=ATTRS, styles=STYLE) ==
''
)
-def test_lowercase_html():
- """We should output lowercase HTML."""
+def test_href_with_wrong_tag():
assert (
- bleach.clean('BAR', attributes=['class']) ==
- 'BAR'
+ clean('no link') ==
+ 'no link'
)
+def test_disallowed_attr():
+ IMG = ['img', ]
+ IMG_ATTR = ['src']
+
+ assert (
+ clean('test') ==
+ 'test'
+ )
+ assert (
+ clean('
', tags=IMG, attributes=IMG_ATTR) ==
+ '
'
+ )
+ assert (
+ clean('
', tags=IMG, attributes=IMG_ATTR) ==
+ '
'
+ )
+
+
+def test_unquoted_attr_values_are_quoted():
+ assert (
+ clean('myabbr') ==
+ 'myabbr'
+ )
+
+
+def test_unquoted_event_handler_attr_value():
+ assert (
+ clean('xx.com') ==
+ 'xx.com'
+ )
+
+
+def test_invalid_filter_attr():
+ IMG = ['img', ]
+ IMG_ATTR = {
+ 'img': lambda tag, name, val: name == 'src' and val == "http://example.com/"
+ }
+
+ assert (
+ clean('
', tags=IMG, attributes=IMG_ATTR) ==
+ '
'
+ )
+ assert (
+ clean('
', tags=IMG, attributes=IMG_ATTR) ==
+ '
'
+ )
+
+
+def test_poster_attribute():
+ """Poster attributes should not allow javascript."""
+ tags = ['video']
+ attrs = {'video': ['poster']}
+
+ test = ''
+ assert clean(test, tags=tags, attributes=attrs) == ''
+
+ ok = ''
+ assert clean(ok, tags=tags, attributes=attrs) == ok
+
+
def test_attributes_callable():
"""Verify attributes can take a callable"""
ATTRS = lambda tag, name, val: name == 'title'
@@ -230,7 +430,7 @@ def test_attributes_callable():
text = u'example'
assert (
- bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ clean(text, tags=TAGS, attributes=ATTRS) ==
u'example'
)
@@ -245,7 +445,7 @@ def test_attributes_wildcard():
text = 'both can have
'
assert (
- bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ clean(text, tags=TAGS, attributes=ATTRS) ==
'both can have
'
)
@@ -258,7 +458,7 @@ def test_attributes_wildcard_callable():
TAGS = ['a']
assert (
- bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
+ clean(u'example', tags=TAGS, attributes=ATTRS) ==
u'example'
)
@@ -275,12 +475,12 @@ def img_test(tag, name, val):
text = 'foo
baz'
assert (
- bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ clean(text, tags=TAGS, attributes=ATTRS) ==
u'foo
baz'
)
text = 'foo
baz'
assert (
- bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ clean(text, tags=TAGS, attributes=ATTRS) ==
u'foo
baz'
)
@@ -293,7 +493,7 @@ def test_attributes_tag_list():
TAGS = ['a']
assert (
- bleach.clean(u'example', tags=TAGS, attributes=ATTRS) ==
+ clean(u'example', tags=TAGS, attributes=ATTRS) ==
u'example'
)
@@ -305,11 +505,44 @@ def test_attributes_list():
text = u'example'
assert (
- bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ clean(text, tags=TAGS, attributes=ATTRS) ==
u'example'
)
+@pytest.mark.parametrize('data, kwargs, expected', [
+ # javascript: is not allowed by default
+ (
+ 'xss',
+ {},
+ 'xss'
+ ),
+
+ # File protocol is not allowed by default
+ (
+ 'foo',
+ {},
+ 'foo'
+ ),
+
+ # Specified protocols are allowed
+ (
+ 'allowed href',
+ {'protocols': ['myprotocol']},
+ 'allowed href'
+ ),
+
+ # Unspecified protocols are not allowed
+ (
+ 'invalid href',
+ {'protocols': ['myprotocol']},
+ 'invalid href'
+ )
+])
+def test_uri_value_allowed_protocols(data, kwargs, expected):
+ assert clean(data, **kwargs) == expected
+
+
def test_svg_attr_val_allows_ref():
"""Unescape values in svg attrs that allow url references"""
# Local IRI, so keep it
@@ -320,7 +553,7 @@ def test_svg_attr_val_allows_ref():
text = ''
assert (
- bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ clean(text, tags=TAGS, attributes=ATTRS) ==
''
)
@@ -331,7 +564,7 @@ def test_svg_attr_val_allows_ref():
}
text = ''
assert (
- bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ clean(text, tags=TAGS, attributes=ATTRS) ==
''
)
@@ -353,7 +586,7 @@ def test_svg_allow_local_href(text, expected):
ATTRS = {
'pattern': ['id', 'href'],
}
- assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected
+ assert clean(text, tags=TAGS, attributes=ATTRS) == expected
@pytest.mark.parametrize('text, expected', [
@@ -372,73 +605,77 @@ def test_svg_allow_local_href_nonlocal(text, expected):
ATTRS = {
'pattern': ['id', 'href'],
}
- assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected
+ assert clean(text, tags=TAGS, attributes=ATTRS) == expected
+
+
+@pytest.mark.xfail(reason='regression from bleach 1.4')
+def test_weird_strings():
+ s = '3'
+ assert clean(s) == '3'
-@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API')
+@pytest.mark.xfail(reason='regression from bleach 1.4')
def test_sarcasm():
"""Jokes should crash. """
- dirty = 'Yeah right '
- clean = 'Yeah right <sarcasm/>'
- assert bleach.clean(dirty) == clean
-
+ assert (
+ clean('Yeah right ') ==
+ 'Yeah right <sarcasm/>'
+ )
-def test_user_defined_protocols_valid():
- valid_href = 'allowed href'
- assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href
+@pytest.mark.parametrize('data, expected', [
+ # Convert bell
+ ('1\a23', '1?23'),
-def test_user_defined_protocols_invalid():
- invalid_href = 'invalid href'
- cleaned_href = 'invalid href'
- assert bleach.clean(invalid_href, protocols=['my_protocol']) == cleaned_href
+ # Convert backpsace
+ ('1\b23', '1?23'),
+ # Convert formfeed
+ ('1\v23', '1?23'),
-def test_filters():
- # Create a Filter that changes all the attr values to "moo"
- class MooFilter(Filter):
- def __iter__(self):
- for token in Filter.__iter__(self):
- if token['type'] in ['StartTag', 'EmptyTag'] and token['data']:
- for attr, value in token['data'].items():
- token['data'][attr] = 'moo'
+ # Convert vertical tab
+ ('1\f23', '1?23'),
- yield token
+ # Convert a bunch of characters in a string
+ ('import y\bose\bm\bi\bt\be\b', 'import y?ose?m?i?t?e?'),
+])
+def test_invisible_characters(data, expected):
+ assert clean(data) == expected
- ATTRS = {
- 'img': ['rel', 'src']
- }
- TAGS = ['img']
- cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter])
+def get_tests():
+ """Retrieves regression tests from data/ directory
- dirty = 'this is cute!
'
- assert (
- cleaner.clean(dirty) ==
- 'this is cute!
'
- )
+ :returns: list of ``(filename, filedata)`` tuples
+ """
+ datadir = os.path.join(os.path.dirname(__file__), 'data')
+ tests = [
+ os.path.join(datadir, fn) for fn in os.listdir(datadir)
+ if fn.endswith('.test')
+ ]
+ # Sort numerically which makes it easier to iterate through them
+ tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0]))
-def test_clean_idempotent():
- """Make sure that applying the filter twice doesn't change anything."""
- dirty = 'invalid & < extra http://link.com'
- assert bleach.clean(bleach.clean(dirty)) == bleach.clean(dirty)
+ testcases = [
+ (fn, open(fn, 'r').read()) for fn in tests
+ ]
+ return testcases
-def test_only_text_is_cleaned():
- some_text = 'text'
- some_type = int
- no_type = None
- assert bleach.clean(some_text) == some_text
+@pytest.mark.parametrize('fn, test_case', get_tests())
+def test_regressions(fn, test_case):
+ """Regression tests for clean so we can see if there are issues"""
+ test_data, expected = test_case.split('\n--\n')
- with pytest.raises(TypeError) as e:
- bleach.clean(some_type)
- assert "argument cannot be of 'type' type" in str(e)
+ # NOTE(willkg): This strips input and expected which makes it easier to
+ # maintain the files. If there comes a time when the input needs whitespace
+ # at the beginning or end, then we'll have to figure out something else.
+ test_data = test_data.strip()
+ expected = expected.strip()
- with pytest.raises(TypeError) as e:
- bleach.clean(no_type)
- assert "NoneType" in str(e)
+ assert clean(test_data) == expected
class TestCleaner:
@@ -452,3 +689,27 @@ def test_basics(self):
cleaner.clean('a
test') ==
'a
test'
)
+
+ def test_filters(self):
+ # Create a Filter that changes all the attr values to "moo"
+ class MooFilter(Filter):
+ def __iter__(self):
+ for token in Filter.__iter__(self):
+ if token['type'] in ['StartTag', 'EmptyTag'] and token['data']:
+ for attr, value in token['data'].items():
+ token['data'][attr] = 'moo'
+
+ yield token
+
+ ATTRS = {
+ 'img': ['rel', 'src']
+ }
+ TAGS = ['img']
+
+ cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter])
+
+ dirty = 'this is cute!
'
+ assert (
+ cleaner.clean(dirty) ==
+ 'this is cute!
'
+ )
diff --git a/tests/test_security.py b/tests/test_security.py
deleted file mode 100644
index 9dd49338..00000000
--- a/tests/test_security.py
+++ /dev/null
@@ -1,211 +0,0 @@
-"""More advanced security tests"""
-
-import os
-
-import pytest
-import six
-
-from bleach import clean
-
-
-def test_escaped_entities():
- # html5lib unescapes character entities, so these would become ' and "
- # which makes it possible to break out of html attributes.
- #
- # Verify that bleach.clean() doesn't unescape entities.
- assert (
- clean(''"') ==
- ''"'
- )
-
-
-def test_nested_script_tag():
- assert (
- clean('</script>') ==
- '<<script>script>evil()<</script>/script>'
- )
- assert (
- clean('<script>evil()< /script>') ==
- '<<x>script>evil()<</x>/script>'
- )
-
-
-def test_nested_script_tag_r():
- assert (
- clean('>') ==
- '<script<script>>evil()></script<script>'
- )
-
-
-def test_invalid_attr():
- IMG = ['img', ]
- IMG_ATTR = ['src']
-
- assert (
- clean('test') ==
- 'test'
- )
- assert (
- clean('
', tags=IMG, attributes=IMG_ATTR) ==
- '
'
- )
- assert (
- clean('
', tags=IMG, attributes=IMG_ATTR) ==
- '
'
- )
-
-
-def test_unquoted_attr():
- assert (
- clean('myabbr') ==
- 'myabbr'
- )
-
-
-def test_unquoted_event_handler():
- assert (
- clean('xx.com') ==
- 'xx.com'
- )
-
-
-def test_invalid_attr_value():
- assert (
- clean('
') ==
- '<img src="javascript:alert(\'XSS\');">'
- )
-
-
-def test_invalid_href_attr():
- assert (
- clean('xss') ==
- 'xss'
- )
-
-
-def test_invalid_filter_attr():
- IMG = ['img', ]
- IMG_ATTR = {
- 'img': lambda tag, name, val: name == 'src' and val == "http://example.com/"
- }
-
- assert (
- clean('
', tags=IMG, attributes=IMG_ATTR) ==
- '
'
- )
- assert (
- clean('
', tags=IMG, attributes=IMG_ATTR) ==
- '
'
- )
-
-
-def test_invalid_tag_char():
- assert (
- clean('') in
- [
- '<script src="http://xx.com/xss.js" xss=""></script>',
- '<script xss="" src="http://xx.com/xss.js"></script>'
- ]
- )
- assert (
- clean('') ==
- '<script src="http://xx.com/xss.js"></script>'
- )
-
-
-def test_unclosed_tag():
- assert (
- clean('ipt>'
- assert clean(s, strip=True) == 'pt>alert(1)ipt>'
- s = 'pt>pt>alert(1)'
- assert clean(s, strip=True) == 'pt>pt>alert(1)'
-
-
-def test_poster_attribute():
- """Poster attributes should not allow javascript."""
- tags = ['video']
- attrs = {'video': ['poster']}
- test = ''
- assert clean(test, tags=tags, attributes=attrs) == ''
- ok = ''
- assert clean(ok, tags=tags, attributes=attrs) == ok
-
-
-def test_feed_protocol():
- assert clean('foo') == 'foo'
-
-
-@pytest.mark.parametrize('data, expected', [
- # Convert bell
- ('1\a23', '1?23'),
-
- # Convert backpsace
- ('1\b23', '1?23'),
-
- # Convert formfeed
- ('1\v23', '1?23'),
-
- # Convert vertical tab
- ('1\f23', '1?23'),
-
- # Convert a bunch of characters in a string
- ('import y\bose\bm\bi\bt\be\b', 'import y?ose?m?i?t?e?'),
-])
-def test_invisible_characters(data, expected):
- assert clean(data) == expected
-
-
-def get_tests():
- """Retrieves regression tests from data/ directory
-
- :returns: list of ``(filename, filedata)`` tuples
-
- """
- datadir = os.path.join(os.path.dirname(__file__), 'data')
- tests = [
- os.path.join(datadir, fn) for fn in os.listdir(datadir)
- if fn.endswith('.test')
- ]
- # Sort numerically which makes it easier to iterate through them
- tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0]))
-
- testcases = [
- (fn, open(fn, 'r').read()) for fn in tests
- ]
-
- return testcases
-
-
-@pytest.mark.parametrize('fn, test_case', get_tests())
-def test_regressions(fn, test_case):
- """Regression tests for clean so we can see if there are issues"""
- test_data, expected = test_case.split('\n--\n')
-
- # NOTE(willkg): This strips input and expected which makes it easier to
- # maintain the files. If there comes a time when the input needs whitespace
- # at the beginning or end, then we'll have to figure out something else.
- test_data = test_data.strip()
- expected = expected.strip()
-
- assert clean(test_data) == expected
From 18ecceb5f61896e1a88e8d965b1e61e860ded2a5 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Sat, 3 Mar 2018 11:15:49 -0500
Subject: [PATCH 146/314] Correct a regression comment and fix a test I
misunderstood
---
tests/test_clean.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/tests/test_clean.py b/tests/test_clean.py
index 799ae186..221addba 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -608,13 +608,12 @@ def test_svg_allow_local_href_nonlocal(text, expected):
assert clean(text, tags=TAGS, attributes=ATTRS) == expected
-@pytest.mark.xfail(reason='regression from bleach 1.4')
def test_weird_strings():
s = '3'
- assert clean(s) == '3'
+ assert clean(s) == ''
-@pytest.mark.xfail(reason='regression from bleach 1.4')
+@pytest.mark.xfail(reason='regression from bleach 1.5')
def test_sarcasm():
"""Jokes should crash. """
assert (
From d580f0abba6ae62da22e59be4355ea1d690eb1f5 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Sat, 3 Mar 2018 18:15:22 -0500
Subject: [PATCH 147/314] Fix MANIFEST and data_to_json.py related to recent
changes
I squashed test cases into single files--no more .out files. This carries
that change through to MANIFEST.in and our tests_website system.
---
MANIFEST.in | 2 +-
tests_website/data_to_json.py | 38 +++++++++++++++++++----------------
2 files changed, 22 insertions(+), 18 deletions(-)
diff --git a/MANIFEST.in b/MANIFEST.in
index 1ae68e20..14ad79c7 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -12,6 +12,6 @@ include docs/Makefile
recursive-include docs *.rst
-recursive-include tests *.py *.test *.out
+recursive-include tests *.py *.test
recursive-include tests_website *.html *.py *.rst
diff --git a/tests_website/data_to_json.py b/tests_website/data_to_json.py
index debe5a9d..5870d64c 100755
--- a/tests_website/data_to_json.py
+++ b/tests_website/data_to_json.py
@@ -2,12 +2,12 @@
"""
Util to write a directory of test cases with input filenames
-.test and output filenames .test.out as JSON to
-stdout.
+.test as JSON to stdout.
-example:
+example::
+
+ $ python tests/data_to_json.py tests/data > testcases.json
-python tests/data_to_json.py tests/data > testcases.json
"""
import argparse
@@ -21,29 +21,33 @@
def main():
parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument('data_dir',
- help='directory containing test cases with input files'
- ' named .test and output .test.out')
+ parser.add_argument(
+ 'data_dir',
+ help=(
+ 'directory containing test cases with names like .test'
+ )
+ )
args = parser.parse_args()
filenames = os.listdir(args.data_dir)
ins = [os.path.join(args.data_dir, f) for f in filenames if fnmatch.fnmatch(f, '*.test')]
- outs = [os.path.join(args.data_dir, f) for f in filenames if fnmatch.fnmatch(f, '*.test.out')]
testcases = []
- for infn, outfn in zip(ins, outs):
+ for infn in ins:
case_name = infn.rsplit('.test', 1)[0]
- with open(infn, 'r') as fin, open(outfn, 'r') as fout:
- payload = fin.read()[:-1]
+ with open(infn, 'r') as fin:
+ data, expected = fin.read().split('\n--\n')
+ data = data.strip()
+ expected = expected.strip()
+
testcases.append({
- "title": case_name,
- "input_filename": infn,
- "output_filename": outfn,
- "payload": payload,
- "actual": bleach.clean(payload),
- "expected": fout.read(),
+ 'title': case_name,
+ 'input_filename': infn,
+ 'payload': data,
+ 'actual': bleach.clean(data),
+ 'expected': expected,
})
print(json.dumps(testcases, indent=4, sort_keys=True))
From 73dfef1d3b96c2e432660d8d2f2e9d0eaa230e36 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Sat, 3 Mar 2018 09:58:37 -0500
Subject: [PATCH 148/314] Fix url sanitizing
Fixes a security issue where url sanitizing wouldn't work if there were
character entities breaking up the scheme. This allowed javascript urls
even when they were not explicitly allowed.
---
bleach/sanitizer.py | 134 ++++++++++++++++++++++++++++++++++++++------
tests/test_clean.py | 98 ++++++++++++++++++++++++++++++--
2 files changed, 210 insertions(+), 22 deletions(-)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 81df765b..ac6a55cb 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -4,6 +4,7 @@
import string
import six
+from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape
import html5lib
@@ -27,8 +28,11 @@
from bleach.utils import alphabetize_attributes, force_unicode
+#: Map of entity name to expanded entity
+ENTITIES = entities
+
#: Trie of html entity string -> character representation
-ENTITIES_TRIE = Trie(entities)
+ENTITIES_TRIE = Trie(ENTITIES)
#: List of allowed tags
ALLOWED_TAGS = [
@@ -79,13 +83,61 @@
INVISIBLE_REPLACEMENT_CHAR = '?'
+def convert_entity(value):
+ """Convert an entity (minus the & and ; part) into what it represents
+
+ This handles numeric, hex, and text entities.
+
+ :arg value: the string (minus the ``&`` and ``;`` part) to convert
+
+ :returns: unicode character
+
+ """
+ if value[0] == '#':
+ if value[1] in ('x', 'X'):
+ return six.unichr(int(value[2:], 16))
+ return six.unichr(int(value[1:], 10))
+
+ return ENTITIES[value]
+
+
+def convert_entities(text):
+ """Converts all found entities in the text
+
+ :arg text: the text to convert entities in
+
+ :returns: unicode text with converted entities
+
+ """
+ if '&' not in text:
+ return text
+
+ new_text = []
+ for part in next_possible_entity(text):
+ if not part:
+ continue
+
+ if part.startswith('&'):
+ entity = match_entity(part)
+ if entity is not None:
+ new_text.append(convert_entity(entity))
+ remainder = part[len(entity) + 2:]
+ if part:
+ new_text.append(remainder)
+ continue
+
+ new_text.append(part)
+
+ return u''.join(new_text)
+
+
class BleachHTMLTokenizer(HTMLTokenizer):
def consumeEntity(self, allowedChar=None, fromAttribute=False):
# We don't want to consume and convert entities, so this overrides the
# html5lib tokenizer's consumeEntity so that it's now a no-op.
#
# However, when that gets called, it's consumed an &, so we put that in
- # the steam.
+ # the stream.
if fromAttribute:
self.currentToken['data'][-1][1] += '&'
@@ -479,15 +531,69 @@ def sanitize_characters(self, token):
new_tokens.append({'type': 'Entity', 'name': entity})
# Length of the entity plus 2--one for & at the beginning
# and one for ; at the end
- part = part[len(entity) + 2:]
- if part:
- new_tokens.append({'type': 'Characters', 'data': part})
+ remainder = part[len(entity) + 2:]
+ if remainder:
+ new_tokens.append({'type': 'Characters', 'data': remainder})
continue
new_tokens.append({'type': 'Characters', 'data': part})
return new_tokens
+ def sanitize_uri_value(self, value, allowed_protocols):
+ """Checks a uri value to see if it's allowed
+
+ :arg value: the uri value to sanitize
+ :arg allowed_protocols: list of allowed protocols
+
+ :returns: allowed value or None
+
+ """
+ # NOTE(willkg): This transforms the value into one that's easier to
+ # match and verify, but shouldn't get returned since it's vastly
+ # different than the original value.
+
+ # Convert all character entities in the value
+ new_value = convert_entities(value)
+
+ # Nix single quote, whitespace, and non-printable characters
+ new_value = re.sub(
+ "[`\000-\040\177-\240\s]+",
+ '',
+ new_value
+ )
+
+ # Remove REPLACEMENT characters
+ new_value = new_value.replace('\ufffd', '')
+
+ # Lowercase it--this breaks the value, but makes it easier to match
+ # against
+ new_value = new_value.lower()
+
+ # Drop attributes with uri values that have protocols that aren't
+ # allowed
+ parsed = urlparse(new_value)
+ if parsed.scheme:
+ # If urlparse found a scheme, check that
+ if parsed.scheme in allowed_protocols:
+ return value
+
+ else:
+ # Allow uris that are just an anchor
+ if new_value.startswith('#'):
+ return value
+
+ # Handle protocols that urlparse doesn't recognize like "myprotocol"
+ if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
+ return value
+
+ # If there's no protocol/scheme specified, then assume it's "http"
+ # and see if that's allowed
+ if 'http' in allowed_protocols:
+ return value
+
+ return None
+
def allow_token(self, token):
"""Handles the case where we're allowing the tag"""
if 'data' in token:
@@ -508,21 +614,13 @@ def allow_token(self, token):
if not self.attr_filter(token['name'], name, val):
continue
- # Look at attributes that have uri values
+ # Drop attributes with uri values that use a disallowed protocol
+ # Sanitize attributes with uri values
if namespaced_name in self.attr_val_is_uri:
- val_unescaped = re.sub(
- "[`\000-\040\177-\240\s]+",
- '',
- unescape(val)).lower()
-
- # Remove replacement characters from unescaped characters.
- val_unescaped = val_unescaped.replace("\ufffd", "")
-
- # Drop attributes with uri values that have protocols that
- # aren't allowed
- if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and
- (val_unescaped.split(':')[0] not in self.allowed_protocols)):
+ new_value = self.sanitize_uri_value(val, self.allowed_protocols)
+ if new_value is None:
continue
+ val = new_value
# Drop values in svg attrs with non-local IRIs
if namespaced_name in self.svg_attr_val_allows_ref:
diff --git a/tests/test_clean.py b/tests/test_clean.py
index 221addba..f680e8e1 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -213,7 +213,7 @@ def test_nested_script_tag():
('an < entity', 'an < entity'),
('tag < and entity', 'tag < and entity'),
])
-def test_bare_entities(text, expected):
+def test_bare_entities_get_escaped_correctly(text, expected):
assert clean(text) == expected
@@ -277,7 +277,7 @@ def test_bare_entities(text, expected):
# Verify that clean() doesn't unescape entities.
(''"', ''"'),
])
-def test_character_entities(text, expected):
+def test_character_entities_handling(text, expected):
assert clean(text) == expected
@@ -534,10 +534,100 @@ def test_attributes_list():
# Unspecified protocols are not allowed
(
- 'invalid href',
+ 'invalid href',
{'protocols': ['myprotocol']},
'invalid href'
- )
+ ),
+
+ # Anchors are ok
+ (
+ 'foo',
+ {'protocols': []},
+ 'foo'
+ ),
+
+ # Allow implicit http if allowed
+ (
+ 'valid',
+ {'protocols': ['http']},
+ 'valid'
+ ),
+ (
+ 'valid',
+ {'protocols': ['http']},
+ 'valid'
+ ),
+ (
+ 'valid',
+ {'protocols': ['http']},
+ 'valid'
+ ),
+ (
+ 'valid',
+ {'protocols': ['http']},
+ 'valid'
+ ),
+ (
+ 'valid',
+ {'protocols': ['http']},
+ 'valid'
+ ),
+ (
+ 'valid',
+ {'protocols': ['http']},
+ 'valid'
+ ),
+
+ # Disallow implicit http if disallowed
+ (
+ 'foo',
+ {'protocols': []},
+ 'foo'
+ ),
+ (
+ 'foo',
+ {'protocols': []},
+ 'foo'
+ ),
+ (
+ 'foo',
+ {'protocols': []},
+ 'foo'
+ ),
+ (
+ 'foo',
+ {'protocols': []},
+ 'foo'
+ ),
+ (
+ 'foo',
+ {'protocols': []},
+ 'foo'
+ ),
+ (
+ 'foo',
+ {'protocols': []},
+ 'foo'
+ ),
+
+ # Disallowed protocols with sneaky character entities
+ (
+ 'alert',
+ {},
+ 'alert'
+ ),
+ (
+ 'alert',
+ {},
+ 'alert'
+ ),
+
+ # Checking the uri shouldn't change it at all
+ (
+ 'foo',
+ {},
+ 'foo'
+ ),
])
def test_uri_value_allowed_protocols(data, kwargs, expected):
assert clean(data, **kwargs) == expected
From 61bf0e6db3bdce6294633555e08dd061af465c3c Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 5 Mar 2018 16:08:49 -0500
Subject: [PATCH 149/314] Fix errant comment
---
bleach/sanitizer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index ac6a55cb..56f6d960 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -556,7 +556,7 @@ def sanitize_uri_value(self, value, allowed_protocols):
# Convert all character entities in the value
new_value = convert_entities(value)
- # Nix single quote, whitespace, and non-printable charcters
+ # Nix backtick, space characters, and control characters
new_value = re.sub(
"[`\000-\040\177-\240\s]+",
'',
From 9584f42051c0039cb0f27a617e8ab3e945018cc6 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 5 Mar 2018 16:33:03 -0500
Subject: [PATCH 150/314] Prep for 2.1.3 release
---
CHANGES | 30 +++++++++++++++++++++++++++++-
bleach/__init__.py | 4 ++--
docs/dev.rst | 2 +-
3 files changed, 32 insertions(+), 4 deletions(-)
diff --git a/CHANGES b/CHANGES
index 47bf3906..25789814 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,34 @@
-Bleach Changes
+Bleach changes
==============
+Version 2.1.3 (March 5th, 2018)
+-------------------------------
+
+**Security fixes**
+
+* Attributes that have URI values weren't properly sanitized if the
+ values contained character entities. Using character entities, it
+ was possible to construct a URI value with a scheme that was not
+ allowed that would slide through unsanitized.
+
+ This security issue was introduced in Bleach 2.1. Anyone using
+ Bleach 2.1 is highly encouraged to upgrade.
+
+
+**Backwards incompatible changes**
+
+None
+
+**Features**
+
+None
+
+**Bug fixes**
+
+* Fixed some other edge cases for attribute URI value sanitizing and
+ improved testing of this code.
+
+
Version 2.1.2 (December 7th, 2017)
----------------------------------
diff --git a/bleach/__init__.py b/bleach/__init__.py
index 8ed2c516..b81b0bbe 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -33,9 +33,9 @@
# yyyymmdd
-__releasedate__ = ''
+__releasedate__ = '20180305'
# x.y.z or x.y.z.dev0 -- semver
-__version__ = '2.1.3.dev0'
+__version__ = '2.1.3'
VERSION = parse_version(__version__)
diff --git a/docs/dev.rst b/docs/dev.rst
index d27a62ed..b0302524 100644
--- a/docs/dev.rst
+++ b/docs/dev.rst
@@ -74,7 +74,7 @@ Release process
3. Run the doctests::
$ cd docs/
- $ make doctests
+ $ make doctest
4. Verify everything works
From 3e9b9ec55bbec5906800c3838d0840b4741f74d9 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Tue, 6 Mar 2018 09:26:05 -0500
Subject: [PATCH 151/314] Add tests for sanitizing urls in css properties
---
bleach/sanitizer.py | 4 ++--
tests/test_css.py | 52 ++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 53 insertions(+), 3 deletions(-)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 56f6d960..09cae199 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -684,10 +684,10 @@ def disallowed_token(self, token):
def sanitize_css(self, style):
"""Sanitizes css in style tags"""
- # disallow urls
+ # Drop any url values
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
- # gauntlet
+ # The gauntlet of sanitization
# Validate the css in the style tag and if it's not valid, then drop
# the whole thing.
diff --git a/tests/test_css.py b/tests/test_css.py
index d8880d78..ad81f594 100644
--- a/tests/test_css.py
+++ b/tests/test_css.py
@@ -66,7 +66,6 @@
),
])
def test_allowed_css(data, styles, expected):
-
p_single = 'bar
'
p_double = "bar
"
@@ -89,6 +88,57 @@ def test_valid_css():
)
+@pytest.mark.parametrize('data, expected', [
+ # No url--unchanged
+ (
+ 'foo
',
+ 'foo
'
+ ),
+
+ # Verify urls with no quotes, single quotes, and double quotes are all dropped
+ (
+ 'foo
',
+ 'foo
'
+ ),
+ (
+ 'foo
',
+ 'foo
'
+ ),
+ (
+ 'foo
',
+ 'foo
'
+ ),
+
+ # Verify urls with spacing
+ (
+ 'foo
',
+ 'foo
'
+ ),
+ (
+ 'foo
',
+ 'foo
'
+ ),
+ (
+ 'foo
',
+ 'foo
'
+ ),
+ (
+ 'foo
',
+ 'foo
'
+ ),
+
+ # Verify urls with character entities--this isn't valid, so the entire
+ # property is dropped
+ (
+ 'foo
',
+ 'foo
'
+ ),
+
+])
+def test_urls(data, expected):
+ assert clean(data, styles=['background']) == expected
+
+
def test_style_hang():
"""The sanitizer should not hang on any inline styles"""
style = [
From 28e7c3292bded1e91d194117e7d4d93ce855d698 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Fri, 16 Mar 2018 11:34:38 -0400
Subject: [PATCH 152/314] Handle ambiguous ampersands correctly
This fixes the ambiguous ampersand case in character entity handling in
attribute values.
Fixes #359
---
bleach/sanitizer.py | 24 ++++++++++++++++--------
tests/test_clean.py | 31 +++++++++++++++++++++++++++++--
2 files changed, 45 insertions(+), 10 deletions(-)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 09cae199..12225efd 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -90,7 +90,8 @@ def convert_entity(value):
:arg value: the string (minus the ``&`` and ``;`` part) to convert
- :returns: unicode character
+ :returns: unicode character or None if it's an ambiguous ampersand that
+ doesn't match a character entity
"""
if value[0] == '#':
@@ -98,7 +99,7 @@ def convert_entity(value):
return six.unichr(int(value[2:], 16))
return six.unichr(int(value[1:], 10))
- return ENTITIES[value]
+ return ENTITIES.get(value, None)
def convert_entities(text):
@@ -120,11 +121,16 @@ def convert_entities(text):
if part.startswith('&'):
entity = match_entity(part)
if entity is not None:
- new_text.append(convert_entity(entity))
- remainder = part[len(entity) + 2:]
- if part:
- new_text.append(remainder)
- continue
+ converted = convert_entity(entity)
+
+ # If it's not an ambiguous ampersand, then replace with the
+ # unicode character. Otherwise, we leave the entity in.
+ if converted is not None:
+ new_text.append(converted)
+ remainder = part[len(entity) + 2:]
+ if part:
+ new_text.append(remainder)
+ continue
new_text.append(part)
@@ -731,7 +737,9 @@ def escape_base_amp(self, stoken):
if part.startswith('&'):
entity = match_entity(part)
- if entity is not None:
+ # Only leave entities in that are not ambiguous. If they're
+ # ambiguous, then we escape the ampersand.
+ if entity is not None and convert_entity(entity) is not None:
yield '&' + entity + ';'
# Length of the entity plus 2--one for & at the beginning
diff --git a/tests/test_clean.py b/tests/test_clean.py
index f680e8e1..1f3cbfc8 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -4,7 +4,7 @@
import pytest
from bleach import clean
-from bleach.sanitizer import Cleaner
+from bleach.sanitizer import convert_entities, Cleaner
def test_clean_idempotent():
@@ -246,7 +246,7 @@ def test_bare_entities_get_escaped_correctly(text, expected):
'http://example.com?active=true¤t=true'
),
- # Test entities in HTML attributes
+ # Test character entities in attribute values are left alone
(
'foo',
'foo'
@@ -255,11 +255,20 @@ def test_bare_entities_get_escaped_correctly(text, expected):
'foo',
'foo'
),
+
+ # Ambiguous ampersands get escaped in attributes
+ (
+ 'foo',
+ 'foo'
+ ),
(
'foo',
'foo'
),
+ # Ambiguous ampersands in text are not escaped
+ ('&xx;', '&xx;'),
+
# Test numeric entities
(''', '''),
('"', '"'),
@@ -732,6 +741,24 @@ def test_invisible_characters(data, expected):
assert clean(data) == expected
+@pytest.mark.parametrize('data, expected', [
+ # Strings without character entities pass through as is
+ ('', ''),
+ ('abc', 'abc'),
+
+ # Handles character entities--both named and numeric
+ (' ', u'\xa0'),
+ (' ', ' '),
+ (' ', ' '),
+
+ # Handles ambiguous ampersand
+ ('&xx;', '&xx;'),
+])
+def test_convert_entities(data, expected):
+ print(repr(convert_entities(data)))
+ assert convert_entities(data) == expected
+
+
def get_tests():
"""Retrieves regression tests from data/ directory
From 9818ffb81a362f4d141835a291225c2e65706ae2 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Sun, 18 Mar 2018 09:02:59 -0400
Subject: [PATCH 153/314] Add regression test with character entity in url
---
tests/data/6.test | 3 +++
1 file changed, 3 insertions(+)
create mode 100644 tests/data/6.test
diff --git a/tests/data/6.test b/tests/data/6.test
new file mode 100644
index 00000000..7755c813
--- /dev/null
+++ b/tests/data/6.test
@@ -0,0 +1,3 @@
+hi
+--
+hi
From a65f5c8ea664abbd54b4c711ebd0ca26c3509b7e Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 19 Mar 2018 14:39:15 -0400
Subject: [PATCH 154/314] Update CHANGES
---
CHANGES | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/CHANGES b/CHANGES
index 25789814..5a9d5f84 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,26 @@
Bleach changes
==============
+Version 2.1.4 (In development)
+------------------------------
+
+**Security fixes**
+
+None
+
+**Backwards incompatible changes**
+
+None
+
+**Features**
+
+None
+
+**Bug fixes**
+
+* Handle ambiguous ampersands correctly. (#359)
+
+
Version 2.1.3 (March 5th, 2018)
-------------------------------
@@ -14,6 +34,7 @@ Version 2.1.3 (March 5th, 2018)
This security issue was introduced in Bleach 2.1. Anyone using
Bleach 2.1 is highly encouraged to upgrade.
+ https://bugzilla.mozilla.org/show_bug.cgi?id=1442745
**Backwards incompatible changes**
From 3f2270e42582d8f2d7392a54edff997b8675c797 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 19 Mar 2018 17:46:31 -0400
Subject: [PATCH 155/314] Handle nonexistent namespaces better
Issue 352 has a string that manages to tokenize an html attribute with
a namespace, but no name. Then the namespace doesn't exist in prefixes
and that throws a KeyError.
This alleviates that a bit such that if there's a namespace, but no
name, it swaps the two values. Further, if prefixes doesn't have the
namespace, then it ignores the namespace.
Fixes #352
---
bleach/sanitizer.py | 14 +++++++++++++-
tests/test_clean.py | 13 +++++++++++++
2 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 12225efd..faf8fd7a 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -668,8 +668,20 @@ def disallowed_token(self, token):
assert token_type in ("StartTag", "EmptyTag")
attrs = []
for (ns, name), v in token["data"].items():
+ # If we end up with a namespace, but no name, switch them so we
+ # have a valid name to use.
+ if ns and not name:
+ ns, name = name, ns
+
+ # Figure out namespaced name if the namespace is appropriate
+ # and exists; if the ns isn't in prefixes, then drop it.
+ if ns is None or ns not in prefixes:
+ namespaced_name = name
+ else:
+ namespaced_name = '%s:%s' % (prefixes[ns], name)
+
attrs.append(' %s="%s"' % (
- name if ns is None else "%s:%s" % (prefixes[ns], name),
+ namespaced_name,
# NOTE(willkg): HTMLSerializer escapes attribute values
# already, so if we do it here (like HTMLSerializer does),
# then we end up double-escaping.
diff --git a/tests/test_clean.py b/tests/test_clean.py
index 1f3cbfc8..9547d631 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -759,6 +759,19 @@ def test_convert_entities(data, expected):
assert convert_entities(data) == expected
+def test_nonexistent_namespace():
+ """Verify that if the namespace doesn't exist, it doesn't fail with a KeyError
+
+ The tokenizer creates "c" as a namespace and that doesn't exist in the map
+ of namespaces, so then it fails with a KeyError. I don't understand why the
+ tokenizer makes "c" into a namespace in this string.
+
+ Issue #352.
+
+ """
+ assert clean('') == '<d c=""></d>'
+
+
def get_tests():
"""Retrieves regression tests from data/ directory
From 46fa500e2b3275af09e888feb495d1fcd541fb00 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Wed, 21 Mar 2018 21:16:19 -0400
Subject: [PATCH 156/314] Convert entities in CSS values before sanitizing
The CSS is in an HTML attribute value, so we need to convert character
entities in it which makes it proper CSS before we can sanitize it.
Fixes #363
---
bleach/sanitizer.py | 5 ++++-
tests/test_css.py | 22 +++++++++++++++++++---
2 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index faf8fd7a..7e5d0361 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -702,7 +702,10 @@ def disallowed_token(self, token):
def sanitize_css(self, style):
"""Sanitizes css in style tags"""
- # Drop any url values
+ # Convert entities in the style so that it can be parsed as CSS
+ style = convert_entities(style)
+
+ # Drop any url values before we do anything else
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# The gauntlet of sanitization
diff --git a/tests/test_css.py b/tests/test_css.py
index ad81f594..12f27f3c 100644
--- a/tests/test_css.py
+++ b/tests/test_css.py
@@ -127,11 +127,10 @@ def test_valid_css():
'foo
'
),
- # Verify urls with character entities--this isn't valid, so the entire
- # property is dropped
+ # Verify urls with character entities
(
'foo
',
- 'foo
'
+ 'foo
'
),
])
@@ -201,3 +200,20 @@ def test_style_hang():
)
assert clean(html, styles=styles) == expected
+
+
+@pytest.mark.parametrize('data, styles, expected', [
+ (
+ 'text
',
+ ['font-family', 'white-space'],
+ 'text
'
+ ),
+ (
+ 'text
',
+ ['font-family', 'white-space'],
+ 'text
'
+ ),
+])
+def test_css_parsing_with_entities(data, styles, expected):
+ """The sanitizer should be ok with character entities"""
+ assert clean(data, tags=['p'], attributes={'p': ['style']}, styles=styles) == expected
From f1f04f6580e24bd1b977b8be0a1bc1e5d5f944da Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Thu, 7 Jun 2018 13:59:46 -0400
Subject: [PATCH 157/314] Nix pinning in dev requirements
---
requirements.txt | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index 5cfec7f1..758459aa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,13 @@
-e .
-# Requirements to run the test suite:
-pytest==3.0.6
+# Requirements to run the test suite
+pytest
pytest-wholenodeid
-flake8==3.3.0
-tox==2.4.1
+flake8
+tox
# Requirements for building docs
-Sphinx==1.5.2
+Sphinx
# Requirements for updating package
-twine==1.8.1
+twine
From 8f6c2ea0b1155716ced070d87dc2c9d4f664ddcb Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Thu, 7 Jun 2018 14:08:11 -0400
Subject: [PATCH 158/314] Change requirements.txt to requirements-dev.txt
This change makes it clearer what the file is for.
---
MANIFEST.in | 2 +-
docs/dev.rst | 2 +-
requirements.txt => requirements-dev.txt | 0
tox.ini | 6 +++---
4 files changed, 5 insertions(+), 5 deletions(-)
rename requirements.txt => requirements-dev.txt (100%)
diff --git a/MANIFEST.in b/MANIFEST.in
index 14ad79c7..5a0f3385 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,7 +2,7 @@ include CHANGES
include CONTRIBUTORS
include CONTRIBUTING.rst
include CODE_OF_CONDUCT.rst
-include requirements.txt
+include requirements-dev.txt
include tox.ini
include LICENSE
include README.rst
diff --git a/docs/dev.rst b/docs/dev.rst
index b0302524..abeaf913 100644
--- a/docs/dev.rst
+++ b/docs/dev.rst
@@ -50,7 +50,7 @@ Release process
1. Checkout master tip.
-2. Check to make sure ``setup.py`` and ``requirements.txt`` are
+2. Check to make sure ``setup.py`` and ``requirements-dev.txt`` are
correct and match requirements-wise.
3. Update version numbers in ``bleach/__init__.py``.
diff --git a/requirements.txt b/requirements-dev.txt
similarity index 100%
rename from requirements.txt
rename to requirements-dev.txt
diff --git a/tox.ini b/tox.ini
index d44521c9..c58bd532 100644
--- a/tox.ini
+++ b/tox.ini
@@ -18,7 +18,7 @@ basepython =
py35: python3.5
py36: python3.6
deps =
- -rrequirements.txt
+ -rrequirements-dev.txt
html5lib99999999: html5lib==0.99999999
html5lib999999999: html5lib==0.999999999
html5lib10b9: html5lib==1.0b9
@@ -59,7 +59,7 @@ commands =
[testenv:lint]
basepython = python
deps =
- -rrequirements.txt
+ -rrequirements-dev.txt
commands =
flake8 bleach/
@@ -67,6 +67,6 @@ commands =
basepython = python
changedir = docs
deps =
- -rrequirements.txt
+ -rrequirements-dev.txt
commands =
sphinx-build -b html -d {envtmpdir}/doctrees . {envtmpdir}/html
From 63076f4420498571027bb853703f06b3bfd469ff Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Thu, 7 Jun 2018 14:33:19 -0400
Subject: [PATCH 159/314] Fix lint and docs tox environments
---
tox.ini | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tox.ini b/tox.ini
index c58bd532..a5538a7b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -57,14 +57,14 @@ commands =
python setup.py build
[testenv:lint]
-basepython = python
+basepython = python3.6
deps =
-rrequirements-dev.txt
commands =
flake8 bleach/
[testenv:docs]
-basepython = python
+basepython = python3.6
changedir = docs
deps =
-rrequirements-dev.txt
From 9959a1a57c1574806e24ea29209af882c5bdbd95 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Thu, 7 Jun 2018 14:35:53 -0400
Subject: [PATCH 160/314] Update CHANGES re: Python 3.3 support
---
CHANGES | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGES b/CHANGES
index 5a9d5f84..5a01cd8a 100644
--- a/CHANGES
+++ b/CHANGES
@@ -10,7 +10,7 @@ None
**Backwards incompatible changes**
-None
+* Dropped support for Python 3.3. (#328)
**Features**
From b8aae5660693f4d30d76a0b8e7525af1adcbc3cc Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Thu, 7 Jun 2018 15:13:27 -0400
Subject: [PATCH 161/314] Fix requirements file name in travis
---
.travis.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.travis.yml b/.travis.yml
index b6eea407..dfecccf7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,7 +15,7 @@ env:
install:
# html5lib 0.99999999 (8 9s) requires at least setuptools 18.5
- pip install -U pip setuptools>=18.5
- - pip install -r requirements.txt
+ - pip install -r requirements-dev.txt
# stomp on html5lib install with the specified one
- pip install html5lib==$HTML5LIB
script:
From 9319ec77a06c582bd5e7726c0b3c69139ad67732 Mon Sep 17 00:00:00 2001
From: Antoine Leclair
Date: Fri, 29 Jun 2018 13:12:31 -0400
Subject: [PATCH 162/314] Fix error when parsing invalid URI
---
CONTRIBUTORS | 1 +
bleach/sanitizer.py | 11 ++++++++---
tests/test_clean.py | 3 +++
3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 94276246..5783ab17 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -25,6 +25,7 @@ Contributors:
- Alireza Savand
- Andreas Malecki
- Andy Freeland
+- Antoine Leclair
- Anton Kovalyov
- Chris Beaven
- Dan Gayle
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 7e5d0361..31f12400 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -576,9 +576,14 @@ def sanitize_uri_value(self, value, allowed_protocols):
# against
new_value = new_value.lower()
- # Drop attributes with uri values that have protocols that aren't
- # allowed
- parsed = urlparse(new_value)
+ try:
+ # Drop attributes with uri values that have protocols that aren't
+ # allowed
+ parsed = urlparse(new_value)
+ except ValueError:
+ # URI is impossible to parse, therefore it's not allowed
+ return None
+
if parsed.scheme:
# If urlparse found a scheme, check that
if parsed.scheme in allowed_protocols:
diff --git a/tests/test_clean.py b/tests/test_clean.py
index 9547d631..951d5b2a 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -58,6 +58,9 @@ def test_html_is_lowercased():
'foo'
)
+def test_invalid_uri_does_not_raise_error():
+ assert clean('text') == 'text'
+
@pytest.mark.parametrize('data, should_strip, expected', [
# Regular comment
From 8f88b41810ef82f5a1204e45ad8d6c9329b0c0b1 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Thu, 19 Jul 2018 20:56:38 -0400
Subject: [PATCH 163/314] Sync travis and tox environments
This makes sure travis and tox are testing Bleach with the same
configurations.
---
.travis.yml | 22 +++++++++++++---------
tox.ini | 2 ++
2 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index dfecccf7..cd05d9aa 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,17 +1,21 @@
+# Note: If you update this, make sure to update tox.ini, too.
sudo: false
language: python
cache:
directories:
- "~/.cache/pip"
python:
-- "2.7"
-- "3.4"
-- "3.5"
-- "3.6"
-- "pypy"
+ - "2.7"
+ - "3.4"
+ - "3.5"
+ - "3.6"
+ - "pypy"
env:
-- HTML5LIB=0.99999999 # 8
-- HTML5LIB=0.999999999 # 9
+ - HTML5LIB=0.99999999 # 8
+ - HTML5LIB=0.999999999 # 9
+ - HTML5LIB=1.0b9
+ - HTML5LIB=1.0b10
+ - HTML5LIB=1.0.1
install:
# html5lib 0.99999999 (8 9s) requires at least setuptools 18.5
- pip install -U pip setuptools>=18.5
@@ -19,8 +23,8 @@ install:
# stomp on html5lib install with the specified one
- pip install html5lib==$HTML5LIB
script:
-- py.test
-- flake8 bleach/
+ - py.test
+ - flake8 bleach/
deploy:
provider: pypi
user: jezdez
diff --git a/tox.ini b/tox.ini
index a5538a7b..d5539644 100644
--- a/tox.ini
+++ b/tox.ini
@@ -3,6 +3,8 @@
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.
+# Note: If you update this, make sure to update .travis.yml, too.
+
[tox]
envlist =
py{27,34,35,36}-html5lib{99999999,999999999,10b9,10b10,101}
From 9960da4ddd777627fc39d8c1f4a36923102af06d Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Thu, 16 Aug 2018 16:30:33 -0400
Subject: [PATCH 164/314] Update for v2.1.4 release
---
CHANGES | 4 ++--
bleach/__init__.py | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/CHANGES b/CHANGES
index 5a01cd8a..fd17745b 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,8 +1,8 @@
Bleach changes
==============
-Version 2.1.4 (In development)
-------------------------------
+Version 2.1.4 (August 16th, 2018)
+---------------------------------
**Security fixes**
diff --git a/bleach/__init__.py b/bleach/__init__.py
index b81b0bbe..d0d84029 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -33,9 +33,9 @@
# yyyymmdd
-__releasedate__ = '20180305'
+__releasedate__ = '20180816'
# x.y.z or x.y.z.dev0 -- semver
-__version__ = '2.1.3'
+__version__ = '2.1.4'
VERSION = parse_version(__version__)
From ff6e5c53d8888570f06d905cf31f2132b3b946a6 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Thu, 16 Aug 2018 16:47:58 -0400
Subject: [PATCH 165/314] Update for 2.1.5 development
---
CHANGES | 20 ++++++++++++++++++++
bleach/__init__.py | 4 ++--
2 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/CHANGES b/CHANGES
index fd17745b..ddd3e0a2 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,26 @@
Bleach changes
==============
+Version 2.1.5 (in development)
+------------------------------
+
+**Security fixes**
+
+None
+
+**Backwards incompatible changes**
+
+None
+
+**Features**
+
+None
+
+**Bug fixes**
+
+None
+
+
Version 2.1.4 (August 16th, 2018)
---------------------------------
diff --git a/bleach/__init__.py b/bleach/__init__.py
index d0d84029..367fbf42 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -33,9 +33,9 @@
# yyyymmdd
-__releasedate__ = '20180816'
+__releasedate__ = ''
# x.y.z or x.y.z.dev0 -- semver
-__version__ = '2.1.4'
+__version__ = '2.1.5.dev0'
VERSION = parse_version(__version__)
From a507a4ed7e37cd594b8af5b4722bd6b058e9c2c2 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 27 Aug 2018 09:08:47 -0400
Subject: [PATCH 166/314] Drop easy_install instructions
Fixes #373
---
README.rst | 4 ----
1 file changed, 4 deletions(-)
diff --git a/README.rst b/README.rst
index 5f151dc7..6622ee46 100644
--- a/README.rst
+++ b/README.rst
@@ -58,10 +58,6 @@ Bleach is available on PyPI_, so you can install it with ``pip``::
$ pip install bleach
-Or with ``easy_install``::
-
- $ easy_install bleach
-
Upgrading Bleach
================
From 7970857c78bec0060f527277a91a8ca72aaabe8d Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Sun, 26 Aug 2018 15:21:35 -0400
Subject: [PATCH 167/314] vendor html5lib 1.0.1
This vendors html5lib 1.0.1 and in doing that, drops the requirement to
install html5lib.
Fixes #386
---
.gitignore | 6 +-
CHANGES | 3 +-
MANIFEST.in | 3 +-
bleach/__init__.py | 15 -
bleach/_vendor/README.rst | 21 +
bleach/_vendor/__init__.py | 0
.../html5lib-1.0.1.dist-info/DESCRIPTION.rst | 489 +++
.../html5lib-1.0.1.dist-info/INSTALLER | 1 +
.../html5lib-1.0.1.dist-info/LICENSE.txt | 20 +
.../_vendor/html5lib-1.0.1.dist-info/METADATA | 530 +++
.../_vendor/html5lib-1.0.1.dist-info/RECORD | 42 +
bleach/_vendor/html5lib-1.0.1.dist-info/WHEEL | 6 +
.../html5lib-1.0.1.dist-info/metadata.json | 1 +
.../html5lib-1.0.1.dist-info/top_level.txt | 1 +
bleach/_vendor/html5lib/__init__.py | 35 +
bleach/_vendor/html5lib/_ihatexml.py | 288 ++
bleach/_vendor/html5lib/_inputstream.py | 923 ++++++
bleach/_vendor/html5lib/_tokenizer.py | 1721 ++++++++++
bleach/_vendor/html5lib/_trie/__init__.py | 14 +
bleach/_vendor/html5lib/_trie/_base.py | 37 +
bleach/_vendor/html5lib/_trie/datrie.py | 44 +
bleach/_vendor/html5lib/_trie/py.py | 67 +
bleach/_vendor/html5lib/_utils.py | 124 +
bleach/_vendor/html5lib/constants.py | 2947 +++++++++++++++++
bleach/_vendor/html5lib/filters/__init__.py | 0
.../filters/alphabeticalattributes.py | 29 +
bleach/_vendor/html5lib/filters/base.py | 12 +
.../html5lib/filters/inject_meta_charset.py | 73 +
bleach/_vendor/html5lib/filters/lint.py | 93 +
.../_vendor/html5lib/filters/optionaltags.py | 207 ++
bleach/_vendor/html5lib/filters/sanitizer.py | 896 +++++
bleach/_vendor/html5lib/filters/whitespace.py | 38 +
bleach/_vendor/html5lib/html5parser.py | 2791 ++++++++++++++++
bleach/_vendor/html5lib/serializer.py | 409 +++
.../_vendor/html5lib/treeadapters/__init__.py | 30 +
.../_vendor/html5lib/treeadapters/genshi.py | 54 +
bleach/_vendor/html5lib/treeadapters/sax.py | 50 +
.../_vendor/html5lib/treebuilders/__init__.py | 88 +
bleach/_vendor/html5lib/treebuilders/base.py | 417 +++
bleach/_vendor/html5lib/treebuilders/dom.py | 236 ++
bleach/_vendor/html5lib/treebuilders/etree.py | 340 ++
.../html5lib/treebuilders/etree_lxml.py | 366 ++
.../_vendor/html5lib/treewalkers/__init__.py | 154 +
bleach/_vendor/html5lib/treewalkers/base.py | 252 ++
bleach/_vendor/html5lib/treewalkers/dom.py | 43 +
bleach/_vendor/html5lib/treewalkers/etree.py | 130 +
.../html5lib/treewalkers/etree_lxml.py | 213 ++
bleach/_vendor/html5lib/treewalkers/genshi.py | 69 +
bleach/_vendor/pip_install_vendor.sh | 4 +
bleach/_vendor/vendor.txt | 1 +
bleach/linkifier.py | 8 +-
bleach/sanitizer.py | 20 +-
setup.cfg | 4 +
setup.py | 5 +-
tests/test_clean.py | 2 +-
tests/test_linkify.py | 5 +-
tox.ini | 9 +-
57 files changed, 14336 insertions(+), 50 deletions(-)
create mode 100644 bleach/_vendor/README.rst
create mode 100644 bleach/_vendor/__init__.py
create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst
create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/INSTALLER
create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/LICENSE.txt
create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/METADATA
create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/RECORD
create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/WHEEL
create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/metadata.json
create mode 100644 bleach/_vendor/html5lib-1.0.1.dist-info/top_level.txt
create mode 100644 bleach/_vendor/html5lib/__init__.py
create mode 100644 bleach/_vendor/html5lib/_ihatexml.py
create mode 100644 bleach/_vendor/html5lib/_inputstream.py
create mode 100644 bleach/_vendor/html5lib/_tokenizer.py
create mode 100644 bleach/_vendor/html5lib/_trie/__init__.py
create mode 100644 bleach/_vendor/html5lib/_trie/_base.py
create mode 100644 bleach/_vendor/html5lib/_trie/datrie.py
create mode 100644 bleach/_vendor/html5lib/_trie/py.py
create mode 100644 bleach/_vendor/html5lib/_utils.py
create mode 100644 bleach/_vendor/html5lib/constants.py
create mode 100644 bleach/_vendor/html5lib/filters/__init__.py
create mode 100644 bleach/_vendor/html5lib/filters/alphabeticalattributes.py
create mode 100644 bleach/_vendor/html5lib/filters/base.py
create mode 100644 bleach/_vendor/html5lib/filters/inject_meta_charset.py
create mode 100644 bleach/_vendor/html5lib/filters/lint.py
create mode 100644 bleach/_vendor/html5lib/filters/optionaltags.py
create mode 100644 bleach/_vendor/html5lib/filters/sanitizer.py
create mode 100644 bleach/_vendor/html5lib/filters/whitespace.py
create mode 100644 bleach/_vendor/html5lib/html5parser.py
create mode 100644 bleach/_vendor/html5lib/serializer.py
create mode 100644 bleach/_vendor/html5lib/treeadapters/__init__.py
create mode 100644 bleach/_vendor/html5lib/treeadapters/genshi.py
create mode 100644 bleach/_vendor/html5lib/treeadapters/sax.py
create mode 100644 bleach/_vendor/html5lib/treebuilders/__init__.py
create mode 100644 bleach/_vendor/html5lib/treebuilders/base.py
create mode 100644 bleach/_vendor/html5lib/treebuilders/dom.py
create mode 100644 bleach/_vendor/html5lib/treebuilders/etree.py
create mode 100644 bleach/_vendor/html5lib/treebuilders/etree_lxml.py
create mode 100644 bleach/_vendor/html5lib/treewalkers/__init__.py
create mode 100644 bleach/_vendor/html5lib/treewalkers/base.py
create mode 100644 bleach/_vendor/html5lib/treewalkers/dom.py
create mode 100644 bleach/_vendor/html5lib/treewalkers/etree.py
create mode 100644 bleach/_vendor/html5lib/treewalkers/etree_lxml.py
create mode 100644 bleach/_vendor/html5lib/treewalkers/genshi.py
create mode 100755 bleach/_vendor/pip_install_vendor.sh
create mode 100644 bleach/_vendor/vendor.txt
diff --git a/.gitignore b/.gitignore
index 26bbdf8e..c4abbd13 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,10 +4,14 @@ pip-log.txt
.coverage
dist
*.egg-info
-.noseids
build
.tox
docs/_build/
.cache/
.eggs/
.*env*/
+.pytest_cache/
+.python-version
+*~
+*.swp
+__pycache__
diff --git a/CHANGES b/CHANGES
index ddd3e0a2..423c1ecb 100644
--- a/CHANGES
+++ b/CHANGES
@@ -14,7 +14,8 @@ None
**Features**
-None
+* No longer depends on html5lib. html5lib==1.0.1 was vendored into Bleach.
+ (#386)
**Bug fixes**
diff --git a/MANIFEST.in b/MANIFEST.in
index 5a0f3385..2a85593e 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -10,8 +10,7 @@ include README.rst
include docs/conf.py
include docs/Makefile
+recursive-include bleach *.py *.json *.rst *.sh *.txt INSTALLER METADATA RECORD WHEEL
recursive-include docs *.rst
-
recursive-include tests *.py *.test
-
recursive-include tests_website *.html *.py *.rst
diff --git a/bleach/__init__.py b/bleach/__init__.py
index 367fbf42..f953fc51 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
-import warnings
from pkg_resources import parse_version
from bleach.linkifier import (
@@ -18,20 +17,6 @@
)
-import html5lib
-try:
- _html5lib_version = html5lib.__version__.split('.')
- if len(_html5lib_version) < 2:
- _html5lib_version = _html5lib_version + ['0']
-except Exception:
- _h5ml5lib_version = ['unknown', 'unknown']
-
-
-# Bleach 3.0.0 won't support html5lib-python < 1.0.0.
-if _html5lib_version < ['1', '0'] or 'b' in _html5lib_version[1]:
- warnings.warn('Support for html5lib-python < 1.0.0 is deprecated.', DeprecationWarning)
-
-
# yyyymmdd
__releasedate__ = ''
# x.y.z or x.y.z.dev0 -- semver
diff --git a/bleach/_vendor/README.rst b/bleach/_vendor/README.rst
new file mode 100644
index 00000000..41c1d13e
--- /dev/null
+++ b/bleach/_vendor/README.rst
@@ -0,0 +1,21 @@
+=======================
+Vendored library policy
+=======================
+
+To simplify Bleach development, we're now vendoring certain libraries that
+we use.
+
+Vendored libraries must follow these rules:
+
+1. Vendored libraries must be pure Python--no compiling.
+2. Source code for the library is included in this directory.
+3. License must be included in this repo and in the Bleach distribution.
+4. Requirements of the library become requirements of Bleach.
+5. No modifications to the library may be made.
+
+Way to vendor a library or update a version:
+
+1. Update ``vendor.txt`` with the library and version.
+2. Remove old files and directories.
+3. Run ``pip_install_vendor.sh`` and check in everything it produces, including
+   the ``.dist-info`` directory and its contents.
diff --git a/bleach/_vendor/__init__.py b/bleach/_vendor/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst b/bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst
new file mode 100644
index 00000000..c05f8c00
--- /dev/null
+++ b/bleach/_vendor/html5lib-1.0.1.dist-info/DESCRIPTION.rst
@@ -0,0 +1,489 @@
+html5lib
+========
+
+.. image:: https://travis-ci.org/html5lib/html5lib-python.png?branch=master
+ :target: https://travis-ci.org/html5lib/html5lib-python
+
+html5lib is a pure-python library for parsing HTML. It is designed to
+conform to the WHATWG HTML specification, as is implemented by all major
+web browsers.
+
+
+Usage
+-----
+
+Simple usage follows this pattern:
+
+.. code-block:: python
+
+ import html5lib
+ with open("mydocument.html", "rb") as f:
+ document = html5lib.parse(f)
+
+or:
+
+.. code-block:: python
+
+ import html5lib
+    document = html5lib.parse("<p>Hello World!")
+
+By default, the ``document`` will be an ``xml.etree`` element instance.
+Whenever possible, html5lib chooses the accelerated ``ElementTree``
+implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
+
+Two other tree types are supported: ``xml.dom.minidom`` and
+``lxml.etree``. To use an alternative format, specify the name of
+a treebuilder:
+
+.. code-block:: python
+
+ import html5lib
+ with open("mydocument.html", "rb") as f:
+ lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
+
+When using with ``urllib2`` (Python 2), the charset from HTTP should be
+passed into html5lib as follows:
+
+.. code-block:: python
+
+ from contextlib import closing
+ from urllib2 import urlopen
+ import html5lib
+
+ with closing(urlopen("http://example.com/")) as f:
+ document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
+
+When using with ``urllib.request`` (Python 3), the charset from HTTP
+should be passed into html5lib as follows:
+
+.. code-block:: python
+
+ from urllib.request import urlopen
+ import html5lib
+
+ with urlopen("http://example.com/") as f:
+ document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
+
+To have more control over the parser, create a parser object explicitly.
+For instance, to make the parser raise exceptions on parse errors, use:
+
+.. code-block:: python
+
+ import html5lib
+ with open("mydocument.html", "rb") as f:
+ parser = html5lib.HTMLParser(strict=True)
+ document = parser.parse(f)
+
+When you're instantiating parser objects explicitly, pass a treebuilder
+class as the ``tree`` keyword argument to use an alternative document
+format:
+
+.. code-block:: python
+
+ import html5lib
+ parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
+    minidom_document = parser.parse("<p>Hello World!")
+
+More documentation is available at https://html5lib.readthedocs.io/.
+
+
+Installation
+------------
+
+html5lib works on CPython 2.7+, CPython 3.3+ and PyPy. To install it,
+use:
+
+.. code-block:: bash
+
+ $ pip install html5lib
+
+
+Optional Dependencies
+---------------------
+
+The following third-party libraries may be used for additional
+functionality:
+
+- ``datrie`` can be used under CPython to improve parsing performance
+ (though in almost all cases the improvement is marginal);
+
+- ``lxml`` is supported as a tree format (for both building and
+ walking) under CPython (but *not* PyPy where it is known to cause
+ segfaults);
+
+- ``genshi`` has a treewalker (but not builder); and
+
+- ``chardet`` can be used as a fallback when character encoding cannot
+ be determined.
+
+
+Bugs
+----
+
+Please report any bugs on the `issue tracker
+<https://github.com/html5lib/html5lib-python/issues>`_.
+
+
+Tests
+-----
+
+Unit tests require the ``pytest`` and ``mock`` libraries and can be
+run using the ``py.test`` command in the root directory.
+
+Test data are contained in a separate `html5lib-tests
+`_ repository and included
+as a submodule, thus for git checkouts they must be initialized::
+
+ $ git submodule init
+ $ git submodule update
+
+If you have all compatible Python implementations available on your
+system, you can run tests on all of them using the ``tox`` utility,
+which can be found on PyPI.
+
+
+Questions?
+----------
+
+There's a mailing list available for support on Google Groups,
+`html5lib-discuss <http://groups.google.com/group/html5lib-discuss>`_,
+though you may get a quicker response asking on IRC in `#whatwg on
+irc.freenode.net <http://wiki.whatwg.org/wiki/IRC>`_.
+
+Change Log
+----------
+
+1.0.1
+~~~~~
+
+Released on December 7, 2017
+
+Breaking changes:
+
+* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!)
+* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!)
+
+Features:
+
+* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most,
+ Will Kahn-Greene!)
+* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!)
+* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!)
+* Support Python 3.6. (#333) (Thank you, Jon Dufresne!)
+* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!)
+* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon
+ Dufresne, John Vandenberg, Geoffrey Sneddon, Will Kahn-Greene!)
+* Semver-compliant version number.
+
+Bug fixes:
+
+* Add support for setuptools < 18.5 to support environment markers. (Thank you,
+ John Vandenberg!)
+* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!)
+* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank
+ you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!)
+* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will
+ Kahn-Greene!)
+* Include license file in generated wheel package. (#350) (Thank you, Jon
+ Dufresne!)
+* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!)
+* Allow uppercase hex characters in CSS colour check. (#377) (Thank you,
+  Komal Dembla, Hugo!)
+
+
+1.0
+~~~
+
+Released and unreleased on December 7, 2017. Badly packaged release.
+
+
+0.999999999/1.0b10
+~~~~~~~~~~~~~~~~~~
+
+Released on July 15, 2016
+
+* Fix attribute order going to the tree builder to be document order
+ instead of reverse document order(!).
+
+
+0.99999999/1.0b9
+~~~~~~~~~~~~~~~~
+
+Released on July 14, 2016
+
+* **Added ordereddict as a mandatory dependency on Python 2.6.**
+
+* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all``
+ extras that will do the right thing based on the specific
+ interpreter implementation.
+
+* Now requires the ``mock`` package for the testsuite.
+
+* Cease supporting DATrie under PyPy.
+
+* **Remove PullDOM support, as this hasn't ever been properly
+ tested, doesn't entirely work, and as far as I can tell is
+ completely unused by anyone.**
+
+* Move testsuite to ``py.test``.
+
+* **Fix #124: move to webencodings for decoding the input byte stream;
+ this makes html5lib compliant with the Encoding Standard, and
+ introduces a required dependency on webencodings.**
+
+* **Cease supporting Python 3.2 (in both CPython and PyPy forms).**
+
+* **Fix comments containing double-dash with lxml 3.5 and above.**
+
+* **Use scripting disabled by default (as we don't implement
+ scripting).**
+
+* **Fix #11, avoiding the XSS bug potentially caused by serializer
+ allowing attribute values to be escaped out of in old browser versions,
+ changing the quote_attr_values option on serializer to take one of
+ three values, "always" (the old True value), "legacy" (the new option,
+ and the new default), and "spec" (the old False value, and the old
+ default).**
+
+* **Fix #72 by rewriting the sanitizer to apply only to treewalkers
+ (instead of the tokenizer); as such, this will require amending all
+ callers of it to use it via the treewalker API.**
+
+* **Drop support of charade, now that chardet is supported once more.**
+
+* **Replace the charset keyword argument on parse and related methods
+ with a set of keyword arguments: override_encoding, transport_encoding,
+ same_origin_parent_encoding, likely_encoding, and default_encoding.**
+
+* **Move filters._base, treebuilder._base, and treewalkers._base to .base
+ to clarify their status as public.**
+
+* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
+ sanitizer.htmlsanitizer module and move that to sanitizer. This means
+ anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
+ code changes.**
+
+* **Rename treewalkers.lxmletree to .etree_lxml and
+ treewalkers.genshistream to .genshi to have a consistent API.**
+
+* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer,
+ utils) to be underscore prefixed to clarify their status as private.
+
+
+0.9999999/1.0b8
+~~~~~~~~~~~~~~~
+
+Released on September 10, 2015
+
+* Fix #195: fix the sanitizer to drop broken URLs (it threw an
+ exception between 0.9999 and 0.999999).
+
+
+0.999999/1.0b7
+~~~~~~~~~~~~~~
+
+Released on July 7, 2015
+
+* Fix #189: fix the sanitizer to allow relative URLs again (as it did
+ prior to 0.9999/1.0b5).
+
+
+0.99999/1.0b6
+~~~~~~~~~~~~~
+
+Released on April 30, 2015
+
+* Fix #188: fix the sanitizer to not throw an exception when sanitizing
+ bogus data URLs.
+
+
+0.9999/1.0b5
+~~~~~~~~~~~~
+
+Released on April 29, 2015
+
+* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how
+ this sounds, this has no known security implications. No known version
+ of IE (5.5 to current), Firefox (3 to current), Safari (6 to current),
+ Chrome (1 to current), or Opera (12 to current) will run any script
+ provided in these attributes.
+
+* Pass error message to the ParseError exception in strict parsing mode.
+
+* Allow data URIs in the sanitizer, with a whitelist of content-types.
+
+* Add support for Python implementations that don't support lone
+ surrogates (read: Jython). Fixes #2.
+
+* Remove localization of error messages. This functionality was totally
+ unused (and untested that everything was localizable), so we may as
+ well follow numerous browsers in not supporting translating technical
+ strings.
+
+* Expose treewalkers.pprint as a public API.
+
+* Add a documentEncoding property to HTML5Parser, fix #121.
+
+
+0.999
+~~~~~
+
+Released on December 23, 2013
+
+* Fix #127: add work-around for CPython issue #20007: .read(0) on
+ http.client.HTTPResponse drops the rest of the content.
+
+* Fix #115: lxml treewalker can now deal with fragments containing, at
+ their root level, text nodes with non-ASCII characters on Python 2.
+
+
+0.99
+~~~~
+
+Released on September 10, 2013
+
+* No library changes from 1.0b3; released as 0.99 as pip has changed
+ behaviour from 1.4 to avoid installing pre-release versions per
+ PEP 440.
+
+
+1.0b3
+~~~~~
+
+Released on July 24, 2013
+
+* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
+ implementation using it should be moved to
+ ``NonRecursiveTreeWalker``, as everything bundled with html5lib has
+ for years.
+
+* Fix #67 so that ``BufferedStream`` to correctly returns a bytes
+ object, thereby fixing any case where html5lib is passed a
+ non-seekable RawIOBase-like object.
+
+
+1.0b2
+~~~~~
+
+Released on June 27, 2013
+
+* Removed reordering of attributes within the serializer. There is now
+ an ``alphabetical_attributes`` option which preserves the previous
+ behaviour through a new filter. This allows attribute order to be
+ preserved through html5lib if the tree builder preserves order.
+
+* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
+ ``treeadapters.sax.to_sax`` which is generic and supports any
+ treewalker; it also resolves all known bugs with ``dom2sax``.
+
+* Fix treewalker assertions on hitting bytes strings on
+ Python 2. Previous to 1.0b1, treewalkers coped with mixed
+ bytes/unicode data on Python 2; this reintroduces this prior
+ behaviour on Python 2. Behaviour is unchanged on Python 3.
+
+
+1.0b1
+~~~~~
+
+Released on May 17, 2013
+
+* Implementation updated to implement the `HTML specification
+ `_ as of 5th May
+ 2013 (`SVN `_ revision r7867).
+
+* Python 3.2+ supported in a single codebase using the ``six`` library.
+
+* Removed support for Python 2.5 and older.
+
+* Removed the deprecated Beautiful Soup 3 treebuilder.
+ ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
+ since it doesn't support namespaces, foreign content like SVG and
+ MathML is parsed incorrectly.
+
+* Removed ``simpletree`` from the package. The default tree builder is
+ now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
+ available, and ``xml.etree.ElementTree`` otherwise).
+
+* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
+ output was well-formed XML, and hence provided little of use.
+
+* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
+ longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
+ return the default DOM treebuilder, which uses ``xml.dom.minidom``.
+
+* Optional heuristic character encoding detection now based on
+ ``charade`` for Python 2.6 - 3.3 compatibility.
+
+* Optional ``Genshi`` treewalker support fixed.
+
+* Many bugfixes, including:
+
+ * #33: null in attribute value breaks XML AttValue;
+
+ * #4: nested, indirect descendant,