diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md new file mode 100644 index 00000000..d13db676 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -0,0 +1,37 @@ +--- +name: bug report +about: Create a report for a bug or regression +title: 'bug: ' +labels: untriaged +assignees: '' + +--- + +**Describe the bug** + +A clear and concise description of what the bug is. [e.g. "`bleach.clean` does not escape script tag contents"] + +**Python and Bleach versions (please complete the following information):** + + - Python Version: [e.g. 3.8.2] + - Bleach Version: [e.g. 3.2.0] + +**To Reproduce** + +Steps to reproduce the behavior: + +[e.g. ```python +>>> bleach.clean(">&") +">&" +```] + +**Expected behavior** + +[e.g. ```python +>>> bleach.clean(">&") +'>"><script>alert("XSS")</script>&' +```] + +**Additional context** + +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md new file mode 100644 index 00000000..79b7fcdc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -0,0 +1,31 @@ +--- +name: feature request +about: Suggest an idea for this project +title: 'feature: ' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** + +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** + +A clear and concise description of what you want to happen. + +Does this feature modify an existing method e.g. `clean`, `linkify`? + +If you have an implementation in mind include an example call and output [e.g. ```python +>>> bleach.clean(always_error=True) +Exception("always raise exception enabled") +```] + +**Describe alternatives you've considered** + +A clear and concise description of any alternative solutions or features you've considered. 
+ +**Additional context** + +Add any other context or screenshots about the feature request here. diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..b75d2e66 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,40 @@ +name: Lint + +on: [push, pull_request] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [3.8] + os: [ubuntu-latest] + mode: [lint, vendorverify, docs] + + steps: + - uses: actions/checkout@v2 + + - name: Cache + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: + ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/setup.py') + }} + restore-keys: | + ${{ matrix.os }}-${{ matrix.python-version }}- + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install -U pip setuptools>=18.5 + python -m pip install -r requirements-dev.txt + + - name: Tests + shell: bash + run: ./scripts/run_tests.sh ${{ matrix.mode }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..baea09bd --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,44 @@ +name: Test + +on: [push, pull_request] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [3.6, 3.7, 3.8, pypy3] + os: [ubuntu-18.04, ubuntu-16.04, macos-latest, windows-latest] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Get pip cache dir + id: pip-cache + run: | + echo "::set-output name=dir::$(pip cache dir)" + + - name: Cache + uses: actions/cache@v2 + with: + path: ${{ steps.pip-cache.outputs.dir }} + key: + ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/setup.py') + }} + 
restore-keys: | + ${{ matrix.os }}-${{ matrix.python-version }}- + + - name: Install dependencies + run: | + python -m pip install -U pip setuptools>=18.5 + python -m pip install -r requirements-dev.txt + + - name: Tests + shell: bash + run: ./scripts/run_tests.sh diff --git a/.gitignore b/.gitignore index c24310fb..88f9443f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,16 @@ pip-log.txt .coverage dist *.egg-info -.noseids build .tox docs/_build/ +.cache/ +.eggs/ +.*env*/ +.pytest_cache/ +.python-version +*~ +*.swp +__pycache__ +venv/ +.idea/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 193f70a0..00000000 --- a/.travis.yml +++ /dev/null @@ -1,14 +0,0 @@ -sudo: false -language: python -python: - - "2.6" - - "2.7" - - "3.2" - - "3.3" - - "3.4" - - "pypy" -install: - - "pip install -r requirements.txt" -script: - - nosetests - - flake8 bleach/ diff --git a/CHANGES b/CHANGES index 4588b80a..cec900b4 100644 --- a/CHANGES +++ b/CHANGES @@ -1,31 +1,706 @@ -Bleach Changes +Bleach changes ============== -Version 1.5? (in progress) --------------------------- +Version 4.0.0 (August 3rd, 2021) +-------------------------------- + +**Backwards incompatible changes** + +* Drop support for unsupported Python versions <3.6 #520 + +**Security fixes** + +None + +**Features** + +* fix attribute name in the linkify docs (thanks @CheesyFeet!) + +Version 3.3.1 (July 14th, 2021) +------------------------------- + +**Security fixes** + +None + +**Features** + +* add more tests for CVE-2021-23980 / GHSA-vv2x-vrpj-qqpq +* bump python version to 3.8 for tox doc, vendorverify, and lint targets +* update bug report template tag +* update vendorverify script to detect and fail when extra files are vendored +* update release process docs to check vendorverify passes locally + +**Bug fixes** + +* remove extra vendored django present in the v3.3.0 whl #595 +* duplicate h1 header doc fix (thanks Nguyễn Gia Phong / @McSinyx!) 
+ +Version 3.3.0 (February 1st, 2021) +---------------------------------- + +**Backwards incompatible changes** + +* clean escapes HTML comments even when strip_comments=False + +**Security fixes** + +* Fix bug 1621692 / GHSA-m6xf-fq7q-8743. See the advisory for details. + +**Features** + +None + +**Bug fixes** + +None + +Version 3.2.3 (January 26th, 2021) +---------------------------------- + +**Security fixes** + +None + +**Features** + +None + +**Bug fixes** + +* fix clean and linkify raising ValueErrors for certain inputs. Thank you @Google-Autofuzz. + +Version 3.2.2 (January 20th, 2021) +---------------------------------- + +**Security fixes** + +None + +**Features** + +* Migrate CI to Github Actions. Thank you @hugovk. + +**Bug fixes** + +* fix linkify raising an IndexError on certain inputs. Thank you @Google-Autofuzz. + +Version 3.2.1 (September 18th, 2020) +------------------------------------ + +**Security fixes** + +None + +**Features** + +None + +**Bug fixes** + +* change linkifier to add rel="nofollow" as documented. Thank you @mitar. +* suppress html5lib sanitizer DeprecationWarnings #557 + +Version 3.2.0 (September 16th, 2020) +------------------------------------ + +**Security fixes** + +None + +**Features** + +None + +**Bug fixes** + +* ``html5lib`` dependency to version 1.1.0. Thank you Sam Sneddon. +* update tests_website terminology. Thank you Thomas Grainger. + +Version 3.1.5 (April 29th, 2020) +-------------------------------- + +**Security fixes** + +None + +**Features** + +None + +**Bug fixes** + +* replace missing ``setuptools`` dependency with ``packaging``. Thank you Benjamin Peterson. + +Version 3.1.4 (March 24th, 2020) +-------------------------------- + +**Security fixes** + +* ``bleach.clean`` behavior parsing style attributes could result in a + regular expression denial of service (ReDoS). + + Calls to ``bleach.clean`` with an allowed tag with an allowed + ``style`` attribute were vulnerable to ReDoS. 
For example, + ``bleach.clean(..., attributes={'a': ['style']})``. + + This issue was confirmed in Bleach versions v3.1.3, v3.1.2, v3.1.1, + v3.1.0, v3.0.0, v2.1.4, and v2.1.3. Earlier versions used a similar + regular expression and should be considered vulnerable too. + + Anyone using Bleach <=v3.1.3 is encouraged to upgrade. + + https://bugzilla.mozilla.org/show_bug.cgi?id=1623633 + +**Backwards incompatible changes** + +* Style attributes with dashes, or single or double quoted values are + cleaned instead of passed through. + +**Features** + +None + +**Bug fixes** + +None + +Version 3.1.3 (March 17th, 2020) +-------------------------------- + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +* Add relative link to code of conduct. (#442) + +* Drop deprecated 'setup.py test' support. (#507) + +* Fix typo: curren -> current in tests/test_clean.py (#504) + +* Test on PyPy 7 + +* Drop test support for end of life Python 3.4 + +**Bug fixes** + +None + +Version 3.1.2 (March 11th, 2020) +-------------------------------- + +**Security fixes** + +* ``bleach.clean`` behavior parsing embedded MathML and SVG content + with RCDATA tags did not match browser behavior and could result in + a mutation XSS. + + Calls to ``bleach.clean`` with ``strip=False`` and ``math`` or + ``svg`` tags and one or more of the RCDATA tags ``script``, + ``noscript``, ``style``, ``noframes``, ``iframe``, ``noembed``, or + ``xmp`` in the allowed tags whitelist were vulnerable to a mutation + XSS. + + This security issue was confirmed in Bleach version v3.1.1. Earlier + versions are likely affected too. + + Anyone using Bleach <=v3.1.1 is encouraged to upgrade. 
+ + https://bugzilla.mozilla.org/show_bug.cgi?id=1621692 + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +None + +Version 3.1.1 (February 13th, 2020) +----------------------------------- + +**Security fixes** + +* ``bleach.clean`` behavior parsing ``noscript`` tags did not match + browser behavior. + + Calls to ``bleach.clean`` allowing ``noscript`` and one or more of + the raw text tags (``title``, ``textarea``, ``script``, ``style``, + ``noembed``, ``noframes``, ``iframe``, and ``xmp``) were vulnerable + to a mutation XSS. + + This security issue was confirmed in Bleach versions v2.1.4, v3.0.2, + and v3.1.0. Earlier versions are probably affected too. + + Anyone using Bleach <=v3.1.0 is highly encouraged to upgrade. + + https://bugzilla.mozilla.org/show_bug.cgi?id=1615315 + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +None + +Version 3.1.0 (January 9th, 2019) +--------------------------------- + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +* Add ``recognized_tags`` argument to the linkify ``Linker`` class. This + fixes issues when linkifying on its own and having some tags get escaped. + It defaults to a list of HTML5 tags. Thank you, Chad Birch! (#409) + +**Bug fixes** + +* Add ``six>=1.9`` to requirements. Thank you, Dave Shawley (#416) + +* Fix cases where attribute names could have invalid characters in them. + (#419) + +* Fix problems with ``LinkifyFilter`` not being able to match links + across ``&``. (#422) + +* Fix ``InputStreamWithMemory`` when the ``BleachHTMLParser`` is + parsing ``meta`` tags. (#431) + +* Fix doctests. (#357) + + +Version 3.0.2 (October 11th, 2018) +---------------------------------- + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +* Merge ``Characters`` tokens after sanitizing them. 
This fixes issues in the + ``LinkifyFilter`` where it was only linkifying parts of urls. (#374) + + +Version 3.0.1 (October 9th, 2018) +--------------------------------- + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +* Support Python 3.7. It supported Python 3.7 just fine, but we added 3.7 to + the list of Python environments we test so this is now officially supported. + (#377) + +**Bug fixes** + +* Fix ``list`` object has no attribute ``lower`` in ``clean``. (#398) +* Fix ``abbr`` getting escaped in ``linkify``. (#400) + + +Version 3.0.0 (October 3rd, 2018) +--------------------------------- + +**Security fixes** + +None + +**Backwards incompatible changes** + +* A bunch of functions were moved from one module to another. + + These were moved from ``bleach.sanitizer`` to ``bleach.html5lib_shim``: + + * ``convert_entity`` + * ``convert_entities`` + * ``match_entity`` + * ``next_possible_entity`` + * ``BleachHTMLSerializer`` + * ``BleachHTMLTokenizer`` + * ``BleachHTMLParser`` + + These functions and classes weren't documented and aren't part of the + public API, but people read code and might be using them so we're + considering it an incompatible API change. + + If you're using them, you'll need to update your code. + +**Features** + +* Bleach no longer depends on html5lib. html5lib==1.0.1 is now vendored into + Bleach. You can remove it from your requirements file if none of your other + requirements require html5lib. + + This means Bleach will now work fine with other libraries that depend on + html5lib regardless of what version of html5lib they require. (#386) + +**Bug fixes** + +* Fixed tags getting added when using clean or linkify. This was a + long-standing regression from the Bleach 2.0 rewrite. (#280, #392) + +* Fixed ```` getting replaced with a string. Now it gets escaped or + stripped depending on whether it's in the allowed tags or not. 
(#279) + + +Version 2.1.4 (August 16th, 2018) +--------------------------------- + +**Security fixes** + +None + +**Backwards incompatible changes** + +* Dropped support for Python 3.3. (#328) + +**Features** + +None + +**Bug fixes** + +* Handle ambiguous ampersands correctly. (#359) + + +Version 2.1.3 (March 5th, 2018) +------------------------------- + +**Security fixes** + +* Attributes that have URI values weren't properly sanitized if the + values contained character entities. Using character entities, it + was possible to construct a URI value with a scheme that was not + allowed that would slide through unsanitized. + + This security issue was introduced in Bleach 2.1. Anyone using + Bleach 2.1 is highly encouraged to upgrade. + + https://bugzilla.mozilla.org/show_bug.cgi?id=1442745 + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +* Fixed some other edge cases for attribute URI value sanitizing and + improved testing of this code. + + +Version 2.1.2 (December 7th, 2017) +---------------------------------- + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +* Support html5lib-python 1.0.1. (#337) + +* Add deprecation warning for supporting html5lib-python < 1.0. + +* Switch to semver. + + +Version 2.1.1 (October 2nd, 2017) +--------------------------------- + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +None + +**Bug fixes** + +* Fix ``setup.py`` opening files when ``LANG=``. (#324) + + +Version 2.1 (September 28th, 2017) +---------------------------------- + +**Security fixes** + +* Convert control characters (backspace particularly) to "?" preventing + malicious copy-and-paste situations. (#298) + + See `<https://github.com/mozilla/bleach/issues/298>`_ for more details. + + This affects all previous versions of Bleach. Check the comments on that + issue for ways to alleviate the issue if you can't upgrade to Bleach 2.1. 
+ + +**Backwards incompatible changes** + +* Redid versioning. ``bleach.VERSION`` is no longer available. Use the string + version at ``bleach.__version__`` and parse it with + ``pkg_resources.parse_version``. (#307) + +* clean, linkify: linkify and clean should only accept text types; thank you, + Janusz! (#292) + +* clean, linkify: accept only unicode or utf-8-encoded str (#176) + + +**Features** + + +**Bug fixes** + +* ``bleach.clean()`` no longer unescapes entities including ones that are missing + a ``;`` at the end which can happen in urls and other places. (#143) + +* linkify: fix http links inside of mailto links; thank you, sedrubal! (#300) + +* clarify security policy in docs (#303) + +* fix dependency specification for html5lib 1.0b8, 1.0b9, and 1.0b10; thank you, + Zoltán! (#268) + +* add Bleach vs. html5lib comparison to README; thank you, Stu Cox! (#278) + +* fix KeyError exceptions on tags without href attr; thank you, Alex Defsen! + (#273) + +* add test website and scripts to test ``bleach.clean()`` output in browser; + thank you, Greg Guthe! + + +Version 2.0 (March 8th, 2017) +----------------------------- + +**Security fixes** + +* None + + +**Backwards incompatible changes** + +* Removed support for Python 2.6. #206 + +* Removed support for Python 3.2. #224 + +* Bleach no longer supports html5lib < 0.99999999 (8 9s). + + This version is a rewrite to use the new sanitizing API since the old + one was dropped in html5lib 0.99999999 (8 9s). + + If you're using 0.9999999 (7 9s) upgrade to 0.99999999 (8 9s) or higher. + + If you're using 1.0b8 (equivalent to 0.9999999 (7 9s)), upgrade to 1.0b9 + (equivalent to 0.99999999 (8 9s)) or higher. + +* ``bleach.clean`` and friends were rewritten + + ``clean`` was reimplemented as an html5lib filter and happens at a different + step in the HTML parsing -> traversing -> serializing process. Because of + that, there are some differences in clean's output as compared with previous + versions. 
+ + Amongst other things, this version will add end tags even if the tag in + question is to be escaped. + +* ``bleach.clean`` and friends attribute callables now take three arguments: + tag, attribute name and attribute value. Previously they only took attribute + name and attribute value. + + All attribute callables will need to be updated. + +* ``bleach.linkify`` was rewritten + + ``linkify`` was reimplemented as an html5lib Filter. As such, it no longer + accepts a ``tokenizer`` argument. + + The callback functions for adjusting link attributes now take a namespaced + attribute. + + Previously you'd do something like this:: + + def check_protocol(attrs, is_new): + if not attrs.get('href', '').startswith(('http:', 'https:')): + return None + return attrs + + Now it's more like this:: + + def check_protocol(attrs, is_new): + if not attrs.get((None, u'href'), u'').startswith(('http:', 'https:')): + # ^^^^^^^^^^^^^^^ + return None + return attrs + + Further, you need to make sure you're always using unicode values. If you + don't then html5lib will raise an assertion error that the value is not + unicode. + + All linkify filters will need to be updated. + +* ``bleach.linkify`` and friends had a ``skip_pre`` argument--that's been + replaced with a more general ``skip_tags`` argument. + + Before, you might do:: + + bleach.linkify(some_text, skip_pre=True) + + The equivalent with Bleach 2.0 is:: + + bleach.linkify(some_text, skip_tags=['pre']) + + You can skip other tags, too, like ``style`` or ``script`` or other places + where you don't want linkification happening. + + All uses of linkify that use ``skip_pre`` will need to be updated. + + +**Changes** + +* Supports Python 3.6. + +* Supports html5lib >= 0.99999999 (8 9s). + +* There's a ``bleach.sanitizer.Cleaner`` class that you can instantiate with your + favorite clean settings for easy reuse. 
+ +* There's a ``bleach.linkifier.Linker`` class that you can instantiate with your + favorite linkify settings for easy reuse. + +* There's a ``bleach.linkifier.LinkifyFilter`` which is an html5lib filter that + you can pass as a filter to ``bleach.sanitizer.Cleaner`` allowing you to clean + and linkify in one pass. + +* ``bleach.clean`` and friends can now take a callable as an attributes arg value. + +* Tons of bug fixes. + +* Cleaned up tests. + +* Documentation fixes. + + +Version 1.5 (November 4th, 2016) +-------------------------------- + +**Security fixes** + +* None **Backwards incompatible changes** -- clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and - mailto. Previously it was a long list of protocols something like ed2k, ftp, - http, https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, - feed, urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149 +* clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and + mailto. + + Previously it was a long list of protocols something like ed2k, ftp, http, + https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, feed, + urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149 **Changes** -- clean: Added ``protocols`` to arguments list to let you override the list of +* clean: Added ``protocols`` to arguments list to let you override the list of allowed protocols. Thank you, Andreas Malecki! #149 +* linkify: Fix a bug involving periods at the end of an email address. Thank you, + Lorenz Schori! #219 + +* linkify: Fix linkification of non-ascii ports. Thank you, Alexandre Macabies! + #207 + +* linkify: Fix linkify inappropriately removing node tails when dropping nodes. + #132 + +* Fixed a test that failed periodically. #161 + +* Switched from nose to py.test. #204 + +* Add test matrix for all supported Python and html5lib versions. #230 + +* Limit to html5lib ``>=0.999,!=0.9999,!=0.99999,<0.99999999`` because 0.9999 + and 0.99999 are busted. 
+ +* Add support for ``python setup.py test``. #97 + + +Version 1.4.3 (May 23rd, 2016) +------------------------------ + +**Security fixes** + +* None + +**Changes** + +* Limit to html5lib ``>=0.999,<0.99999999`` because of impending change to + sanitizer api. #195 + Version 1.4.2 (September 11, 2015) ---------------------------------- **Changes** -- linkify: Fix hang in linkify with parse_email=True. #124 -- linkify: Fix crash in linkify when removing a link that is a first-child. #136 -- Updated TLDs. -- linkify: Don't remove exterior brackets when linkifying. #146 +* linkify: Fix hang in linkify with ``parse_email=True``. #124 + +* linkify: Fix crash in linkify when removing a link that is a first-child. #136 + +* Updated TLDs. + +* linkify: Don't remove exterior brackets when linkifying. #146 Version 1.4.1 (December 15, 2014) @@ -33,8 +708,9 @@ Version 1.4.1 (December 15, 2014) **Changes** -- Consistent order of attributes in output. -- Python 3.4 support. +* Consistent order of attributes in output. + +* Python 3.4 support. Version 1.4 (January 12, 2014) @@ -42,44 +718,54 @@ Version 1.4 (January 12, 2014) **Changes** -- linkify: Update linkify to use etree type Treewalker instead of simpletree. -- Updated html5lib to version >= 0.999. -- Update all code to be compatible with Python 3 and 2 using six. -- Switch to Apache License. +* linkify: Update linkify to use etree type Treewalker instead of simpletree. + +* Updated html5lib to version ``>=0.999``. + +* Update all code to be compatible with Python 3 and 2 using six. + +* Switch to Apache License. Version 1.3 ----------- -- Used by Python 3-only fork. +* Used by Python 3-only fork. Version 1.2.2 (May 18, 2013) ---------------------------- -- Pin html5lib to version 0.95 for now due to major API break. +* Pin html5lib to version 0.95 for now due to major API break. 
+ Version 1.2.1 (February 19, 2013) --------------------------------- -- clean() no longer considers "feed:" an acceptable protocol due to +* ``clean()`` no longer considers ``feed:`` an acceptable protocol due to inconsistencies in browser behavior. Version 1.2 (January 28, 2013) ------------------------------ -- linkify() has changed considerably. Many keyword arguments have been - replaced with a single callbacks list. Please see the documentation - for more information. -- Bleach will no longer consider unacceptable protocols when linkifying. -- linkify() now takes a tokenizer argument that allows it to skip +* ``linkify()`` has changed considerably. Many keyword arguments have been + replaced with a single callbacks list. Please see the documentation for more + information. + +* Bleach will no longer consider unacceptable protocols when linkifying. + +* ``linkify()`` now takes a tokenizer argument that allows it to skip sanitization. -- delinkify() is gone. -- Removed exception handling from _render. clean() and linkify() may now - throw. -- linkify() correctly ignores case for protocols and domain names. -- linkify() correctly handles markup within an tag. + +* ``delinkify()`` is gone. + +* Removed exception handling from ``_render``. ``clean()`` and ``linkify()`` may + now throw. + +* ``linkify()`` correctly ignores case for protocols and domain names. + +* ``linkify()`` correctly handles markup within an tag. Version 1.1.5 @@ -93,61 +779,75 @@ Version 1.1.4 Version 1.1.3 (July 10, 2012) ----------------------------- -- Fix parsing bare URLs when parse_email=True. +* Fix parsing bare URLs when parse_email=True. Version 1.1.2 (June 1, 2012) ---------------------------- -- Fix hang in style attribute sanitizer. (#61) -- Allow '/' in style attribute values. +* Fix hang in style attribute sanitizer. (#61) + +* Allow ``/`` in style attribute values. Version 1.1.1 (February 17, 2012) --------------------------------- -- Fix tokenizer for html5lib 0.9.5. 
+* Fix tokenizer for html5lib 0.9.5. Version 1.1.0 (October 24, 2011) -------------------------------- -- linkify() now understands port numbers. (#38) -- Documented character encoding behavior. (#41) -- Add an optional target argument to linkify(). -- Add delinkify() method. (#45) -- Support subdomain whitelist for delinkify(). (#47, #48) +* ``linkify()`` now understands port numbers. (#38) + +* Documented character encoding behavior. (#41) + +* Add an optional target argument to ``linkify()``. + +* Add ``delinkify()`` method. (#45) + +* Support subdomain whitelist for ``delinkify()``. (#47, #48) Version 1.0.4 (September 2, 2011) --------------------------------- -- Switch to SemVer git tags. -- Make linkify() smarter about trailing punctuation. (#30) -- Pass exc_info to logger during rendering issues. -- Add wildcard key for attributes. (#19) -- Make linkify() use the HTMLSanitizer tokenizer. (#36) -- Fix URLs wrapped in parentheses. (#23) -- Make linkify() UTF-8 safe. (#33) +* Switch to SemVer git tags. + +* Make ``linkify()`` smarter about trailing punctuation. (#30) + +* Pass ``exc_info`` to logger during rendering issues. + +* Add wildcard key for attributes. (#19) + +* Make ``linkify()`` use the ``HTMLSanitizer`` tokenizer. (#36) + +* Fix URLs wrapped in parentheses. (#23) + +* Make ``linkify()`` UTF-8 safe. (#33) Version 1.0.3 (June 14, 2011) ----------------------------- -- linkify() works with 3rd level domains. (#24) -- clean() supports vendor prefixes in style values. (#31, #32) -- Fix linkify() email escaping. +* ``linkify()`` works with 3rd level domains. (#24) + +* ``clean()`` supports vendor prefixes in style values. (#31, #32) + +* Fix ``linkify()`` email escaping. Version 1.0.2 (June 6, 2011) ---------------------------- -- linkify() supports email addresses. -- clean() supports callables in attributes filter. +* ``linkify()`` supports email addresses. + +* ``clean()`` supports callables in attributes filter. 
Version 1.0.1 (April 12, 2011) ------------------------------ -- linkify() doesn't drop trailing slashes. (#21) -- linkify() won't linkify 'libgl.so.1'. (#22) +* ``linkify()`` doesn't drop trailing slashes. (#21) +* ``linkify()`` won't linkify 'libgl.so.1'. (#22) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..498baa3f --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,15 @@ +# Community Participation Guidelines + +This repository is governed by Mozilla's code of conduct and etiquette guidelines. +For more details, please read the +[Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). + +## How to Report +For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. + + diff --git a/CODE_OF_CONDUCT.rst b/CODE_OF_CONDUCT.rst new file mode 100644 index 00000000..da20d8db --- /dev/null +++ b/CODE_OF_CONDUCT.rst @@ -0,0 +1,9 @@ +Code of conduct +=============== + +This project and repository is governed by Mozilla's code of conduct and +etiquette guidelines. For more details please see the `Mozilla Community +Participation Guidelines +`_ and +`Developer Etiquette Guidelines +`_. diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 3eb6c7f8..20624e19 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -1,14 +1,17 @@ Bleach was originally written and maintained by James Socol and various contributors within and without the Mozilla Corporation and Foundation. -It is currently maintained by Jannis Leidel and Will Kahn-Greene. + +It is currently maintained by Will Kahn-Greene, Greg Guthe, and Jon Dufresne. 
Maintainers: -- Jannis Leidel - Will Kahn-Greene +- Greg Guthe +- Jon Dufresne Maintainer emeritus: +- Jannis Leidel - James Socol Contributors: @@ -16,30 +19,65 @@ Contributors: - Adam Lofts - Adrian "ThiefMaster" - Alek +- Alex Defsen - Alex Ehlke +- Alexandre Macabies +- Alexandr N. Zamaraev - Alireza Savand - Andreas Malecki - Andy Freeland +- Antoine Leclair +- Anton Backer - Anton Kovalyov +- Benjamin Peterson +- Chad Birch +- CheesyFeet - Chris Beaven +- Dan Gayle +- dave-shawley +- dbxnr - Erik Rose - Gaurav Dadhania +- Geoffrey Sneddon +- Google Autofuzz Team +- Greg Guthe +- hugovk +- Istvan Albert - Jaime Irurzun +- James Socol +- Jannis Leidel +- Janusz Kamieński - Jeff Balogh +- Jonathan Vanasco +- Jon Dufresne - Lee, Cheon-il - Les Orchard +- Lorenz Schori - Luis Nell - Marc Abramowitz - Marc DM - Mark Lee - Mark Paschal +- mdxs +- mitar +- Nguyễn Gia Phong +- Nikita Sobolev +- nikolas - Oh Jinkyun - Paul Craciunoiu - Ricky Rosario - Ryan Niemeyer +- Sam Sneddon - Sébastien Fievet +- sedrubal +- Stephane Blondon +- Stu Cox +- Thomas Grainger +- Tim Dumol - Timothy Fitz +- Tim Gates +- Vadim Kotov - Vitaly Volkov -- mdxs -- nikolas +- Will Kahn-Greene +- Zoltán - zyegfryed diff --git a/LICENSE b/LICENSE index b0cde3ee..467c38e4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2014-2015, Mozilla Foundation +Copyright (c) 2014-2017, Mozilla Foundation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/MANIFEST.in b/MANIFEST.in index 9d5d250d..70e794fd 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,18 @@ +include CHANGES +include CONTRIBUTORS +include CONTRIBUTING.rst +include CODE_OF_CONDUCT.rst +include requirements-dev.txt +include tox.ini include LICENSE include README.rst + +include docs/conf.py +include docs/Makefile + +include scripts/* + +recursive-include bleach *.py *.json *.rst *.sh *.txt INSTALLER METADATA RECORD WHEEL +recursive-include docs *.rst +recursive-include tests *.py *.test +recursive-include tests_website *.html *.py *.rst diff --git a/README.rst b/README.rst index 8f9ce05d..23562409 100644 --- a/README.rst +++ b/README.rst @@ -2,13 +2,16 @@ Bleach ====== -.. image:: https://travis-ci.org/mozilla/bleach.png?branch=master - :target: https://travis-ci.org/mozilla/bleach +.. image:: https://github.com/mozilla/bleach/workflows/Test/badge.svg + :target: https://github.com/mozilla/bleach/actions?query=workflow%3ATest -.. image:: https://badge.fury.io/py/Bleach.svg - :target: http://badge.fury.io/py/Bleach +.. image:: https://github.com/mozilla/bleach/workflows/Lint/badge.svg + :target: https://github.com/mozilla/bleach/actions?query=workflow%3ALint -Bleach is a whitelist-based HTML sanitizing library that escapes or strips +.. image:: https://badge.fury.io/py/bleach.svg + :target: http://badge.fury.io/py/bleach + +Bleach is an allowed-list-based HTML sanitizing library that escapes or strips markup and attributes. Bleach can also linkify text safely, applying filters that Django's ``urlize`` @@ -30,7 +33,6 @@ fixes. You can find full documentation on `ReadTheDocs`_. :Code: https://github.com/mozilla/bleach :Documentation: https://bleach.readthedocs.io/ :Issue tracker: https://github.com/mozilla/bleach/issues -:IRC: ``#bleach`` on irc.mozilla.org :License: Apache License v2; see LICENSE file @@ -51,24 +53,27 @@ please read our wiki page at ``_. 
-Installing Bleach -================= +Security +======== -Bleach is available on PyPI_, so you can install it with ``pip``:: +Bleach is a security-focused library. - $ pip install bleach +We have a responsible security vulnerability reporting process. Please use +that if you're reporting a security issue. -Or with ``easy_install``:: +Security issues are fixed in private. After we land such a fix, we'll do a +release. - $ easy_install bleach +For every release, we mark security issues we've fixed in the ``CHANGES`` in +the **Security fixes** section. We include any relevant CVE links. -Or by cloning the repo from GitHub_:: - $ git clone git://github.com/mozilla/bleach.git +Installing Bleach +================= -Then install it by running:: +Bleach is available on PyPI_, so you can install it with ``pip``:: - $ python setup.py install + $ pip install bleach Upgrading Bleach @@ -94,12 +99,18 @@ The simplest way to use Bleach is: u'an <script>evil()</script> example' >>> bleach.linkify('an http://example.com url') - u'an http://example.com url + u'an http://example.com url' + + +Code of Conduct +=============== + +This project and repository is governed by Mozilla's code of conduct and +etiquette guidelines. For more details please see the `CODE_OF_CONDUCT.md +`_ .. _html5lib: https://github.com/html5lib/html5lib-python .. _GitHub: https://github.com/mozilla/bleach .. _ReadTheDocs: https://bleach.readthedocs.io/ -.. _PyPI: http://pypi.python.org/pypi/bleach - - +.. _PyPI: https://pypi.org/project/bleach/ diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..83299538 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,20 @@ +# Security Policy + +## Supported Versions + +Use this section to tell people about which versions of your project are +currently being supported with security updates. 
+ +| Version | Supported | +| ------- | ------------------ | +| 4.0.x | :white_check_mark: | +| < 4 | :x: | + +## Reporting a Vulnerability + +If you believe that you've found a security vulnerability, please [file a secure +bug report in our bug tracker](https://bugzilla.mozilla.org/enter_bug.cgi?assigned_to=nobody%40mozilla.org&product=Webtools&component=Bleach-security&groups=webtools-security) or send an email to *security AT mozilla DOT org*. + +For more information on security-related bug disclosure and the PGP key to use +for sending encrypted mail or to verify responses received from that address, +please read our wiki page at https://www.mozilla.org/en-US/security/#For_Developers diff --git a/bleach/__init__.py b/bleach/__init__.py index aec2d340..c2fe89e0 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -1,401 +1,131 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals -import logging -import re - -import html5lib -from html5lib.sanitizer import HTMLSanitizer -from html5lib.serializer.htmlserializer import HTMLSerializer - -from . 
import callbacks as linkify_callbacks -from .encoding import force_unicode -from .sanitizer import BleachSanitizer - - -VERSION = (1, 4, 2) -__version__ = '.'.join([str(n) for n in VERSION]) - -__all__ = ['clean', 'linkify'] - -log = logging.getLogger('bleach') - -ALLOWED_TAGS = [ - 'a', - 'abbr', - 'acronym', - 'b', - 'blockquote', - 'code', - 'em', - 'i', - 'li', - 'ol', - 'strong', - 'ul', -] - -ALLOWED_ATTRIBUTES = { - 'a': ['href', 'title'], - 'abbr': ['title'], - 'acronym': ['title'], -} - -ALLOWED_STYLES = [] - -ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] - -TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az - ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat - cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk - dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg - gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il - im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp - kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk - ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne - net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post - pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl - sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to - tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws - xn xxx ye yt yu za zm zw""".split() - -# Make sure that .com doesn't get matched by .co first -TLDS.reverse() - -PROTOCOLS = HTMLSanitizer.acceptable_protocols - -url_re = re.compile( - r"""\(* # Match any opening parentheses. - \b(?"]*)? 
- # /path/zz (excluding "unsafe" chars from RFC 1738, - # except for # and ~, which happen in practice) - """.format('|'.join(PROTOCOLS), '|'.join(TLDS)), - re.IGNORECASE | re.VERBOSE | re.UNICODE) - -proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) - -punct_re = re.compile(r'([\.,]+)$') - -email_re = re.compile( - r"""(? tag replaced by the text within it - adj = replace_nodes(tree, _text, node, - current_child) - current_child -= 1 - # pull back current_child by 1 to scan the - # new nodes again. - else: - text = force_unicode(attrs.pop('_text')) - for attr_key, attr_val in attrs.items(): - node.set(attr_key, attr_val) - - for n in reversed(list(node)): - node.remove(n) - text = parser.parseFragment(text) - node.text = text.text - for n in text: - node.append(n) - _seen.add(node) - - elif current_child >= 0: - if node.tag == ETREE_TAG('pre') and skip_pre: - linkify_nodes(node, False) - elif not (node in _seen): - linkify_nodes(node, True) - - current_child += 1 - - def email_repl(match): - addr = match.group(0).replace('"', '"') - link = { - '_text': addr, - 'href': 'mailto:{0!s}'.format(addr), - } - link = apply_callbacks(link, True) - - if link is None: - return addr - - _href = link.pop('href') - _text = link.pop('_text') - - repl = '{2!s}' - attr = '{0!s}="{1!s}"' - attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - return repl.format(_href, attribs, _text) - - def link_repl(match): - url = match.group(0) - open_brackets = close_brackets = 0 - if url.startswith('('): - _wrapping = strip_wrapping_parentheses(url) - url, open_brackets, close_brackets = _wrapping - end = '' - m = re.search(punct_re, url) - if m: - end = m.group(0) - url = url[0:m.start()] - if re.search(proto_re, url): - href = url - else: - href = ''.join(['http://', url]) - - link = { - '_text': url, - 'href': href, - } - - link = apply_callbacks(link, True) - - if link is None: - return '(' * open_brackets + url + ')' * close_brackets - - _text = link.pop('_text') - _href 
= link.pop('href') - - repl = '{0!s}{3!s}{4!s}{5!s}' - attr = '{0!s}="{1!s}"' - attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - - return repl.format('(' * open_brackets, - _href, attribs, _text, end, - ')' * close_brackets) - - try: - linkify_nodes(forest) - except RuntimeError as e: - # If we hit the max recursion depth, just return what we've got. - log.exception('Probable recursion error: {0!r}'.format(e)) - - return _render(forest) - - -def _render(tree): - """Try rendering as HTML, then XML, then give up.""" - return force_unicode(_serialize(tree)) - - -def _serialize(domtree): - walker = html5lib.treewalkers.getTreeWalker('etree') - stream = walker(domtree) - serializer = HTMLSerializer(quote_attr_values=True, - alphabetical_attributes=True, - omit_optional_tags=False) - return serializer.render(stream) + linker = Linker(callbacks=callbacks, skip_tags=skip_tags, parse_email=parse_email) + return linker.linkify(text) diff --git a/bleach/_vendor/README.rst b/bleach/_vendor/README.rst new file mode 100644 index 00000000..ba001b10 --- /dev/null +++ b/bleach/_vendor/README.rst @@ -0,0 +1,40 @@ +======================= +Vendored library policy +======================= + +To simplify Bleach development, we're now vendoring certain libraries that +we use. + +Vendored libraries must follow these rules: + +1. Vendored libraries must be pure Python--no compiling. +2. Source code for the libary is included in this directory. +3. License must be included in this repo and in the Bleach distribution. +4. Requirements of the library become requirements of Bleach. +5. No modifications to the library may be made. + + +Adding/Updating a vendored library +================================== + +Way to vendor a library or update a version: + +1. Update ``vendor.txt`` with the library, version, and hash. You can use + `hashin `_. +2. Remove all old files and directories of the old version. +3. 
Run ``pip_install_vendor.sh`` and check everything it produced in including + the ``.dist-info`` directory and contents. + + +Reviewing a change involving a vendored library +=============================================== + +Way to verify a vendored library addition/update: + +1. Pull down the branch. +2. Delete all the old files and directories of the old version. +3. Run ``pip_install_vendor.sh``. +4. Run ``git diff`` and verify there are no changes. + + +NB: the current ``vendor.txt`` was generated with pip 20.2.3, which might be necessary to reproduce the dist-info diff --git a/bleach/tests/__init__.py b/bleach/_vendor/__init__.py similarity index 100% rename from bleach/tests/__init__.py rename to bleach/_vendor/__init__.py diff --git a/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst b/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst new file mode 100644 index 00000000..90401390 --- /dev/null +++ b/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst @@ -0,0 +1,66 @@ +Credits +======= + +``html5lib`` is written and maintained by: + +- James Graham +- Sam Sneddon +- Łukasz Langa +- Will Kahn-Greene + + +Patches and suggestions +----------------------- +(In chronological order, by first commit:) + +- Anne van Kesteren +- Lachlan Hunt +- lantis63 +- Sam Ruby +- Thomas Broyer +- Tim Fletcher +- Mark Pilgrim +- Ryan King +- Philip Taylor +- Edward Z. 
Yang +- fantasai +- Philip Jägenstedt +- Ms2ger +- Mohammad Taha Jahangir +- Andy Wingo +- Andreas Madsack +- Karim Valiev +- Juan Carlos Garcia Segovia +- Mike West +- Marc DM +- Simon Sapin +- Michael[tm] Smith +- Ritwik Gupta +- Marc Abramowitz +- Tony Lopes +- lilbludevil +- Kevin +- Drew Hubl +- Austin Kumbera +- Jim Baker +- Jon Dufresne +- Donald Stufft +- Alex Gaynor +- Nik Nyby +- Jakub Wilk +- Sigmund Cherem +- Gabi Davar +- Florian Mounier +- neumond +- Vitalik Verhovodov +- Kovid Goyal +- Adam Chainz +- John Vandenberg +- Eric Amorde +- Benedikt Morbach +- Jonathan Vanasco +- Tom Most +- Ville Skyttä +- Hugo van Kemenade +- Mark Vasilkov + diff --git a/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER b/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER new file mode 100644 index 00000000..a1b589e3 --- /dev/null +++ b/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/bleach/_vendor/html5lib-1.1.dist-info/LICENSE b/bleach/_vendor/html5lib-1.1.dist-info/LICENSE new file mode 100644 index 00000000..c87fa7a0 --- /dev/null +++ b/bleach/_vendor/html5lib-1.1.dist-info/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2006-2013 James Graham and other contributors + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/bleach/_vendor/html5lib-1.1.dist-info/METADATA b/bleach/_vendor/html5lib-1.1.dist-info/METADATA new file mode 100644 index 00000000..ee83c1f8 --- /dev/null +++ b/bleach/_vendor/html5lib-1.1.dist-info/METADATA @@ -0,0 +1,552 @@ +Metadata-Version: 2.1 +Name: html5lib +Version: 1.1 +Summary: HTML parser based on the WHATWG HTML specification +Home-page: https://github.com/html5lib/html5lib-python +Maintainer: James Graham +Maintainer-email: james@hoppipolla.co.uk +License: MIT License +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: Implementation :: CPython +Classifier: Programming Language :: Python :: Implementation :: PyPy +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: Text Processing :: Markup :: HTML +Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.* +Requires-Dist: six (>=1.9) +Requires-Dist: webencodings +Provides-Extra: all +Requires-Dist: genshi ; extra == 'all' +Requires-Dist: chardet (>=2.2) ; extra == 'all' +Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'all' +Provides-Extra: 
chardet +Requires-Dist: chardet (>=2.2) ; extra == 'chardet' +Provides-Extra: genshi +Requires-Dist: genshi ; extra == 'genshi' +Provides-Extra: lxml +Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'lxml' + +html5lib +======== + +.. image:: https://travis-ci.org/html5lib/html5lib-python.svg?branch=master + :target: https://travis-ci.org/html5lib/html5lib-python + + +html5lib is a pure-python library for parsing HTML. It is designed to +conform to the WHATWG HTML specification, as is implemented by all major +web browsers. + + +Usage +----- + +Simple usage follows this pattern: + +.. code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + document = html5lib.parse(f) + +or: + +.. code-block:: python + + import html5lib + document = html5lib.parse("

Hello World!") + +By default, the ``document`` will be an ``xml.etree`` element instance. +Whenever possible, html5lib chooses the accelerated ``ElementTree`` +implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x). + +Two other tree types are supported: ``xml.dom.minidom`` and +``lxml.etree``. To use an alternative format, specify the name of +a treebuilder: + +.. code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + lxml_etree_document = html5lib.parse(f, treebuilder="lxml") + +When using with ``urllib2`` (Python 2), the charset from HTTP should be +pass into html5lib as follows: + +.. code-block:: python + + from contextlib import closing + from urllib2 import urlopen + import html5lib + + with closing(urlopen("http://example.com/")) as f: + document = html5lib.parse(f, transport_encoding=f.info().getparam("charset")) + +When using with ``urllib.request`` (Python 3), the charset from HTTP +should be pass into html5lib as follows: + +.. code-block:: python + + from urllib.request import urlopen + import html5lib + + with urlopen("http://example.com/") as f: + document = html5lib.parse(f, transport_encoding=f.info().get_content_charset()) + +To have more control over the parser, create a parser object explicitly. +For instance, to make the parser raise exceptions on parse errors, use: + +.. code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + parser = html5lib.HTMLParser(strict=True) + document = parser.parse(f) + +When you're instantiating parser objects explicitly, pass a treebuilder +class as the ``tree`` keyword argument to use an alternative document +format: + +.. code-block:: python + + import html5lib + parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom")) + minidom_document = parser.parse("

Hello World!") + +More documentation is available at https://html5lib.readthedocs.io/. + + +Installation +------------ + +html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install: + +.. code-block:: bash + + $ pip install html5lib + +The goal is to support a (non-strict) superset of the versions that `pip +supports +`_. + +Optional Dependencies +--------------------- + +The following third-party libraries may be used for additional +functionality: + +- ``lxml`` is supported as a tree format (for both building and + walking) under CPython (but *not* PyPy where it is known to cause + segfaults); + +- ``genshi`` has a treewalker (but not builder); and + +- ``chardet`` can be used as a fallback when character encoding cannot + be determined. + + +Bugs +---- + +Please report any bugs on the `issue tracker +`_. + + +Tests +----- + +Unit tests require the ``pytest`` and ``mock`` libraries and can be +run using the ``py.test`` command in the root directory. + +Test data are contained in a separate `html5lib-tests +`_ repository and included +as a submodule, thus for git checkouts they must be initialized:: + + $ git submodule init + $ git submodule update + +If you have all compatible Python implementations available on your +system, you can run tests on all of them using the ``tox`` utility, +which can be found on PyPI. + + +Questions? +---------- + +There's a mailing list available for support on Google Groups, +`html5lib-discuss `_, +though you may get a quicker response asking on IRC in `#whatwg on +irc.freenode.net `_. + +Change Log +---------- + +1.1 +~~~ + +UNRELEASED + +Breaking changes: + +* Drop support for Python 3.3. (#358) +* Drop support for Python 3.4. (#421) + +Deprecations: + +* Deprecate the ``html5lib`` sanitizer (``html5lib.serialize(sanitize=True)`` and + ``html5lib.filters.sanitizer``). We recommend users migrate to `Bleach + `. Please let us know if Bleach doesn't suffice for your + use. 
(#443) + +Other changes: + +* Try to import from ``collections.abc`` to remove DeprecationWarning and ensure + ``html5lib`` keeps working in future Python versions. (#403) +* Drop optional ``datrie`` dependency. (#442) + + +1.0.1 +~~~~~ + +Released on December 7, 2017 + +Breaking changes: + +* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!) +* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!) + +Features: + +* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most, + Will Kahn-Greene!) +* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!) +* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!) +* Support Python 3.6. (#333) (Thank you, Jon Dufresne!) +* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!) +* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon + Dufresne, John Vandenberg, Sam Sneddon, Will Kahn-Greene!) +* Semver-compliant version number. + +Bug fixes: + +* Add support for setuptools < 18.5 to support environment markers. (Thank you, + John Vandenberg!) +* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!) +* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank + you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!) +* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will + Kahn-Greene!) +* Include license file in generated wheel package. (#350) (Thank you, Jon + Dufresne!) +* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!) +* Allow uppercase hex chararcters in CSS colour check. (#377) (Thank you, + Komal Dembla, Hugo!) + + +1.0 +~~~ + +Released and unreleased on December 7, 2017. Badly packaged release. + + +0.999999999/1.0b10 +~~~~~~~~~~~~~~~~~~ + +Released on July 15, 2016 + +* Fix attribute order going to the tree builder to be document order + instead of reverse document order(!). 
+ + +0.99999999/1.0b9 +~~~~~~~~~~~~~~~~ + +Released on July 14, 2016 + +* **Added ordereddict as a mandatory dependency on Python 2.6.** + +* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all`` + extras that will do the right thing based on the specific + interpreter implementation. + +* Now requires the ``mock`` package for the testsuite. + +* Cease supporting DATrie under PyPy. + +* **Remove PullDOM support, as this hasn't ever been properly + tested, doesn't entirely work, and as far as I can tell is + completely unused by anyone.** + +* Move testsuite to ``py.test``. + +* **Fix #124: move to webencodings for decoding the input byte stream; + this makes html5lib compliant with the Encoding Standard, and + introduces a required dependency on webencodings.** + +* **Cease supporting Python 3.2 (in both CPython and PyPy forms).** + +* **Fix comments containing double-dash with lxml 3.5 and above.** + +* **Use scripting disabled by default (as we don't implement + scripting).** + +* **Fix #11, avoiding the XSS bug potentially caused by serializer + allowing attribute values to be escaped out of in old browser versions, + changing the quote_attr_values option on serializer to take one of + three values, "always" (the old True value), "legacy" (the new option, + and the new default), and "spec" (the old False value, and the old + default).** + +* **Fix #72 by rewriting the sanitizer to apply only to treewalkers + (instead of the tokenizer); as such, this will require amending all + callers of it to use it via the treewalker API.** + +* **Drop support of charade, now that chardet is supported once more.** + +* **Replace the charset keyword argument on parse and related methods + with a set of keyword arguments: override_encoding, transport_encoding, + same_origin_parent_encoding, likely_encoding, and default_encoding.** + +* **Move filters._base, treebuilder._base, and treewalkers._base to .base + to clarify their status as public.** + +* **Get rid of the 
sanitizer package. Merge sanitizer.sanitize into the + sanitizer.htmlsanitizer module and move that to sanitizer. This means + anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no + code changes.** + +* **Rename treewalkers.lxmletree to .etree_lxml and + treewalkers.genshistream to .genshi to have a consistent API.** + +* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer, + utils) to be underscore prefixed to clarify their status as private. + + +0.9999999/1.0b8 +~~~~~~~~~~~~~~~ + +Released on September 10, 2015 + +* Fix #195: fix the sanitizer to drop broken URLs (it threw an + exception between 0.9999 and 0.999999). + + +0.999999/1.0b7 +~~~~~~~~~~~~~~ + +Released on July 7, 2015 + +* Fix #189: fix the sanitizer to allow relative URLs again (as it did + prior to 0.9999/1.0b5). + + +0.99999/1.0b6 +~~~~~~~~~~~~~ + +Released on April 30, 2015 + +* Fix #188: fix the sanitizer to not throw an exception when sanitizing + bogus data URLs. + + +0.9999/1.0b5 +~~~~~~~~~~~~ + +Released on April 29, 2015 + +* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how + this sounds, this has no known security implications. No known version + of IE (5.5 to current), Firefox (3 to current), Safari (6 to current), + Chrome (1 to current), or Opera (12 to current) will run any script + provided in these attributes. + +* Pass error message to the ParseError exception in strict parsing mode. + +* Allow data URIs in the sanitizer, with a whitelist of content-types. + +* Add support for Python implementations that don't support lone + surrogates (read: Jython). Fixes #2. + +* Remove localization of error messages. This functionality was totally + unused (and untested that everything was localizable), so we may as + well follow numerous browsers in not supporting translating technical + strings. + +* Expose treewalkers.pprint as a public API. + +* Add a documentEncoding property to HTML5Parser, fix #121. 
+ + +0.999 +~~~~~ + +Released on December 23, 2013 + +* Fix #127: add work-around for CPython issue #20007: .read(0) on + http.client.HTTPResponse drops the rest of the content. + +* Fix #115: lxml treewalker can now deal with fragments containing, at + their root level, text nodes with non-ASCII characters on Python 2. + + +0.99 +~~~~ + +Released on September 10, 2013 + +* No library changes from 1.0b3; released as 0.99 as pip has changed + behaviour from 1.4 to avoid installing pre-release versions per + PEP 440. + + +1.0b3 +~~~~~ + +Released on July 24, 2013 + +* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any + implementation using it should be moved to + ``NonRecursiveTreeWalker``, as everything bundled with html5lib has + for years. + +* Fix #67 so that ``BufferedStream`` to correctly returns a bytes + object, thereby fixing any case where html5lib is passed a + non-seekable RawIOBase-like object. + + +1.0b2 +~~~~~ + +Released on June 27, 2013 + +* Removed reordering of attributes within the serializer. There is now + an ``alphabetical_attributes`` option which preserves the previous + behaviour through a new filter. This allows attribute order to be + preserved through html5lib if the tree builder preserves order. + +* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by + ``treeadapters.sax.to_sax`` which is generic and supports any + treewalker; it also resolves all known bugs with ``dom2sax``. + +* Fix treewalker assertions on hitting bytes strings on + Python 2. Previous to 1.0b1, treewalkers coped with mixed + bytes/unicode data on Python 2; this reintroduces this prior + behaviour on Python 2. Behaviour is unchanged on Python 3. + + +1.0b1 +~~~~~ + +Released on May 17, 2013 + +* Implementation updated to implement the `HTML specification + `_ as of 5th May + 2013 (`SVN `_ revision r7867). + +* Python 3.2+ supported in a single codebase using the ``six`` library. + +* Removed support for Python 2.5 and older. 
+ +* Removed the deprecated Beautiful Soup 3 treebuilder. + ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that + since it doesn't support namespaces, foreign content like SVG and + MathML is parsed incorrectly. + +* Removed ``simpletree`` from the package. The default tree builder is + now ``etree`` (using the ``xml.etree.cElementTree`` implementation if + available, and ``xml.etree.ElementTree`` otherwise). + +* Removed the ``XHTMLSerializer`` as it never actually guaranteed its + output was well-formed XML, and hence provided little of use. + +* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no + longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will + return the default DOM treebuilder, which uses ``xml.dom.minidom``. + +* Optional heuristic character encoding detection now based on + ``charade`` for Python 2.6 - 3.3 compatibility. + +* Optional ``Genshi`` treewalker support fixed. + +* Many bugfixes, including: + + * #33: null in attribute value breaks XML AttValue; + + * #4: nested, indirect descendant,

+

+

clean when dirty HTML changes

+ +

+ +

+ +

+ + + + + + + + diff --git a/tests_website/open_test_page.py b/tests_website/open_test_page.py new file mode 100755 index 00000000..23e15277 --- /dev/null +++ b/tests_website/open_test_page.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +import webbrowser + + +TEST_BROWSERS = { + # 'mozilla', + "firefox", + # 'netscape', + # 'galeon', + # 'epiphany', + # 'skipstone', + # 'kfmclient', + # 'konqueror', + # 'kfm', + # 'mosaic', + # 'opera', + # 'grail', + # 'links', + # 'elinks', + # 'lynx', + # 'w3m', + "windows-default", + # 'macosx', + "safari", + # 'google-chrome', + "chrome", + # 'chromium', + # 'chromium-browser', +} + + +if __name__ == "__main__": + for browser_name in TEST_BROWSERS: + try: + browser = webbrowser.get(browser_name) + browser.open_new_tab("http://localhost:8080") + except Exception as error: + print("error getting test browser %s: %s" % (browser_name, error)) diff --git a/tests_website/server.py b/tests_website/server.py new file mode 100755 index 00000000..2d25ea25 --- /dev/null +++ b/tests_website/server.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +""" +Simple Test/Demo Server for running bleach.clean output on various +desktops. 
+ +Usage: + + python server.py + +""" + +import http.server +import socketserver + +import bleach + + +PORT = 8080 + + +class BleachCleanHandler(http.server.SimpleHTTPRequestHandler): + + # Prevent 'cannot bind to address' errors on restart + allow_reuse_address = True + + def do_POST(self): + content_len = int(self.headers.get("content-length", 0)) + body = self.rfile.read(content_len) + print("read %s bytes: %s" % (content_len, body)) + + body = body.decode("utf-8") + print("input: %r" % body) + cleaned = bleach.clean(body) + + self.send_response(200) + self.send_header("Content-Length", len(cleaned)) + self.send_header("Content-Type", "text/plain;charset=UTF-8") + self.end_headers() + + cleaned = bytes(cleaned, encoding="utf-8") + print("cleaned: %r" % cleaned) + self.wfile.write(cleaned) + + +if __name__ == "__main__": + httpd = socketserver.TCPServer(("127.0.0.1", PORT), BleachCleanHandler) + print("listening on localhost port %d" % PORT) + httpd.serve_forever() diff --git a/tox.ini b/tox.ini index 5d4fe518..244e9b53 100644 --- a/tox.ini +++ b/tox.ini @@ -1,14 +1,63 @@ -# Tox (http://tox.testrun.org/) is a tool for running tests -# in multiple virtualenvs. This configuration file will run the -# test suite on all supported python versions. To use it, "pip install tox" -# and then run "tox" from this directory. +# Note: If you update this, make sure to update .github/workflows/, too. 
[tox] -envlist = py26, py27, py32, py33, py34, pypy +envlist = + py{36,37,38,py3} + py{36,37,38}-build-no-lang + docs + format-check + lint + vendorverify [testenv] -commands = nosetests {posargs:-v} deps = - six - html5lib==0.999 - nose + -rrequirements-dev.txt +commands = + pytest {posargs:-v} + python setup.py build + +[testenv:py36-build-no-lang] +setenv = + LANG= +commands = + python setup.py build + +[testenv:py37-build-no-lang] +setenv = + LANG= +commands = + python setup.py build + +[testenv:lint] +basepython = python3.8 +changedir = scripts +deps = + -rrequirements-dev.txt +commands = + ./run_tests.sh lint + +[testenv:vendorverify] +basepython = python3.8 +changedir = scripts +deps = + -rrequirements-dev.txt +commands = + ./run_tests.sh vendorverify + +[testenv:format-check] +basepython = python3.8 +changedir = scripts +deps = + -rrequirements-dev.txt + black +commands = + ./run_tests.sh format-check + +[testenv:docs] +basepython = python3.8 +changedir = docs +deps = + -rrequirements-dev.txt +commands = + sphinx-build -b html -d {envtmpdir}/doctrees . {envtmpdir}/html + sphinx-build -b doctest -d {envtmpdir}/doctrees . {envtmpdir}/doctest