From 406a02e656a7b62e685b88596ed0a7a38feb5d77 Mon Sep 17 00:00:00 2001
From: jonathan vanasco
Date: Thu, 6 Nov 2025 16:48:24 -0500
Subject: [PATCH 1/5] introduce `archive_url_data` to feedparser.parse

---
 feedparser/api.py           | 35 +++++++++++++++++++++++------------
 tests/test_open_resource.py | 20 +++++++++++++++-----
 2 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/feedparser/api.py b/feedparser/api.py
index 8b95ed5c..3e3b780d 100644
--- a/feedparser/api.py
+++ b/feedparser/api.py
@@ -30,6 +30,7 @@
 import urllib.error
 import urllib.parse
 import xml.sax
+from typing import Any
 from typing import IO
 
 from . import http
@@ -70,11 +71,10 @@
     "json1": "JSON feed 1",
 }
 
-
 def _open_resource(
-    url_file_stream_or_string,
-    result,
-):
+    url_file_stream_or_string: Any,
+    result: dict,
+) -> tuple[str, Any]:
     """URL, filename, or string --> stream
 
     This function lets you define parsers that take any input source
@@ -83,7 +83,7 @@ def _open_resource(
     to have all the basic stdio read methods (read, readline, readlines).
     Just .close() the object when you're done with it.
 
-    :return: A seekable, readable file object.
+    :return: A tuple of (the method used, a seekable and readable file object).
     """
 
     # Some notes on the history of the implementation of _open_resource().
@@ -104,8 +104,8 @@ def _open_resource(
     if callable(getattr(url_file_stream_or_string, "read", None)):
         if callable(getattr(url_file_stream_or_string, "seekable", None)):
             if url_file_stream_or_string.seekable():
-                return url_file_stream_or_string
-        return _to_in_memory_file(url_file_stream_or_string.read())
+                return "seekable", url_file_stream_or_string
+        return "read", _to_in_memory_file(url_file_stream_or_string.read())
 
     looks_like_url = isinstance(
         url_file_stream_or_string, str
@@ -115,11 +115,11 @@ def _open_resource(
     )
     if looks_like_url:
         data = http.get(url_file_stream_or_string, result)
-        return io.BytesIO(data)
+        return "url", io.BytesIO(data)
 
     # try to open with native open function (if url_file_stream_or_string is a filename)
     try:
-        return open(url_file_stream_or_string, "rb")
+        return "filepath", open(url_file_stream_or_string, "rb")
     except (OSError, TypeError, ValueError):
         # if url_file_stream_or_string is a str object that
         # cannot be converted to the encoding returned by
@@ -131,7 +131,7 @@ def _open_resource(
         pass
 
     # treat url_file_stream_or_string as bytes/string
-    return _to_in_memory_file(url_file_stream_or_string)
+    return "raw_data", _to_in_memory_file(url_file_stream_or_string)
 
 
 def _to_in_memory_file(data):
@@ -154,6 +154,7 @@ def parse(
     resolve_relative_uris: bool | None = None,
     sanitize_html: bool | None = None,
     optimistic_encoding_detection: bool | None = None,
+    archive_url_data: bool | None = None,
 ) -> FeedParserDict:
     """Parse a feed from a URL, file, stream, or string.
 
@@ -188,7 +189,9 @@ def parse(
         (uses less memory, but the wrong encoding may be detected in rare cases).
         Defaults to the value of :data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`,
         which is ``True``.
-
+    :param archive_url_data:
+        Whether to archive the URL's response headers and content into
+        :attr:`FeedParserDict.raw`. Defaults to ``False``.
     """
 
     result = FeedParserDict(
@@ -196,13 +199,21 @@ def parse(
         bozo=False,
         entries=[],
         feed=FeedParserDict(),
         headers={},
+        raw={},
     )
 
     try:
-        file = _open_resource(
+        _method, file = _open_resource(
             url_file_stream_or_string,
             result,
         )
+        if _method == "url" and archive_url_data:
+            # archive the headers before they are mutated by `response_headers`
+            result.raw["headers"] = result["headers"].copy()
+            # archive the content, then reset the file
+            result.raw["content"] = file.read()
+            file.seek(0)
+
     except urllib.error.URLError as error:
         result.update(
             {
diff --git a/tests/test_open_resource.py b/tests/test_open_resource.py
index db575711..71c3510a 100644
--- a/tests/test_open_resource.py
+++ b/tests/test_open_resource.py
@@ -4,29 +4,39 @@
 
 
 def test_fileobj():
-    r = feedparser.api._open_resource(io.BytesIO(b""), {}).read()
+    method, filelike = feedparser.api._open_resource(io.BytesIO(b""), {})
+    r = filelike.read()
     assert r == b""
+    assert method == "seekable"
 
 
 def testbytes():
     s = b"text"
-    r = feedparser.api._open_resource(s, {}).read()
+    method, filelike = feedparser.api._open_resource(s, {})
+    r = filelike.read()
     assert s == r
+    assert method == "raw_data"
 
 
 def test_string():
     s = b"text"
-    r = feedparser.api._open_resource(s, {}).read()
+    method, filelike = feedparser.api._open_resource(s, {})
+    r = filelike.read()
     assert s == r
+    assert method == "raw_data"
 
 
 def test_unicode_1():
     s = b"text"
-    r = feedparser.api._open_resource(s, {}).read()
+    method, filelike = feedparser.api._open_resource(s, {})
+    r = filelike.read()
     assert s == r
+    assert method == "raw_data"
 
 
 def test_unicode_2():
     s = rb"t\u00e9xt"
-    r = feedparser.api._open_resource(s, {}).read()
+    method, filelike = feedparser.api._open_resource(s, {})
+    r = filelike.read()
     assert s == r
+    assert method == "raw_data"
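Usage sketch for the patch above. The feed URL is hypothetical; `result.raw`
is only populated when the input is a URL and `archive_url_data` is true:

    import feedparser

    result = feedparser.parse(
        "https://example.com/feed.xml",  # hypothetical feed URL
        archive_url_data=True,
    )

    # headers as returned by the server (keys lowercased by http.get),
    # copied before feedparser mutates them via `response_headers`
    server_headers = result.raw["headers"]
    # the response body, as raw undecoded bytes
    raw_content = result.raw["content"]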
From 9d7bf4ee32d11b68c613f3f1f66548c8d30b7bfb Mon Sep 17 00:00:00 2001
From: jonathan vanasco
Date: Thu, 6 Nov 2025 18:06:19 -0500
Subject: [PATCH 2/5] supporting requests hooks

---
 CONTRIBUTORS.rst   |  3 ++-
 feedparser/api.py  | 13 ++++++++++++-
 feedparser/http.py | 23 ++++++++++++++++++++++-
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst
index 5d9f94fe..c2e526db 100644
--- a/CONTRIBUTORS.rst
+++ b/CONTRIBUTORS.rst
@@ -24,8 +24,9 @@ bug report!
 * `Ade Oshineye `_
 * `Tom Parker-Shemilt `_
 * `Martin Pool `_
+* `Nestor Rodriguez `_
 * `Sam Ruby `_
 * `Bernd Schlapsi `_
 * `Aaron Swartz `_
+* `Jonathan Vanasco `_
 * `Jakub Wilk `_
-* `Nestor Rodriguez `_
diff --git a/feedparser/api.py b/feedparser/api.py
index 3e3b780d..da09d215 100644
--- a/feedparser/api.py
+++ b/feedparser/api.py
@@ -32,6 +32,7 @@
 import xml.sax
 from typing import Any
 from typing import IO
+from typing import Optional
 
 from . import http
 from .encodings import MissingEncoding, convert_file_to_utf8
@@ -74,6 +75,7 @@
 def _open_resource(
     url_file_stream_or_string: Any,
     result: dict,
+    requests_hooks: Optional[http.RequestHooks] = None,
 ) -> tuple[str, Any]:
     """URL, filename, or string --> stream
@@ -83,6 +85,10 @@ def _open_resource(
     to have all the basic stdio read methods (read, readline, readlines).
     Just .close() the object when you're done with it.
 
+    :param requests_hooks:
+        A dict of hooks to pass on to :meth:`requests.get` when a URL is fetched.
+        See :class:`feedparser.http.RequestHooks`.
+
     :return: A tuple of (the method used, a seekable and readable file object).
     """
@@ -114,7 +120,7 @@ def _open_resource(
         "https",
     )
     if looks_like_url:
-        data = http.get(url_file_stream_or_string, result)
+        data = http.get(url_file_stream_or_string, result, hooks=requests_hooks)
         return "url", io.BytesIO(data)
 
     # try to open with native open function (if url_file_stream_or_string is a filename)
@@ -155,6 +161,7 @@ def parse(
     sanitize_html: bool | None = None,
     optimistic_encoding_detection: bool | None = None,
     archive_url_data: bool | None = None,
+    requests_hooks: Optional[http.RequestHooks] = None,
 ) -> FeedParserDict:
     """Parse a feed from a URL, file, stream, or string.
@@ -192,6 +199,9 @@ def parse(
     :param archive_url_data:
         Whether to archive the URL's response headers and content into
         :attr:`FeedParserDict.raw`. Defaults to ``False``.
+    :param requests_hooks:
+        A dict of hooks to pass on to :meth:`requests.get` when a URL is fetched.
+        See :class:`feedparser.http.RequestHooks`.
     """
 
     result = FeedParserDict(
@@ -206,6 +216,7 @@ def parse(
         _method, file = _open_resource(
             url_file_stream_or_string,
             result,
+            requests_hooks=requests_hooks,
         )
         if _method == "url" and archive_url_data:
             # archive the headers before they are mutated by `response_headers`
diff --git a/feedparser/http.py b/feedparser/http.py
index 7768dae5..832f330d 100644
--- a/feedparser/http.py
+++ b/feedparser/http.py
@@ -45,19 +45,40 @@
     ";q=0.1"
 )
 
+# This dict defines the allowable hooks.
+# `response` is the only valid hook in `requests`.
+# `response.postprocess` is a feedparser extension: it is popped from the
+# dict and invoked as `hook(response, result)` after a successful request.
+RequestHooks = typing.TypedDict(
+    "RequestHooks",
+    {
+        "response": typing.Union[typing.Callable, typing.Sequence[typing.Callable]],
+        "response.postprocess": typing.NotRequired[typing.Callable],
+    }
+)
 
-def get(url: str, result: dict[str, typing.Any]) -> bytes:
+def get(
+    url: str,
+    result: dict[str, typing.Any],
+    hooks: typing.Optional[RequestHooks]=None,
+) -> bytes:
+    _postprocess: typing.Optional[typing.Callable] = None
+    if hooks is not None:
+        _postprocess = hooks.pop("response.postprocess", None)
     try:
         response = requests.get(
             url,
             headers={"Accept": ACCEPT_HEADER},
             timeout=10,
+            hooks=hooks,
         )
     except requests.RequestException as exception:
         result["bozo"] = True
         result["bozo_exception"] = exception
         return b""
 
+    if _postprocess is not None:
+        _postprocess(response, result)
+
     # Lowercase the HTTP header keys for comparisons per RFC 2616.
     result["headers"] = {k.lower(): v for k, v in response.headers.items()}
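Usage sketch for the hooks added above. The URL and the two callbacks are
illustrative: `response` follows the standard `requests` response-hook
signature, while `response.postprocess` is the feedparser-specific hook
invoked as `hook(response, result)` after a successful request:

    import feedparser

    def log_status(response, *args, **kwargs):
        # standard `requests` response hook
        print(response.status_code, response.url)

    def capture(response, result):
        # feedparser-specific post-processing hook; `result` is the
        # FeedParserDict being built
        result["fetched_url"] = response.url  # illustrative key

    d = feedparser.parse(
        "https://example.com/feed.xml",  # hypothetical feed URL
        requests_hooks={
            "response": log_status,
            "response.postprocess": capture,
        },
    )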
""" @@ -114,7 +120,7 @@ def _open_resource( "https", ) if looks_like_url: - data = http.get(url_file_stream_or_string, result) + data = http.get(url_file_stream_or_string, result, hooks=requests_hooks) return "url", io.BytesIO(data) # try to open with native open function (if url_file_stream_or_string is a filename) @@ -155,6 +161,7 @@ def parse( sanitize_html: bool | None = None, optimistic_encoding_detection: bool | None = None, archive_url_data: bool | None = None, + requests_hooks: Optional[http.RequestHooks] = None, ) -> FeedParserDict: """Parse a feed from a URL, file, stream, or string. @@ -192,6 +199,9 @@ def parse( :param archive_url_data: Should feedparser archive the URL headers and content into :attr:`FeedParserDict.raw` ? Defaults to ``False``` + :param requests_hooks: + A dict of hooks to pass onto :method:`requests.get` if a URL is parsed. + See `feedparser.http.RequestHooks` """ result = FeedParserDict( @@ -206,6 +216,7 @@ def parse( _method, file = _open_resource( url_file_stream_or_string, result, + requests_hooks=requests_hooks, ) if _method == "url" and archive_url_data: # archive the headers before they are mutated by `response_headers` diff --git a/feedparser/http.py b/feedparser/http.py index 7768dae5..832f330d 100644 --- a/feedparser/http.py +++ b/feedparser/http.py @@ -45,19 +45,40 @@ ";q=0.1" ) +# This dict defines the allowable hooks. +# `response` is the only valid hook in `requests`. +# `response.postprocess` is used +RequestHooks = typing.TypedDict( + "RequestHooks", + { + "response": typing.Union[typing.Callable, typing.Sequence[typing.Callable]], + "response.postprocess": typing.NotRequired[typing.Callable], + } +) -def get(url: str, result: dict[str, typing.Any]) -> bytes: +def get( + url: str, + result: dict[str, typing.Any], + hooks: typing.Optional[RequestHooks]=None, +) -> bytes: + _postprocess: typing.Optional[typing.Callable] = None + if hooks is not None: + _postprocess = hooks.pop("response.postprocess", None) try: response = requests.get( url, headers={"Accept": ACCEPT_HEADER}, timeout=10, + hooks=hooks, ) except requests.RequestException as exception: result["bozo"] = True result["bozo_exception"] = exception return b"" + if _postprocess is not None: + _postprocess(response, result) + # Lowercase the HTTP header keys for comparisons per RFC 2616. result["headers"] = {k.lower(): v for k, v in response.headers.items()} From 034c5e1dc0c069d50dbb1114e8bf0d864dd5bb2b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Nov 2025 23:12:27 +0000 Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedparser/api.py | 9 ++++----- feedparser/http.py | 7 ++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/feedparser/api.py b/feedparser/api.py index da09d215..f156f0fa 100644 --- a/feedparser/api.py +++ b/feedparser/api.py @@ -30,9 +30,7 @@ import urllib.error import urllib.parse import xml.sax -from typing import Any -from typing import IO -from typing import Optional +from typing import IO, Any, Optional from . 
From b5b6d1162c455030b9b751be4cedf49ad15a32b8 Mon Sep 17 00:00:00 2001
From: jonathan vanasco
Date: Thu, 6 Nov 2025 19:18:36 -0500
Subject: [PATCH 4/5] fix changes from pre-commit.ci -- how did those even happen?!?

---
 feedparser/api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedparser/api.py b/feedparser/api.py
index f156f0fa..f6e7c920 100644
--- a/feedparser/api.py
+++ b/feedparser/api.py
@@ -30,7 +30,7 @@
 import urllib.error
 import urllib.parse
 import xml.sax
-from typing import IO, Any, Optional
+from typing import IO, Any
 
 from . import http
 from .encodings import MissingEncoding, convert_file_to_utf8

From a7a03d62e67dacf5348d0ad7ebd6fa2db97f52c5 Mon Sep 17 00:00:00 2001
From: jonathan vanasco
Date: Thu, 6 Nov 2025 19:39:49 -0500
Subject: [PATCH 5/5] NotRequired is not available on python 3.10

---
 feedparser/http.py | 3 ++-
 pyproject.toml     | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/feedparser/http.py b/feedparser/http.py
index d49b35fc..e21357a4 100644
--- a/feedparser/http.py
+++ b/feedparser/http.py
@@ -30,6 +30,7 @@
 import typing
 
 import requests
+from typing_extensions import NotRequired  # in stdlib typing only on >=py311
 
 from .datetimes import _parse_date
@@ -52,7 +53,7 @@
     "RequestHooks",
     {
         "response": typing.Union[typing.Callable, typing.Sequence[typing.Callable]],
-        "response.postprocess": typing.NotRequired[typing.Callable],
+        "response.postprocess": NotRequired[typing.Callable],
     },
 )
diff --git a/pyproject.toml b/pyproject.toml
index dbc32538..d4ae584e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ classifiers = [
 dependencies = [
     "sgmllib3k==1.0.0",
     "requests>=2.20.0",
+    "typing_extensions>=4.0.0",  # NotRequired is in stdlib typing only on >=py311
 ]
 
 [project.urls]
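If the hard dependency on `typing_extensions` is ever unwanted, a guarded
import is a common alternative (a sketch, not part of this series):

    import sys

    if sys.version_info >= (3, 11):
        from typing import NotRequired  # in stdlib typing since 3.11
    else:
        from typing_extensions import NotRequired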