Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CONTRIBUTORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ bug report!
* `Ade Oshineye <http://blog.oshineye.com/>`_
* `Tom Parker-Shemilt <https://tevps.net>`_
* `Martin Pool <http://sourcefrog.net/>`_
* `Nestor Rodriguez <https://github.com/n3s7or>`_
* `Sam Ruby <http://intertwingly.net/>`_
* `Bernd Schlapsi <https://github.com/brot>`_
* `Aaron Swartz <http://www.aaronsw.com/>`_
* `Jonathan Vanasco <https://github.com/jvanasco>`_
* `Jakub Wilk <http://jwilk.net/>`_
47 changes: 34 additions & 13 deletions feedparser/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
import urllib.error
import urllib.parse
import xml.sax
from typing import IO
from typing import IO, Any

from . import http
from .encodings import MissingEncoding, convert_file_to_utf8
Expand Down Expand Up @@ -72,9 +72,10 @@


def _open_resource(
url_file_stream_or_string,
result,
):
url_file_stream_or_string: Any,
result: dict,
requests_hooks: http.RequestHooks | None = None,
) -> tuple[str, Any]:
"""URL, filename, or string --> stream

This function lets you define parsers that take any input source
Expand All @@ -83,7 +84,11 @@ def _open_resource(
to have all the basic stdio read methods (read, readline, readlines).
Just .close() the object when you're done with it.

:return: A seekable, readable file object.
:param requests_hooks:
A dict of hooks to pass on to :func:`requests.get` if a URL is parsed.
See `feedparser.http.RequestHooks`.

:return: A tuple of ``(method used, a seekable and readable file object)``.
"""

# Some notes on the history of the implementation of _open_resource().
Expand All @@ -104,8 +109,8 @@ def _open_resource(
if callable(getattr(url_file_stream_or_string, "read", None)):
if callable(getattr(url_file_stream_or_string, "seekable", None)):
if url_file_stream_or_string.seekable():
return url_file_stream_or_string
return _to_in_memory_file(url_file_stream_or_string.read())
return "seekable", url_file_stream_or_string
return "read", _to_in_memory_file(url_file_stream_or_string.read())

looks_like_url = isinstance(
url_file_stream_or_string, str
Expand All @@ -114,12 +119,12 @@ def _open_resource(
"https",
)
if looks_like_url:
data = http.get(url_file_stream_or_string, result)
return io.BytesIO(data)
data = http.get(url_file_stream_or_string, result, hooks=requests_hooks)
return "url", io.BytesIO(data)

# try to open with native open function (if url_file_stream_or_string is a filename)
try:
return open(url_file_stream_or_string, "rb")
return "filepath", open(url_file_stream_or_string, "rb")
except (OSError, TypeError, ValueError):
# if url_file_stream_or_string is a str object that
# cannot be converted to the encoding returned by
Expand All @@ -131,7 +136,7 @@ def _open_resource(
pass

# treat url_file_stream_or_string as bytes/string
return _to_in_memory_file(url_file_stream_or_string)
return "raw_data", _to_in_memory_file(url_file_stream_or_string)


def _to_in_memory_file(data):
Expand All @@ -154,6 +159,8 @@ def parse(
resolve_relative_uris: bool | None = None,
sanitize_html: bool | None = None,
optimistic_encoding_detection: bool | None = None,
archive_url_data: bool | None = None,
requests_hooks: http.RequestHooks | None = None,
) -> FeedParserDict:
"""Parse a feed from a URL, file, stream, or string.

Expand Down Expand Up @@ -188,21 +195,35 @@ def parse(
(uses less memory, but the wrong encoding may be detected in rare cases).
Defaults to the value of
:data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``.

:param archive_url_data:
Should feedparser archive the URL headers and content into
:attr:`FeedParserDict.raw`? Defaults to ``False``.
:param requests_hooks:
A dict of hooks to pass on to :func:`requests.get` if a URL is parsed.
See `feedparser.http.RequestHooks`.
"""

result = FeedParserDict(
bozo=False,
entries=[],
feed=FeedParserDict(),
headers={},
raw={},
)

try:
file = _open_resource(
_method, file = _open_resource(
url_file_stream_or_string,
result,
requests_hooks=requests_hooks,
)
if _method == "url" and archive_url_data:
# archive the headers before they are mutated by `response_headers`
result.raw["headers"] = result["headers"].copy()
# archive the content, then reset the file
result.raw["content"] = file.read()
file.seek(0)

except urllib.error.URLError as error:
result.update(
{
Expand Down
25 changes: 24 additions & 1 deletion feedparser/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import typing

import requests
from typing_extensions import NotRequired # >=py311

from .datetimes import _parse_date

Expand All @@ -45,19 +46,41 @@
";q=0.1"
)

# This dict defines the allowable hooks.
# `response` is the only valid hook in `requests`.
# `response.postprocess` is feedparser-specific: it is removed from the dict
# before the request is made, then invoked as ``postprocess(response, result)``
# after a successful request — NOTE(review): see ``get()`` below; confirm this
# is the intended contract before relying on it.
RequestHooks = typing.TypedDict(
    "RequestHooks",
    {
        "response": typing.Union[typing.Callable, typing.Sequence[typing.Callable]],
        "response.postprocess": NotRequired[typing.Callable],
    },
)


def get(url: str, result: dict[str, typing.Any]) -> bytes:
def get(
url: str,
result: dict[str, typing.Any],
hooks: RequestHooks | None = None,
) -> bytes:
_postprocess: typing.Callable | None = None
if hooks is not None:
_postprocess = hooks.pop("response.postprocess", None)
try:
response = requests.get(
url,
headers={"Accept": ACCEPT_HEADER},
timeout=10,
hooks=hooks,
)
except requests.RequestException as exception:
result["bozo"] = True
result["bozo_exception"] = exception
return b""

if _postprocess is not None:
_postprocess(response, result)

# Lowercase the HTTP header keys for comparisons per RFC 2616.
result["headers"] = {k.lower(): v for k, v in response.headers.items()}

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ classifiers = [
dependencies = [
"sgmllib3k==1.0.0",
"requests>=2.20.0",
"typing_extensions>=4.0.0", # NotRequired >=py311
]

[project.urls]
Expand Down
20 changes: 15 additions & 5 deletions tests/test_open_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,39 @@


def test_fileobj():
    """A seekable file-like object is returned as-is, tagged ``"seekable"``."""
    # The stale pre-change call `_open_resource(...).read()` (a diff-render
    # artifact) is removed: it invoked the function a second time for nothing.
    method, filelike = feedparser.api._open_resource(io.BytesIO(b""), {})
    r = filelike.read()
    assert r == b""
    assert method == "seekable"


def testbytes():
    """Raw bytes input is wrapped in an in-memory file, tagged ``"raw_data"``."""
    # Stale pre-change line (diff-render artifact) removed to avoid a
    # redundant second call to _open_resource.
    s = b"<feed><item><title>text</title></item></feed>"
    method, filelike = feedparser.api._open_resource(s, {})
    r = filelike.read()
    assert s == r
    assert method == "raw_data"


def test_string():
    """Bytes content round-trips through _open_resource, tagged ``"raw_data"``."""
    # Stale pre-change line (diff-render artifact) removed.
    s = b"<feed><item><title>text</title></item></feed>"
    method, filelike = feedparser.api._open_resource(s, {})
    r = filelike.read()
    assert s == r
    assert method == "raw_data"


def test_unicode_1():
    """ASCII-only bytes round-trip unchanged, tagged ``"raw_data"``."""
    # Stale pre-change line (diff-render artifact) removed.
    s = b"<feed><item><title>text</title></item></feed>"
    method, filelike = feedparser.api._open_resource(s, {})
    r = filelike.read()
    assert s == r
    assert method == "raw_data"


def test_unicode_2():
    """Bytes containing a literal ``\\u00e9`` escape round-trip unchanged."""
    # Stale pre-change line (diff-render artifact) removed.
    s = rb"<feed><item><title>t\u00e9xt</title></item></feed>"
    method, filelike = feedparser.api._open_resource(s, {})
    r = filelike.read()
    assert s == r
    assert method == "raw_data"