5 changes: 2 additions & 3 deletions src/scrape/base_scraper.py
@@ -6,7 +6,6 @@
from src.utility.lib import CustomException, Logger
from src.utility.utils import EndpointType, Utils, ViewType
from typing import Dict, Type, TypeVar
from urllib.parse import urlencode

T = TypeVar("T", bound="BaseScraper")

@@ -23,15 +22,15 @@ def scrape(cls: Type[T], endpoint: Endpoint, req: Request) -> T | None:

if endpoint.type == EndpointType.QUERY:
params = req.query_params
url = Utils.create_filmarks_link(endpoint.path + "?" + urlencode(params))
url = Utils.create_filmarks_link(endpoint.path + "?" + Utils.safe_encode(params))

elif endpoint.type == EndpointType.PATH:
params = req.path_params
url = Utils.create_filmarks_link(endpoint.path.format(**params))

elif endpoint.type == EndpointType.COMBINED:
params = {**req.query_params, **req.path_params}
url = Utils.create_filmarks_link(endpoint.path.format(**req.path_params) + "?" + urlencode(req.query_params))
url = Utils.create_filmarks_link(endpoint.path.format(**req.path_params) + "?" + Utils.safe_encode(req.query_params))

else:
raise ValueError(f"Unexpected EndpointType: {endpoint.type}") # pragma: no cover
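Not part of the diff: a minimal sketch of why Utils.safe_encode replaces urlencode above, assuming the intent is to avoid double-encoding query parameters that already arrive percent-encoded (the sample value below is illustrative only).

from urllib.parse import unquote, urlencode

# A hypothetical query parameter that arrives already percent-encoded
# (UTF-8 encoding of the tag 駄作 used in the tests further down).
params = {"q": "%E9%A7%84%E4%BD%9C", "page": "1"}

# Plain urlencode escapes the percent signs again, double-encoding the value.
print(urlencode(params))   # q=%25E9%25A7%2584%25E4%25BD%259C&page=1

# safe_encode (added in src/utility/utils.py below) unquotes each value first,
# so the final URL is encoded exactly once.
decoded = {k: unquote(v) for k, v in params.items()}
print(urlencode(decoded))  # q=%E9%A7%84%E4%BD%9C&page=1
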
2 changes: 2 additions & 0 deletions src/scrape/info/info_anime_scraper.py
@@ -75,6 +75,8 @@ def set_info_data(self) -> None:
value = self._get_person_info(field)
if value: self.data[field.key] = value

self.data["episodes"] = self._get_episode_info()

Logger.info(self.get_logging(id=[self.series_id, self.season_id], text=self.data))

def set_review_data(self) -> None:
2 changes: 2 additions & 0 deletions src/scrape/info/info_drama_scraper.py
@@ -70,6 +70,8 @@ def set_info_data(self) -> None:
value = self._get_person_info(field)
if value: self.data[field.key] = value

self.data["episodes"] = self._get_episode_info()

Logger.info(self.get_logging(id=[self.series_id, self.season_id], text=self.data))

def set_review_data(self) -> None:
14 changes: 14 additions & 0 deletions src/scrape/info/info_scraper.py
@@ -131,6 +131,20 @@ def _get_person_info(self, field: PersonInfo) -> List[Dict[str, Any]] | None:
in info_elem.find_next_sibling("ul").find_all("li")
] if info_elem else None

def _get_episode_info(self) -> List[Dict[str, Any]] | None:
info_elem = self.detail_head.select("div.c2-episode-list-item")

return [
Utils.create_episode_info(
episode=episode.select_one("div.c2-episode-list-item__header-text-number").text,
title=episode.select_one("div.c2-episode-list-item__header-text-title").text,
outline=epi.text.replace("\n", "") if (epi := episode.select_one("div.c2-episode-list-item__outline-text")) else "",
link=episode.select_one("a").attrs["href"]
)
for episode
in info_elem
] if info_elem else None

def _is_reviews_empty(self) -> Tag | None:
condition = self.detail_foot.select_one("div.p2-empty-reviews-message__text")

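Not part of the diff: a small, self-contained sketch of how the selectors in _get_episode_info map onto an episode list item. The markup below is a simplified assumption for illustration; only the class names and the resulting values are taken from this PR.

from bs4 import BeautifulSoup

# Assumed, simplified markup for a single episode entry; the real Filmarks page
# may differ, but the class names match the selectors used above.
html = """
<div class="c2-episode-list-item">
  <a href="/animes/2592/3304/episodes/73115">
    <div class="c2-episode-list-item__header-text-number">1</div>
    <div class="c2-episode-list-item__header-text-title">漂流? 冒険の島!</div>
    <div class="c2-episode-list-item__outline-text">干ばつ。洪水。真夏に降る雪・・・。</div>
  </a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
for item in soup.select("div.c2-episode-list-item"):
    number = item.select_one("div.c2-episode-list-item__header-text-number").text
    title = item.select_one("div.c2-episode-list-item__header-text-title").text
    outline_elem = item.select_one("div.c2-episode-list-item__outline-text")
    outline = outline_elem.text.replace("\n", "") if outline_elem else ""
    link = item.select_one("a").attrs["href"]
    print(number, title, link)  # 1 漂流? 冒険の島! /animes/2592/3304/episodes/73115
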
21 changes: 20 additions & 1 deletion src/utility/utils.py
@@ -1,9 +1,10 @@
from datetime import datetime, timezone
from enum import Enum
from fastapi.datastructures import QueryParams
from msgspec import Struct
from src.utility.models import AnimeDataClip, AnimeDataMark, DramaDataClip, DramaDataMark, MovieDataClip, MovieDataMark
from typing import Any, Dict, Set, Tuple
from urllib.parse import urljoin
from urllib.parse import unquote, urlencode, urljoin


class EndpointType(str, Enum):
@@ -102,6 +103,11 @@ class Utils:
@staticmethod
def get_scrape_date() -> datetime:
return datetime.now(timezone.utc).isoformat(sep=" ", timespec="microseconds")

@staticmethod
def safe_encode(params: QueryParams) -> str:
params = {k: unquote(v) for k, v in params.items()}
return urlencode(params)

@staticmethod
def create_filmarks_link(url: str) -> str:
Expand Down Expand Up @@ -129,6 +135,19 @@ def create_person_info(name: str, link: str, character: str = "") -> Dict[str, A

return person_info

@staticmethod
def create_episode_info(episode: str, title: str, link: str, outline: str = "") -> Dict[str, Any]:
episode_info = {}

episode_info["episode"] = int(episode)
episode_info["title"] = title
if outline:
episode_info["outline"] = outline
episode_info["id"] = int(link.split("/")[-1])
episode_info["link"] = Utils.create_filmarks_link(link)

return episode_info

@staticmethod
def create_review_info(user_name: str, user_link: str, review_date: str, review_rating: str, review_contents: str = "", review_link: str = "") -> Dict[str, Any]:
review_info = {}
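Not part of the diff: a quick usage sketch of the new Utils.create_episode_info helper, with inputs mirroring the scraped strings seen in the test data below. The exact output assumes create_filmarks_link prefixes the Filmarks base URL, as the expected test values suggest.

from src.utility.utils import Utils

# Inputs are the raw strings pulled out of the page by _get_episode_info.
episode_info = Utils.create_episode_info(
    episode="1",
    title="漂流? 冒険の島!",
    link="/animes/2592/3304/episodes/73115",
    outline="",  # an empty outline is left out of the result
)
# Expected shape (assuming create_filmarks_link joins with https://filmarks.com):
# {"episode": 1, "title": "漂流? 冒険の島!", "id": 73115,
#  "link": "https://filmarks.com/animes/2592/3304/episodes/73115"}
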
33 changes: 0 additions & 33 deletions tests/anime/test_anime_api.py
@@ -1,6 +1,4 @@
from pydantic import Field
from requests.exceptions import RequestException
from src.utility.models import ListParams, ReviewParams, SearchParams
from tests.conftest import get_json_val
import pytest

@@ -150,34 +148,3 @@ def test_scrape_error_503_service_unavailable_session(client_nc, mocker, path, c
assert resp.status_code == 503
assert get_json_val(resp_data, "$.detail") == "The service is currently unavailable."
assert "Request to Filmarks failed: 'Testing - 503 Service Unavailable'" in caplog.text


@pytest.mark.parametrize("path", [
"/animes/2592/3304/reviews?page=9999999999999999999",
"/list-anime/trend?page=999999999999999999",
"/list-anime/vod/prime_video?page=999999999999999999",
"/list-anime/year/2020s?page=999999999999999999",
"/list-anime/year/2025?page=999999999999999999",
"/list-anime/year/2019/1?page=999999999999999999",
"/list-anime/year/2019/99",
"/list-anime/company/41?page=999999999999999999",
"/list-anime/tag/駄作?page=999999999999999999",
"/list-anime/person/274563?page=999999999999999999",
])
def test_scrape_error_503_service_unavailable_filmarks(client_nc, path, caplog) -> None:
class CustomParams():
page: int = Field(1, gt=0)
client_nc.app.dependency_overrides[SearchParams] = CustomParams
client_nc.app.dependency_overrides[ReviewParams] = CustomParams
client_nc.app.dependency_overrides[ListParams] = CustomParams

resp = client_nc.get(path)
resp_data = resp.json()

assert resp.status_code == 503
assert get_json_val(resp_data, "$.detail") == "The service is currently unavailable."
assert "Filmarks is temporarily unavailable" in caplog.text

del client_nc.app.dependency_overrides[SearchParams]
del client_nc.app.dependency_overrides[ReviewParams]
del client_nc.app.dependency_overrides[ListParams]
39 changes: 39 additions & 0 deletions tests/anime/test_anime_info.py
@@ -185,6 +185,15 @@ def test_review_input_more_than_max_threshold(client_nc, path) -> None:
"link": "https://filmarks.com/people/214282",
},
],
"episodes": [
{
"episode": 1,
"title": "漂流? 冒険の島!",
"outline": "干ばつ。洪水。真夏に降る雪・・・。世界中がおかしかったその夏。日本からは見えるはずのないオーロラを目撃した太一たちは、オーロラの裂け目から飛来した謎の光に異世界へと連れ去られてしまう。すべてが未知のその世界で太一たちが最初に出会ったのは、自分たちを「待っていた」という奇妙な生物、デジタルモンスターだった。",
"id": 73115,
"link": "https://filmarks.com/animes/2592/3304/episodes/73115"
},
],
},
],
)
@@ -214,6 +223,7 @@ def test_info_with_results_single_1(client_nc, test_data, caplog) -> None:
for field in fields:
assert get_json_val(resp_data, f"$.data.{field}") == get_json_val(test_data, f"$.{field}")

assert get_json_val(test_data, "$.episodes[0]") in get_json_val(resp_data, "$.data.episodes")
assert get_json_val(resp_data, "$.data.rating") == pytest.approx(get_json_val(test_data, "$.rating"), abs=0.5)
assert get_json_val(resp_data, "$.data.mark_count") >= get_json_val(test_data, "$.mark_count")
assert get_json_val(resp_data, "$.data.clip_count") >= get_json_val(test_data, "$.clip_count")
@@ -302,6 +312,14 @@ def test_info_with_results_single_1(client_nc, test_data, caplog) -> None:
"link": "https://filmarks.com/people/275804",
},
],
"episodes": [
{
"episode": 1,
"title": "第1話",
"id": 39867,
"link": "https://filmarks.com/animes/1533/2046/episodes/39867"
},
],
},
],
)
@@ -331,6 +349,7 @@ def test_info_with_results_single_2(client_nc, test_data, caplog) -> None:
for field in fields:
assert get_json_val(resp_data, f"$.data.{field}") == get_json_val(test_data, f"$.{field}")

assert get_json_val(test_data, "$.episodes[0]") in get_json_val(resp_data, "$.data.episodes")
assert get_json_val(resp_data, "$.data.rating") == pytest.approx(get_json_val(test_data, "$.rating"), abs=0.5)
assert get_json_val(resp_data, "$.data.mark_count") >= get_json_val(test_data, "$.mark_count")
assert get_json_val(resp_data, "$.data.clip_count") >= get_json_val(test_data, "$.clip_count")
@@ -413,6 +432,15 @@ def test_info_with_results_single_2(client_nc, test_data, caplog) -> None:
"link": "https://filmarks.com/people/307449",
},
],
"episodes": [
{
"episode": 1,
"title": "第1章 目覚め",
"outline": "長い眠りから目覚めたアン。火の国の軍艦に乗っていることに気づき驚くが、それは仲間が乗っ取ったものだった。これまでの経緯を聞き、アンは死んだことになっていると知ってショックを受ける。サカの計画では、日食の時に火の国を奇襲するとのことだったが、アンは自分一人で戦うべきだと飛び出す。3年ぶりに火の国に戻ったズーコは父である王と再会。王はズーコがアバターを殺したと聞いて喜ぶが、ズーコはアンが生きているのではと思っていた。",
"id": 114131,
"link": "https://filmarks.com/animes/3691/4983/episodes/114131"
},
],
},
],
)
@@ -441,6 +469,7 @@ def test_info_with_results_single_3(client_nc, test_data, caplog) -> None:
for field in fields:
assert get_json_val(resp_data, f"$.data.{field}") == get_json_val(test_data, f"$.{field}")

assert get_json_val(test_data, "$.episodes[0]") in get_json_val(resp_data, "$.data.episodes")
assert get_json_val(resp_data, "$.data.rating") == pytest.approx(get_json_val(test_data, "$.rating"), abs=0.5)
assert get_json_val(resp_data, "$.data.mark_count") >= get_json_val(test_data, "$.mark_count")
assert get_json_val(resp_data, "$.data.clip_count") >= get_json_val(test_data, "$.clip_count")
@@ -514,6 +543,15 @@ def test_info_with_results_single_3(client_nc, test_data, caplog) -> None:
"link": "https://filmarks.com/people/287199"
}
],
"episodes": [
{
"episode": 1,
"title": "2016年のきみへ",
"outline": "ハッピーを広めるため地球に降り立ったタコピーは、人間の女の子しずかと出会う。ピンチを救ってもらったタコピーは、不思議な力を持つハッピー道具で彼女のために奔走するのだが、しずかは笑顔すら見せない。どうやらその背景には学校のお友達とおうちの複雑な事情が関係しているようで……。",
"id": 143599,
"link": "https://filmarks.com/animes/4809/6518/episodes/143599"
},
],
},
],
)
@@ -544,6 +582,7 @@ def test_info_with_results_single_4(client_nc, test_data, caplog) -> None:
for field in fields:
assert get_json_val(resp_data, f"$.data.{field}") == get_json_val(test_data, f"$.{field}")

assert get_json_val(test_data, "$.episodes[0]") in get_json_val(resp_data, "$.data.episodes")
assert get_json_val(resp_data, "$.data.rating") == pytest.approx(get_json_val(test_data, "$.rating"), abs=0.5)
assert get_json_val(resp_data, "$.data.mark_count") >= get_json_val(test_data, "$.mark_count")
assert get_json_val(resp_data, "$.data.clip_count") >= get_json_val(test_data, "$.clip_count")
33 changes: 0 additions & 33 deletions tests/drama/test_drama_api.py
@@ -1,6 +1,4 @@
from pydantic import Field
from requests.exceptions import RequestException
from src.utility.models import ListParams, ReviewParams, SearchParams
from tests.conftest import get_json_val
import pytest

@@ -152,34 +150,3 @@ def test_scrape_error_503_service_unavailable_session(client_nc, mocker, path, c
assert resp.status_code == 503
assert get_json_val(resp_data, "$.detail") == "The service is currently unavailable."
assert "Request to Filmarks failed: 'Testing - 503 Service Unavailable'" in caplog.text


@pytest.mark.parametrize("path", [
"/dramas/6055/8586/reviews?page=9999999999999999999",
"/list-drama/trend?page=999999999999999999",
"/list-drama/vod/prime_video?page=999999999999999999",
"/list-drama/year/2020s?page=999999999999999999",
"/list-drama/year/2025?page=999999999999999999",
"/list-drama/country/144?page=999999999999999999",
"/list-drama/genre/9?page=999999999999999999",
"/list-drama/tag/駄作?page=999999999999999999",
"/list-drama/person/25499?page=999999999999999999",
])
def test_scrape_error_503_service_unavailable_filmarks(client_nc, path, caplog) -> None:
class CustomParams():
page: int = Field(1, gt=0)
client_nc.app.dependency_overrides[SearchParams] = CustomParams
client_nc.app.dependency_overrides[ReviewParams] = CustomParams
client_nc.app.dependency_overrides[ListParams] = CustomParams

resp = client_nc.get(path)
resp_data = resp.json()

assert resp.status_code == 503
assert get_json_val(resp_data, "$.detail") == "The service is currently unavailable."
assert "Filmarks is temporarily unavailable" in caplog.text

del client_nc.app.dependency_overrides[SearchParams]
del client_nc.app.dependency_overrides[ReviewParams]
del client_nc.app.dependency_overrides[ListParams]
