Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 8 additions & 29 deletions src/intelstream/adapters/strategies/llm_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,8 @@ async def discover(
if isinstance(posts_data, list):
posts = []
for p in posts_data:
if (
isinstance(p, dict)
and isinstance(p.get("url"), str)
and p.get("url")
):
posts.append(
DiscoveredPost(url=p["url"], title=p.get("title", ""))
)
if isinstance(p, dict) and isinstance(p.get("url"), str) and p.get("url"):
posts.append(DiscoveredPost(url=p["url"], title=p.get("title", "")))
if posts:
logger.debug(
"Using cached LLM extraction",
Expand Down Expand Up @@ -131,12 +125,7 @@ def _get_content_hash(self, html: str) -> str:
):
tag.decompose()

main = (
soup.find("main")
or soup.find("article")
or soup.find(id="content")
or soup.body
)
main = soup.find("main") or soup.find("article") or soup.find(id="content") or soup.body

if main:
text = " ".join(main.get_text().split())
Expand All @@ -150,16 +139,10 @@ async def _fetch_html(self, url: str) -> str | None:
}
try:
if self._http_client:
response = await self._http_client.get(
url, headers=headers, follow_redirects=True
)
response = await self._http_client.get(url, headers=headers, follow_redirects=True)
else:
async with httpx.AsyncClient(
timeout=get_settings().http_timeout_seconds
) as client:
response = await client.get(
url, headers=headers, follow_redirects=True
)
async with httpx.AsyncClient(timeout=get_settings().http_timeout_seconds) as client:
response = await client.get(url, headers=headers, follow_redirects=True)
response.raise_for_status()
return response.text
except httpx.HTTPError as e:
Expand All @@ -169,9 +152,7 @@ async def _fetch_html(self, url: str) -> str | None:
def _clean_html(self, html: str) -> str:
soup = BeautifulSoup(html, "lxml")

for tag in soup.find_all(
["script", "style", "noscript", "svg", "path", "iframe"]
):
for tag in soup.find_all(["script", "style", "noscript", "svg", "path", "iframe"]):
tag.decompose()

for tag in soup.find_all(True):
Expand Down Expand Up @@ -286,7 +267,5 @@ def parse_and_validate(data: str) -> list[dict[str, str]] | None:
if result is not None:
return result

logger.warning(
"Failed to extract JSON from LLM response", response_preview=text[:200]
)
logger.warning("Failed to extract JSON from LLM response", response_preview=text[:200])
return []
16 changes: 4 additions & 12 deletions src/intelstream/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,12 @@ class Settings(BaseSettings):
description="LLM provider for summarization: anthropic, openai, gemini, or kimi",
)

anthropic_api_key: str | None = Field(
default=None, description="Anthropic API key for Claude"
)
anthropic_api_key: str | None = Field(default=None, description="Anthropic API key for Claude")
openai_api_key: str | None = Field(default=None, description="OpenAI API key")
gemini_api_key: str | None = Field(
default=None, description="Google Gemini API key"
)
kimi_api_key: str | None = Field(
default=None, description="Kimi (Moonshot AI) API key"
)
gemini_api_key: str | None = Field(default=None, description="Google Gemini API key")
kimi_api_key: str | None = Field(default=None, description="Kimi (Moonshot AI) API key")

youtube_api_key: str | None = Field(
default=None, description="YouTube Data API key (optional)"
)
youtube_api_key: str | None = Field(default=None, description="YouTube Data API key (optional)")

twitter_bearer_token: str | None = Field(
default=None,
Expand Down
3 changes: 1 addition & 2 deletions src/intelstream/discord/cogs/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,11 @@ async def search(self, interaction: discord.Interaction, query: str) -> None:

title = _truncate(item.title, 100)
preview = _truncate(item.summary or "", MAX_SUMMARY_PREVIEW)
score_pct = f"{result.score * 100:.0f}%"

value_parts = []
if item.original_url:
value_parts.append(f"[Link]({item.original_url})")
value_parts.append(f"Relevance: {score_pct}")
value_parts.append(f"Similarity score: {result.score:.2f}")
if preview:
value_parts.append(preview)

Expand Down
24 changes: 6 additions & 18 deletions src/intelstream/services/page_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,7 @@ async def analyze(self, url: str) -> ExtractionProfile:

validation_result = self._validate_profile(html, profile)
if not validation_result["valid"]:
raise PageAnalysisError(
f"Profile validation failed: {validation_result['reason']}"
)
raise PageAnalysisError(f"Profile validation failed: {validation_result['reason']}")

logger.info(
"Page analysis complete",
Expand All @@ -141,24 +139,16 @@ async def _fetch_html(self, url: str) -> str:

try:
if self._http_client:
response = await self._http_client.get(
url, headers=headers, follow_redirects=True
)
response = await self._http_client.get(url, headers=headers, follow_redirects=True)
else:
async with httpx.AsyncClient(
timeout=get_settings().http_timeout_seconds
) as client:
response = await client.get(
url, headers=headers, follow_redirects=True
)
async with httpx.AsyncClient(timeout=get_settings().http_timeout_seconds) as client:
response = await client.get(url, headers=headers, follow_redirects=True)

response.raise_for_status()
return response.text

except httpx.HTTPStatusError as e:
raise PageAnalysisError(
f"Failed to fetch page: HTTP {e.response.status_code}"
) from e
raise PageAnalysisError(f"Failed to fetch page: HTTP {e.response.status_code}") from e
except httpx.RequestError as e:
raise PageAnalysisError(f"Failed to fetch page: {e}") from e

Expand Down Expand Up @@ -255,9 +245,7 @@ async def _extract_profile_with_llm(self, url: str, html: str) -> dict[str, Any]
logger.error("Anthropic API error during page analysis", error=str(e))
raise PageAnalysisError(f"LLM API error: {e}") from e

def _validate_profile(
self, html: str, profile: ExtractionProfile
) -> dict[str, Any]:
def _validate_profile(self, html: str, profile: ExtractionProfile) -> dict[str, Any]:
soup = BeautifulSoup(html, "lxml")

try:
Expand Down
40 changes: 10 additions & 30 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,7 @@ def test_settings_from_env(self, monkeypatch: pytest.MonkeyPatch) -> None:
assert settings.default_poll_interval_minutes == 5
assert settings.log_level == "INFO"

def test_settings_with_optional_youtube(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
def test_settings_with_optional_youtube(self, monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test_token")
monkeypatch.setenv("DISCORD_GUILD_ID", "123456789")
monkeypatch.setenv("DISCORD_CHANNEL_ID", "987654321")
Expand All @@ -42,9 +40,7 @@ def test_settings_with_optional_youtube(

assert settings.youtube_api_key == "yt-api-key"

def test_settings_poll_interval_bounds(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
def test_settings_poll_interval_bounds(self, monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test_token")
monkeypatch.setenv("DISCORD_GUILD_ID", "123456789")
monkeypatch.setenv("DISCORD_CHANNEL_ID", "987654321")
Expand Down Expand Up @@ -103,9 +99,7 @@ def test_repr_handles_none_keys(self, monkeypatch: pytest.MonkeyPatch) -> None:
assert "youtube_api_key=None" in repr_str
assert "openai_api_key=None" in repr_str

def test_empty_discord_bot_token_rejected(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
def test_empty_discord_bot_token_rejected(self, monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setenv("DISCORD_BOT_TOKEN", "")
monkeypatch.setenv("DISCORD_GUILD_ID", "123456789")
monkeypatch.setenv("DISCORD_OWNER_ID", "111222333")
Expand Down Expand Up @@ -137,9 +131,7 @@ def test_llm_api_key_returns_correct_provider_key(
settings = Settings(_env_file=None)
assert settings.llm_api_key == "sk-openai-test"

def test_llm_api_key_raises_when_key_missing(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
def test_llm_api_key_raises_when_key_missing(self, monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test_token")
monkeypatch.setenv("DISCORD_GUILD_ID", "123456789")
monkeypatch.setenv("DISCORD_OWNER_ID", "111222333")
Expand All @@ -149,9 +141,7 @@ def test_llm_api_key_raises_when_key_missing(
with pytest.raises(ValidationError, match="No API key configured"):
Settings(_env_file=None)

def test_invalid_llm_provider_rejected(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
def test_invalid_llm_provider_rejected(self, monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test_token")
monkeypatch.setenv("DISCORD_GUILD_ID", "123456789")
monkeypatch.setenv("DISCORD_OWNER_ID", "111222333")
Expand All @@ -161,9 +151,7 @@ def test_invalid_llm_provider_rejected(
with pytest.raises(ValidationError):
Settings(_env_file=None)

def test_valid_llm_providers_accepted(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
def test_valid_llm_providers_accepted(self, monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test_token")
monkeypatch.setenv("DISCORD_GUILD_ID", "123456789")
monkeypatch.setenv("DISCORD_OWNER_ID", "111222333")
Expand All @@ -181,9 +169,7 @@ def test_valid_llm_providers_accepted(
assert settings.llm_api_key == key_val
monkeypatch.delenv(key_env, raising=False)

def test_missing_api_key_fails_at_construction(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
def test_missing_api_key_fails_at_construction(self, monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test_token")
monkeypatch.setenv("DISCORD_GUILD_ID", "123456789")
monkeypatch.setenv("DISCORD_OWNER_ID", "111222333")
Expand Down Expand Up @@ -261,9 +247,7 @@ def test_explicit_model_overrides_provider_default(
assert settings.summary_model == "my-custom-model"
assert settings.summary_model_interactive == "my-custom-interactive"

def test_partial_override_uses_default_for_unset(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
def test_partial_override_uses_default_for_unset(self, monkeypatch: pytest.MonkeyPatch) -> None:
self._base_env(monkeypatch)
monkeypatch.setenv("LLM_PROVIDER", "openai")
monkeypatch.setenv("OPENAI_API_KEY", "sk-openai-test")
Expand Down Expand Up @@ -301,9 +285,7 @@ def test_falls_back_to_default(self, monkeypatch: pytest.MonkeyPatch) -> None:
assert settings.get_poll_interval(SourceType.YOUTUBE) == 10
assert settings.get_poll_interval(SourceType.RSS) == 10

def test_type_specific_overrides_default(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
def test_type_specific_overrides_default(self, monkeypatch: pytest.MonkeyPatch) -> None:
self._base_env(monkeypatch)
monkeypatch.setenv("DEFAULT_POLL_INTERVAL_MINUTES", "5")
monkeypatch.setenv("TWITTER_POLL_INTERVAL_MINUTES", "20")
Expand All @@ -329,9 +311,7 @@ def test_returns_parent_directory_for_sqlite_file(self) -> None:
assert result == Path("./data")

def test_returns_parent_for_absolute_path(self) -> None:
result = get_database_directory(
"sqlite+aiosqlite:////home/user/data/intelstream.db"
)
result = get_database_directory("sqlite+aiosqlite:////home/user/data/intelstream.db")
assert result == Path("/home/user/data")

def test_returns_none_for_memory_database(self) -> None:
Expand Down
13 changes: 3 additions & 10 deletions tests/test_discord/test_channel_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,7 @@ async def test_summary_with_different_channel(self, cog, mock_interaction):
messages = [_make_message(f"msg {i}", f"user{i}") for i in range(6)]
target_channel.history = MagicMock(return_value=_async_iter(messages))

await cog.summary.callback(
cog, mock_interaction, count=200, channel=target_channel
)
await cog.summary.callback(cog, mock_interaction, count=200, channel=target_channel)

target_channel.history.assert_called_once()
sent_text = mock_interaction.followup.send.call_args.args[0]
Expand All @@ -183,19 +181,14 @@ async def test_summary_handles_summarization_failure(self, cog, mock_interaction
channel = mock_interaction.channel
channel.history = MagicMock(return_value=_async_iter(messages))

cog._summarizer.summarize_chat = AsyncMock(
side_effect=SummarizationError("API error")
)
cog._summarizer.summarize_chat = AsyncMock(side_effect=SummarizationError("API error"))

await cog.summary.callback(cog, mock_interaction, count=200, channel=None)

mock_interaction.followup.send.assert_called_once()
sent_kwargs = mock_interaction.followup.send.call_args.kwargs
assert sent_kwargs.get("ephemeral") is True
assert (
"Failed to generate summary"
in mock_interaction.followup.send.call_args.args[0]
)
assert "Failed to generate summary" in mock_interaction.followup.send.call_args.args[0]

async def test_summary_filters_empty_messages(self, cog, mock_interaction):
messages = [
Expand Down
Loading
Loading