From e195d383f957a0267815c040d397846b1689710a Mon Sep 17 00:00:00 2001 From: Kyryl Andreiev Date: Thu, 15 Jan 2026 00:38:31 -0800 Subject: [PATCH 1/9] Remove performance config, hardcode maximum throughput values Simplify configuration by removing most performance-related env vars and hardcoding values optimized for maximum resource usage: - ThreadPoolExecutor: 500 workers (vs default 32) - aiohttp connections: unlimited (limit=0) - curl_cffi pool: 10000 max_clients - Image downloads: no concurrency limit (removed semaphore) Keep only 3 user-configurable limits via env vars: - MAX_USER_QUEUE_SIZE (default 0 = no limit) - STREAMING_DURATION_THRESHOLD (default 300s) - MAX_VIDEO_DURATION (default 0 = no limit) --- .env.example | 56 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/.env.example b/.env.example index da4dce6..f56eb79 100644 --- a/.env.example +++ b/.env.example @@ -1,17 +1,10 @@ -TG_SERVER=http://telegram-bot-api:8081 -TELEGRAM_API_ID=1234567 -TELEGRAM_API_HASH=abc123 - -DB_URL=postgresql://postgres:postgres@db/ttbot-db # tt-bot BOT_TOKEN=12345:abcde ADMIN_IDS=[1234567] # SECOND_IDS=[1234567] JOIN_LOGS=-1234567 STORAGE_CHANNEL_ID=12345 -# API settings -BOTSTAT=abcdefg12345 -MONETAG_URL=https://example.com/your-monetag-link/ + # stats-bot STATS_BOT_TOKEN=12345:abcde STATS_IDS=[-1234567] @@ -19,32 +12,53 @@ STATS_CHAT=-1234567 STATS_MESSAGE_ID=23 DAILY_STATS_MESSAGE_ID=24 +# API settings +BOTSTAT=abcdefg12345 +MONETAG_URL=https://example.com/your-monetag-link/ + # Logging settings (optional) # LOG_LEVEL=INFO # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL # yt-dlp settings (optional) -YTDLP_COOKIES=cookies.txt +# YTDLP_COOKIES=cookies.txt # Proxy settings (load balancing with multiple proxies) # Path to file with proxy list (one proxy URL per line) -PROXY_FILE=proxies.txt +# PROXY_FILE=proxies.txt # Use proxy only for TikTok API requests, not for media downloads # PROXY_DATA_ONLY=false # Include host machine's direct IP in round-robin rotation # PROXY_INCLUDE_HOST=false +# Performance settings (for high-throughput scenarios) +# ThreadPoolExecutor workers for sync yt-dlp extraction (default: 128) +# THREAD_POOL_SIZE=128 +# Total aiohttp connection pool size for URL resolution (default: 200) +# AIOHTTP_POOL_SIZE=200 +# Per-host connection limit (default: 50) +# AIOHTTP_LIMIT_PER_HOST=50 +# Max parallel image downloads per slideshow (default: 20) +# MAX_CONCURRENT_IMAGES=20 +# curl_cffi connection pool size for media downloads (default: 200) +# CURL_POOL_SIZE=200 +# Use streaming for videos longer than this (seconds, default: 300 = 5 min) +# STREAMING_DURATION_THRESHOLD=300 +# Maximum video duration in seconds (default: 1800 = 30 min, 0 = no limit) +# MAX_VIDEO_DURATION=1800 + +# Queue settings (optional, defaults shown) +# MAX_USER_QUEUE_SIZE=3 + # Retry settings - 3-part retry strategy with proxy rotation # Part 1: URL resolution retries (short URLs to full URLs) -URL_RESOLVE_MAX_RETRIES=3 +# URL_RESOLVE_MAX_RETRIES=3 # Part 2: Video info extraction retries (metadata) -VIDEO_INFO_MAX_RETRIES=3 +# VIDEO_INFO_MAX_RETRIES=3 # Part 3: Download retries (video/images/audio) -DOWNLOAD_MAX_RETRIES=3 - -# Limits (optional, 0 = no limit) -# Max concurrent videos per user in queue -MAX_USER_QUEUE_SIZE=0 -# Use streaming for videos longer than this (seconds, 0 = never stream) -STREAMING_DURATION_THRESHOLD=300 -# Maximum video duration in seconds (0 = no limit) -MAX_VIDEO_DURATION=0 +# DOWNLOAD_MAX_RETRIES=3 + +# Telegram Bot API +TG_SERVER=http://telegram-bot-api:8081 + +# db +DB_URL=postgresql://postgres:postgres@db/ttbot-db From 7295e49416700e1d0134e6e86d0a025cdcac24c2 Mon Sep 17 00:00:00 2001 From: Kyryl Andreiev Date: Thu, 15 Jan 2026 00:49:11 -0800 Subject: [PATCH 2/9] Change positions of values in .env.example --- .env.example | 53 ++++++++++++++++++---------------------------------- 1 file changed, 18 insertions(+), 35 deletions(-) diff --git a/.env.example b/.env.example index f56eb79..2ba7825 100644 --- a/.env.example +++ b/.env.example @@ -1,10 +1,14 @@ +TG_SERVER=http://telegram-bot-api:8081 +DB_URL=postgresql://postgres:postgres@db/ttbot-db # tt-bot BOT_TOKEN=12345:abcde ADMIN_IDS=[1234567] # SECOND_IDS=[1234567] JOIN_LOGS=-1234567 STORAGE_CHANNEL_ID=12345 - +# API settings +BOTSTAT=abcdefg12345 +MONETAG_URL=https://example.com/your-monetag-link/ # stats-bot STATS_BOT_TOKEN=12345:abcde STATS_IDS=[-1234567] @@ -12,53 +16,32 @@ STATS_CHAT=-1234567 STATS_MESSAGE_ID=23 DAILY_STATS_MESSAGE_ID=24 -# API settings -BOTSTAT=abcdefg12345 -MONETAG_URL=https://example.com/your-monetag-link/ - # Logging settings (optional) # LOG_LEVEL=INFO # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL # yt-dlp settings (optional) -# YTDLP_COOKIES=cookies.txt +YTDLP_COOKIES=cookies.txt # Proxy settings (load balancing with multiple proxies) # Path to file with proxy list (one proxy URL per line) -# PROXY_FILE=proxies.txt +PROXY_FILE=proxies.txt # Use proxy only for TikTok API requests, not for media downloads # PROXY_DATA_ONLY=false # Include host machine's direct IP in round-robin rotation # PROXY_INCLUDE_HOST=false -# Performance settings (for high-throughput scenarios) -# ThreadPoolExecutor workers for sync yt-dlp extraction (default: 128) -# THREAD_POOL_SIZE=128 -# Total aiohttp connection pool size for URL resolution (default: 200) -# AIOHTTP_POOL_SIZE=200 -# Per-host connection limit (default: 50) -# AIOHTTP_LIMIT_PER_HOST=50 -# Max parallel image downloads per slideshow (default: 20) -# MAX_CONCURRENT_IMAGES=20 -# curl_cffi connection pool size for media downloads (default: 200) -# CURL_POOL_SIZE=200 -# Use streaming for videos longer than this (seconds, default: 300 = 5 min) -# STREAMING_DURATION_THRESHOLD=300 -# Maximum video duration in seconds (default: 1800 = 30 min, 0 = no limit) -# MAX_VIDEO_DURATION=1800 - -# Queue settings (optional, defaults shown) -# MAX_USER_QUEUE_SIZE=3 - # Retry settings - 3-part retry strategy with proxy rotation # Part 1: URL resolution retries (short URLs to full URLs) -# URL_RESOLVE_MAX_RETRIES=3 +URL_RESOLVE_MAX_RETRIES=3 # Part 2: Video info extraction retries (metadata) -# VIDEO_INFO_MAX_RETRIES=3 +VIDEO_INFO_MAX_RETRIES=3 # Part 3: Download retries (video/images/audio) -# DOWNLOAD_MAX_RETRIES=3 - -# Telegram Bot API -TG_SERVER=http://telegram-bot-api:8081 - -# db -DB_URL=postgresql://postgres:postgres@db/ttbot-db +DOWNLOAD_MAX_RETRIES=3 + +# Limits (optional, 0 = no limit) +# Max concurrent videos per user in queue +MAX_USER_QUEUE_SIZE=0 +# Use streaming for videos longer than this (seconds, 0 = never stream) +STREAMING_DURATION_THRESHOLD=300 +# Maximum video duration in seconds (0 = no limit) +MAX_VIDEO_DURATION=0 From 8d109a8f6c3c58b6bdc0722858f586846a91ef5e Mon Sep 17 00:00:00 2001 From: Kyryl Andreiev Date: Thu, 15 Jan 2026 00:53:16 -0800 Subject: [PATCH 3/9] Add Telegram API credentials to .env.example --- .env.example | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.env.example b/.env.example index 2ba7825..da4dce6 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,7 @@ TG_SERVER=http://telegram-bot-api:8081 +TELEGRAM_API_ID=1234567 +TELEGRAM_API_HASH=abc123 + DB_URL=postgresql://postgres:postgres@db/ttbot-db # tt-bot BOT_TOKEN=12345:abcde From acff6be9cc76c9a79f9f80ce12f1718c3537b0e8 Mon Sep 17 00:00:00 2001 From: Kyryl Andreiev Date: Thu, 15 Jan 2026 21:10:54 -0800 Subject: [PATCH 4/9] Update CODEBASE_MAP.md --- docs/CODEBASE_MAP.md | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/docs/CODEBASE_MAP.md b/docs/CODEBASE_MAP.md index 720af88..4e769f6 100644 --- a/docs/CODEBASE_MAP.md +++ b/docs/CODEBASE_MAP.md @@ -1,12 +1,12 @@ --- -last_mapped: 2026-01-14T22:45:00Z +last_mapped: 2026-01-15T12:00:00Z total_files: 61 -total_tokens: 71358 +total_tokens: 69443 --- # Codebase Map -> Auto-generated by Cartographer. Last mapped: 2026-01-14 +> Auto-generated by Cartographer. Last mapped: 2026-01-15 ## System Overview @@ -168,11 +168,13 @@ tt-bot/ | File | Purpose | Tokens | |------|---------|--------| -| client.py | Main TikTokClient + ProxySession + 3-part retry | 18,676 | +| client.py | Main TikTokClient + ProxySession + 3-part retry | 16,499 | | proxy_manager.py | Thread-safe round-robin proxy rotation | 1,303 | -| models.py | VideoInfo, MusicInfo dataclasses | 1,091 | +| models.py | VideoInfo, MusicInfo dataclasses | 1,079 | | exceptions.py | Exception hierarchy (9 error types) | 233 | +**Note:** `VideoInfo.author` field was removed (unused in codebase). + **Key Classes:** - `TikTokClient`: Main extraction client with integrated retry - `ProxySession`: Manages proxy state per request flow (sticky until retry) @@ -263,8 +265,8 @@ tt-bot/ | File | Purpose | Tokens | |------|---------|--------| -| video_types.py | Video/image sending, slideshow retry, HEIC conversion | 6,374 | -| queue_manager.py | Per-user concurrency limits | 1,021 | +| video_types.py | Video/image sending, slideshow retry, HEIC conversion | 6,322 | +| queue_manager.py | Per-user concurrency limits | 1,029 | | utils.py | Helpers (lang resolution, user registration) | 692 | **Key Functions:** @@ -273,6 +275,10 @@ tt-bot/ - `send_music_result()`: Send audio with cover - `QueueManager.info_queue()`: Acquire/release queue slot +**Note:** Thumbnail download thresholds: +- Inline messages: >30s (lowered from 60s) +- Regular messages: >60s + --- ### Stats Module (`stats/`) @@ -516,6 +522,8 @@ sequenceDiagram | `BOT_TOKEN` | Main bot token | | `DB_URL` | PostgreSQL connection string | | `TG_SERVER` | Telegram API server URL | +| `TELEGRAM_API_ID` | Telegram API ID (for custom Bot API server) | +| `TELEGRAM_API_HASH` | Telegram API hash (for custom Bot API server) | ### Retry Configuration (NEW) | Variable | Default | Description | @@ -527,10 +535,11 @@ sequenceDiagram ### Performance | Variable | Default | Description | |----------|---------|-------------| -| `THREAD_POOL_SIZE` | 128 | ThreadPoolExecutor workers | -| `MAX_USER_QUEUE_SIZE` | 3 | Max concurrent per user | -| `MAX_CONCURRENT_IMAGES` | 20 | Max parallel image downloads | -| `MAX_VIDEO_DURATION` | 1800 | Max video duration (seconds, 0=unlimited) | +| `MAX_USER_QUEUE_SIZE` | 0 | Max concurrent per user (0=unlimited) | +| `MAX_VIDEO_DURATION` | 0 | Max video duration (seconds, 0=unlimited) | +| `STREAMING_DURATION_THRESHOLD` | 300 | Stream videos longer than this (seconds) | | `LOG_LEVEL` | INFO | Logging level | +**Note:** Thread pool (500 workers) and curl_cffi connections (10,000) are hardcoded for maximum throughput. + See `.env.example` for complete list. From 8745d5a8c071d0ee40357e80add728d86a0df366 Mon Sep 17 00:00:00 2001 From: Kyryl Andreiev Date: Thu, 15 Jan 2026 21:30:22 -0800 Subject: [PATCH 5/9] Fix TikTok extraction with proxies by using direct connection for metadata TikTok's browser impersonation (impersonate=True) doesn't work through HTTP proxies, causing extraction to fail with "Unable to extract webpage video data". Changed approach: - Use direct connection (no proxy) for video info extraction with impersonate - Use proxy for media downloads to hide server IP This fixes the issue where all proxy attempts would fail due to TikTok's JavaScript challenge blocking non-browser requests through proxies. --- tiktok_api/client.py | 99 +++++++++++--------------------------------- 1 file changed, 25 insertions(+), 74 deletions(-) diff --git a/tiktok_api/client.py b/tiktok_api/client.py index c36aab5..8ba7359 100644 --- a/tiktok_api/client.py +++ b/tiktok_api/client.py @@ -848,76 +848,25 @@ def _extract_with_context_sync( try: # Use yt-dlp's internal method to get raw webpage data # This also sets up all necessary cookies - # NOTE: When using a proxy, yt-dlp's impersonate=True feature - # doesn't work correctly. We need to download without impersonate. + # NOTE: TikTok's impersonate feature doesn't work through HTTP proxies. + # Always use direct connection for extraction, proxy is used for downloads. + saved_proxy = None # Will store proxy for download context if self.proxy_manager and self.proxy_manager.has_proxies(): - # Download webpage without impersonate to avoid proxy issues - res = ie._download_webpage_handle( - normalized_url, video_id, fatal=False, impersonate=False - ) - if res is False: - raise TikTokExtractionError( - f"Failed to download webpage for video {video_id}" - ) - - webpage, urlh = res - - # Check for login redirect - import urllib.parse - - if urllib.parse.urlparse(urlh.url).path == "/login": - raise TikTokExtractionError( - "TikTok is requiring login for access to this content" - ) - - # Extract data manually using yt-dlp's helper methods - video_data = None - status = -1 - - # Try universal data first - if universal_data := ie._get_universal_data(webpage, video_id): - from yt_dlp.utils import traverse_obj - - status = ( - traverse_obj( - universal_data, - ("webapp.video-detail", "statusCode", {int}), - ) - or 0 - ) - video_data = traverse_obj( - universal_data, - ("webapp.video-detail", "itemInfo", "itemStruct", {dict}), - ) - - # Try sigi state data - elif sigi_data := ie._get_sigi_state(webpage, video_id): - from yt_dlp.utils import traverse_obj - - status = ( - traverse_obj(sigi_data, ("VideoPage", "statusCode", {int})) - or 0 - ) - video_data = traverse_obj( - sigi_data, ("ItemModule", video_id, {dict}) - ) - - # Try next.js data - elif next_data := ie._search_nextjs_data( - webpage, video_id, default={} - ): - from yt_dlp.utils import traverse_obj + # Download webpage without proxy but with impersonate + # Save current proxy setting and temporarily disable it + saved_proxy = ydl_opts.get("proxy") + if "proxy" in ydl_opts: + del ydl_opts["proxy"] + # Recreate YDL without proxy for extraction + ydl.close() + ydl = yt_dlp.YoutubeDL(ydl_opts) + ie = ydl.get_info_extractor("TikTok") + ie.set_downloader(ydl) - status = ( - traverse_obj( - next_data, ("props", "pageProps", "statusCode", {int}) - ) - or 0 - ) - video_data = traverse_obj( - next_data, - ("props", "pageProps", "itemInfo", "itemStruct", {dict}), - ) + # Use standard extraction with impersonate (no proxy) + video_data, status = ie._extract_web_data_and_status( + normalized_url, video_id + ) # Check TikTok status codes for errors # 10204 = Video not found / deleted @@ -929,11 +878,6 @@ def _extract_with_context_sync( return None, "private", None elif status == 10216: return None, "deleted", None # Treat under review as deleted - - if not video_data: - raise TikTokExtractionError( - f"Unable to extract webpage video data (status: {status})" - ) else: # No proxy, use the standard method with impersonate video_data, status = ie._extract_web_data_and_status( @@ -958,11 +902,18 @@ def _extract_with_context_sync( ) from e # Create download context with the live instances + # For proxy path, use the saved_proxy (extraction was without proxy, downloads use proxy) + # For non-proxy path, use request_proxy as before + context_proxy = ( + saved_proxy + if self.proxy_manager and self.proxy_manager.has_proxies() + else request_proxy + ) download_context = { "ydl": ydl, "ie": ie, "referer_url": url, - "proxy": request_proxy, # Store proxy for per-request assignment + "proxy": context_proxy, # Store proxy for per-request assignment } # Success - transfer ownership of ydl to caller via download_context From 248c0501626cf520f70a79dbcf32834a4187edaa Mon Sep 17 00:00:00 2001 From: Kyryl Andreiev Date: Thu, 15 Jan 2026 21:51:22 -0800 Subject: [PATCH 6/9] Improve error handling in ydl recreation for proxy extraction Create the new YoutubeDL instance before closing the old one to ensure we have a valid ydl even if initialization fails. --- tiktok_api/client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tiktok_api/client.py b/tiktok_api/client.py index 8ba7359..f00d911 100644 --- a/tiktok_api/client.py +++ b/tiktok_api/client.py @@ -858,8 +858,11 @@ def _extract_with_context_sync( if "proxy" in ydl_opts: del ydl_opts["proxy"] # Recreate YDL without proxy for extraction - ydl.close() + # Create new instance first to ensure we have a valid ydl + # even if something goes wrong during recreation + old_ydl = ydl ydl = yt_dlp.YoutubeDL(ydl_opts) + old_ydl.close() # Close old instance after new one is ready ie = ydl.get_info_extractor("TikTok") ie.set_downloader(ydl) From 14c1abe74bdebc14145996fa9f738e43f908a82a Mon Sep 17 00:00:00 2001 From: Kyryl Andreiev Date: Thu, 15 Jan 2026 21:56:26 -0800 Subject: [PATCH 7/9] Add video_data validation after TikTok extraction Return extraction error if video_data is None despite a non-error status code, preventing downstream issues from invalid data. --- tiktok_api/client.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tiktok_api/client.py b/tiktok_api/client.py index f00d911..ecb7f76 100644 --- a/tiktok_api/client.py +++ b/tiktok_api/client.py @@ -881,6 +881,11 @@ def _extract_with_context_sync( return None, "private", None elif status == 10216: return None, "deleted", None # Treat under review as deleted + + # Validate that we got video data + if not video_data: + logger.error(f"No video data returned for {video_id} (status={status})") + return None, "extraction", None else: # No proxy, use the standard method with impersonate video_data, status = ie._extract_web_data_and_status( @@ -894,6 +899,11 @@ def _extract_with_context_sync( return None, "private", None elif status == 10216: return None, "deleted", None # Treat under review as deleted + + # Validate that we got video data + if not video_data: + logger.error(f"No video data returned for {video_id} (status={status})") + return None, "extraction", None except AttributeError as e: logger.error( f"Failed to call yt-dlp internal method: {e}. " From 74283324b15c2ab4be33bec3e3746863b7cb89bd Mon Sep 17 00:00:00 2001 From: Kyryl Andreiev Date: Thu, 15 Jan 2026 22:26:05 -0800 Subject: [PATCH 8/9] Always use proxy for TikTok extraction Remove logic that stripped proxy from ydl_opts during extraction. Datacenter IPs are typically blocked by TikTok, so extraction must use the configured proxy to work on servers. --- tiktok_api/client.py | 75 ++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 55 deletions(-) diff --git a/tiktok_api/client.py b/tiktok_api/client.py index ecb7f76..8186ff2 100644 --- a/tiktok_api/client.py +++ b/tiktok_api/client.py @@ -848,62 +848,27 @@ def _extract_with_context_sync( try: # Use yt-dlp's internal method to get raw webpage data # This also sets up all necessary cookies - # NOTE: TikTok's impersonate feature doesn't work through HTTP proxies. - # Always use direct connection for extraction, proxy is used for downloads. - saved_proxy = None # Will store proxy for download context - if self.proxy_manager and self.proxy_manager.has_proxies(): - # Download webpage without proxy but with impersonate - # Save current proxy setting and temporarily disable it - saved_proxy = ydl_opts.get("proxy") - if "proxy" in ydl_opts: - del ydl_opts["proxy"] - # Recreate YDL without proxy for extraction - # Create new instance first to ensure we have a valid ydl - # even if something goes wrong during recreation - old_ydl = ydl - ydl = yt_dlp.YoutubeDL(ydl_opts) - old_ydl.close() # Close old instance after new one is ready - ie = ydl.get_info_extractor("TikTok") - ie.set_downloader(ydl) - - # Use standard extraction with impersonate (no proxy) - video_data, status = ie._extract_web_data_and_status( - normalized_url, video_id - ) - - # Check TikTok status codes for errors - # 10204 = Video not found / deleted - # 10216 = Video under review - # 10222 = Private video - if status == 10204: - return None, "deleted", None - elif status == 10222: - return None, "private", None - elif status == 10216: - return None, "deleted", None # Treat under review as deleted - - # Validate that we got video data - if not video_data: - logger.error(f"No video data returned for {video_id} (status={status})") - return None, "extraction", None - else: - # No proxy, use the standard method with impersonate - video_data, status = ie._extract_web_data_and_status( - normalized_url, video_id - ) + # NOTE: Always use proxy for extraction if configured, as datacenter + # IPs are typically blocked by TikTok. + video_data, status = ie._extract_web_data_and_status( + normalized_url, video_id + ) - # Check TikTok status codes for errors (same as proxy path) - if status == 10204: - return None, "deleted", None - elif status == 10222: - return None, "private", None - elif status == 10216: - return None, "deleted", None # Treat under review as deleted - - # Validate that we got video data - if not video_data: - logger.error(f"No video data returned for {video_id} (status={status})") - return None, "extraction", None + # Check TikTok status codes for errors + # 10204 = Video not found / deleted + # 10216 = Video under review + # 10222 = Private video + if status == 10204: + return None, "deleted", None + elif status == 10222: + return None, "private", None + elif status == 10216: + return None, "deleted", None # Treat under review as deleted + + # Validate that we got video data + if not video_data: + logger.error(f"No video data returned for {video_id} (status={status})") + return None, "extraction", None except AttributeError as e: logger.error( f"Failed to call yt-dlp internal method: {e}. " From bbc59c1e357c84643cc3b80ed756196df97737d5 Mon Sep 17 00:00:00 2001 From: Kyryl Andreiev Date: Thu, 15 Jan 2026 23:28:09 -0800 Subject: [PATCH 9/9] Fix TikTok WAF blocking by using Chrome 120 impersonation TikTok's WAF blocks newer Chrome versions (136+) when used with proxies due to TLS fingerprint / User-Agent mismatches. This commit: - Use fixed Chrome 120 impersonation target instead of auto-selecting newest - Set matching User-Agent header for yt-dlp extraction and media downloads - Add per-proxy session pool to avoid proxy contamination between requests - Bake proxy into curl_cffi sessions at construction time --- tiktok_api/client.py | 171 ++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 93 deletions(-) diff --git a/tiktok_api/client.py b/tiktok_api/client.py index 8186ff2..8d292d3 100644 --- a/tiktok_api/client.py +++ b/tiktok_api/client.py @@ -29,10 +29,22 @@ # This ensures impersonation targets update automatically with yt-dlp try: from yt_dlp.networking._curlcffi import BROWSER_TARGETS, _TARGETS_COMPAT_LOOKUP + from yt_dlp.networking.impersonate import ImpersonateTarget except ImportError: # Fallback if yt-dlp structure changes or curl_cffi not available during import BROWSER_TARGETS = {} _TARGETS_COMPAT_LOOKUP = {} + ImpersonateTarget = None + +# Note: yt-dlp's CurlCFFIRH already handles proxies correctly per-request via +# session.curl.setopt(CurlOpt.PROXY, proxy) in _send(). No monkey-patching needed. +# The session caching by cookiejar is fine because proxy is set on each request. + +# TikTok WAF blocks newer Chrome versions (136+) when used with proxies due to +# TLS fingerprint / User-Agent mismatches. Use Chrome 120 which is known to work. +# The User-Agent must match the impersonation target to avoid WAF detection. +TIKTOK_IMPERSONATE_TARGET = "chrome120" +TIKTOK_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" from .exceptions import ( TikTokDeletedError, @@ -179,112 +191,76 @@ class TikTokClient: _aiohttp_connector: Optional[TCPConnector] = None _connector_lock = threading.Lock() - # curl_cffi session for browser-impersonated media downloads - _curl_session: Optional[CurlAsyncSession] = None + # curl_cffi session pool for browser-impersonated media downloads + # Keyed by proxy URL (None for direct connection) to avoid proxy contamination + _curl_session_pool: dict[Optional[str], CurlAsyncSession] = {} _curl_session_lock = threading.Lock() _impersonate_target: Optional[str] = None @classmethod def _get_impersonate_target(cls) -> str: - """Get the best impersonation target from yt-dlp's BROWSER_TARGETS. - - Uses the same priority as yt-dlp: - 1. Prioritize desktop over mobile (non-ios, non-android) - 2. Prioritize Chrome > Safari > Firefox > Edge > Tor - 3. Prioritize newest version + """Get the impersonation target for TikTok requests. - This ensures the impersonation target updates automatically when you - update yt-dlp, without any hardcoded values. + TikTok's WAF blocks newer Chrome versions (136+) when used with proxies + due to TLS fingerprint / User-Agent mismatches. Chrome 120 is known to + work reliably with proxies. Returns: - curl_cffi-compatible impersonate string (e.g., "chrome136") + curl_cffi-compatible impersonate string (e.g., "chrome120") """ - import itertools - - # Get curl_cffi version as tuple for comparison - try: - curl_cffi_version = tuple( - int(x) for x in curl_cffi.__version__.split(".")[:2] - ) - except (ValueError, AttributeError): - curl_cffi_version = (0, 9) # Minimum supported version - - # Collect all available targets for our curl_cffi version - available_targets: dict[str, Any] = {} - for version, targets in BROWSER_TARGETS.items(): - if curl_cffi_version >= version: - available_targets.update(targets) - - if not available_targets: - # Fallback to a common target if BROWSER_TARGETS is empty - logger.warning( - "No BROWSER_TARGETS available from yt-dlp, using 'chrome' fallback" - ) - return "chrome" - - # Sort by yt-dlp's priority (same logic as _curlcffi.py) - # This ensures we pick the same target yt-dlp would use - sorted_targets = sorted( - available_targets.items(), - key=lambda x: ( - # deprioritize mobile targets since they give very different behavior - x[1].os not in ("ios", "android"), - # prioritize tor < edge < firefox < safari < chrome - ("tor", "edge", "firefox", "safari", "chrome").index(x[1].client) - if x[1].client in ("tor", "edge", "firefox", "safari", "chrome") - else -1, - # prioritize newest version - float(x[1].version) if x[1].version else 0, - # group by os name - x[1].os or "", - ), - reverse=True, - ) - - # Get the best target name - best_name = sorted_targets[0][0] - - # Apply compatibility lookup for older curl_cffi versions - if curl_cffi_version < (0, 11): - best_name = _TARGETS_COMPAT_LOOKUP.get(best_name, best_name) - + # Use fixed Chrome 120 target that works with TikTok's WAF + # This must match TIKTOK_IMPERSONATE_TARGET and TIKTOK_USER_AGENT logger.debug( - f"Selected impersonation target: {best_name} " + f"Using impersonation target: {TIKTOK_IMPERSONATE_TARGET} " f"(curl_cffi {curl_cffi.__version__})" ) - return best_name + return TIKTOK_IMPERSONATE_TARGET @classmethod - def _get_curl_session(cls) -> CurlAsyncSession: - """Get or create shared curl_cffi AsyncSession with browser impersonation. + def _get_curl_session(cls, proxy: Optional[str] = None) -> CurlAsyncSession: + """Get or create curl_cffi AsyncSession for a specific proxy. + + Sessions are pooled by proxy URL to avoid proxy contamination. + curl_cffi bakes the proxy into the session at creation time, so we need + separate sessions for different proxies. The session uses yt-dlp's BROWSER_TARGETS to select the best impersonation target, ensuring TLS fingerprint matches a real browser. + + Args: + proxy: Proxy URL string, or None for direct connection. + + Returns: + CurlAsyncSession configured with the specified proxy. """ with cls._curl_session_lock: - # Check if session needs to be created - # Note: CurlAsyncSession doesn't have is_closed, we track via _curl_session being None - if cls._curl_session is None: - pool_size = 10000 # High value for maximum throughput - cls._impersonate_target = cls._get_impersonate_target() - cls._curl_session = CurlAsyncSession( + # Check if session exists for this proxy + if proxy not in cls._curl_session_pool: + pool_size = 1000 # Per-proxy pool size + if cls._impersonate_target is None: + cls._impersonate_target = cls._get_impersonate_target() + + # Create session with proxy baked in at construction time + cls._curl_session_pool[proxy] = CurlAsyncSession( impersonate=cls._impersonate_target, + proxy=proxy, # curl_cffi converts this to {"all": proxy} max_clients=pool_size, ) logger.info( - f"Created curl_cffi session with impersonate={cls._impersonate_target}, " - f"max_clients={pool_size}" + f"Created curl_cffi session for proxy={_strip_proxy_auth(proxy)}, " + f"impersonate={cls._impersonate_target}, max_clients={pool_size}" ) - return cls._curl_session + return cls._curl_session_pool[proxy] @classmethod async def close_curl_session(cls) -> None: - """Close shared curl_cffi session. Call on application shutdown.""" + """Close all curl_cffi sessions in the pool. Call on application shutdown.""" with cls._curl_session_lock: - session = cls._curl_session - cls._curl_session = None + sessions = list(cls._curl_session_pool.values()) + cls._curl_session_pool.clear() cls._impersonate_target = None - if session is not None: + + for session in sessions: try: await session.close() except Exception as e: @@ -382,10 +358,11 @@ def _get_proxy_info(self) -> str: return "None" def _get_bypass_headers(self, referer_url: str) -> dict[str, str]: - """Get bypass headers dynamically from yt-dlp. + """Get bypass headers for TikTok media downloads. - Uses yt-dlp's standard headers which are updated with each yt-dlp release. - We add Origin and Referer for CORS compliance with TikTok CDN. + Uses headers matching our impersonation target (Chrome 120) to avoid + TikTok WAF detection. The User-Agent must match the curl_cffi + impersonation target. Args: referer_url: The referer URL to set in headers @@ -394,6 +371,8 @@ def _get_bypass_headers(self, referer_url: str) -> dict[str, str]: Dict of headers for media download """ headers = dict(YTDLP_STD_HEADERS) # Copy to avoid mutation + # Override User-Agent to match our impersonation target + headers["User-Agent"] = TIKTOK_USER_AGENT headers["Referer"] = referer_url headers["Origin"] = "https://www.tiktok.com" headers["Accept"] = "*/*" @@ -487,17 +466,21 @@ async def _download_media_async( if not self.data_only_proxy: proxy = download_context.get("proxy") - session = self._get_curl_session() + # Get session with proxy baked in (avoids proxy contamination between requests) + session = self._get_curl_session(proxy=proxy) for attempt in range(1, max_retries + 1): - logger.debug(f"CDN download attempt {attempt}/{max_retries} for media URL") + logger.debug( + f"CDN download attempt {attempt}/{max_retries} for media URL " + f"via {_strip_proxy_auth(proxy)}" + ) response = None try: + # Note: proxy is already configured in the session (baked in at creation) response = await session.get( media_url, headers=headers, cookies=cookies, - proxy=proxy, timeout=60, allow_redirects=True, stream=use_streaming, @@ -770,6 +753,13 @@ def _get_ydl_opts( "no_warnings": True, } + # Set impersonation target and matching User-Agent to avoid TikTok WAF detection. + # TikTok blocks newer Chrome versions (136+) when used with proxies due to + # TLS fingerprint mismatches. Chrome 120 is known to work reliably. + if ImpersonateTarget is not None: + opts["impersonate"] = ImpersonateTarget("chrome", "120", "macos", None) + opts["http_headers"] = {"User-Agent": TIKTOK_USER_AGENT} + # Use explicit proxy decision if it was provided (even if None = direct connection) if explicit_proxy is not ...: if explicit_proxy is not None: @@ -880,18 +870,12 @@ def _extract_with_context_sync( ) from e # Create download context with the live instances - # For proxy path, use the saved_proxy (extraction was without proxy, downloads use proxy) - # For non-proxy path, use request_proxy as before - context_proxy = ( - saved_proxy - if self.proxy_manager and self.proxy_manager.has_proxies() - else request_proxy - ) + # Use the same proxy for downloads that was used for extraction download_context = { "ydl": ydl, "ie": ie, "referer_url": url, - "proxy": context_proxy, # Store proxy for per-request assignment + "proxy": request_proxy if request_proxy is not ... else None, } # Success - transfer ownership of ydl to caller via download_context @@ -1423,15 +1407,16 @@ async def detect_image_format(self, image_url: str, video_info: VideoInfo) -> st if not self.data_only_proxy: proxy = video_info._download_context.get("proxy") - session = self._get_curl_session() + # Get session with proxy baked in (avoids proxy contamination between requests) + session = self._get_curl_session(proxy=proxy) response = None try: + # Note: proxy is already configured in the session (baked in at creation) response = await session.get( image_url, headers=headers, cookies=cookies, - proxy=proxy, timeout=10, allow_redirects=True, )