diff --git a/docs/CODEBASE_MAP.md b/docs/CODEBASE_MAP.md index 720af88..4e769f6 100644 --- a/docs/CODEBASE_MAP.md +++ b/docs/CODEBASE_MAP.md @@ -1,12 +1,12 @@ --- -last_mapped: 2026-01-14T22:45:00Z +last_mapped: 2026-01-15T12:00:00Z total_files: 61 -total_tokens: 71358 +total_tokens: 69443 --- # Codebase Map -> Auto-generated by Cartographer. Last mapped: 2026-01-14 +> Auto-generated by Cartographer. Last mapped: 2026-01-15 ## System Overview @@ -168,11 +168,13 @@ tt-bot/ | File | Purpose | Tokens | |------|---------|--------| -| client.py | Main TikTokClient + ProxySession + 3-part retry | 18,676 | +| client.py | Main TikTokClient + ProxySession + 3-part retry | 16,499 | | proxy_manager.py | Thread-safe round-robin proxy rotation | 1,303 | -| models.py | VideoInfo, MusicInfo dataclasses | 1,091 | +| models.py | VideoInfo, MusicInfo dataclasses | 1,079 | | exceptions.py | Exception hierarchy (9 error types) | 233 | +**Note:** `VideoInfo.author` field was removed (unused in codebase). + **Key Classes:** - `TikTokClient`: Main extraction client with integrated retry - `ProxySession`: Manages proxy state per request flow (sticky until retry) @@ -263,8 +265,8 @@ tt-bot/ | File | Purpose | Tokens | |------|---------|--------| -| video_types.py | Video/image sending, slideshow retry, HEIC conversion | 6,374 | -| queue_manager.py | Per-user concurrency limits | 1,021 | +| video_types.py | Video/image sending, slideshow retry, HEIC conversion | 6,322 | +| queue_manager.py | Per-user concurrency limits | 1,029 | | utils.py | Helpers (lang resolution, user registration) | 692 | **Key Functions:** @@ -273,6 +275,10 @@ tt-bot/ - `send_music_result()`: Send audio with cover - `QueueManager.info_queue()`: Acquire/release queue slot +**Note:** Thumbnail download thresholds: +- Inline messages: >30s (lowered from 60s) +- Regular messages: >60s + --- ### Stats Module (`stats/`) @@ -516,6 +522,8 @@ sequenceDiagram | `BOT_TOKEN` | Main bot token | | `DB_URL` | PostgreSQL connection 
string | | `TG_SERVER` | Telegram API server URL | +| `TELEGRAM_API_ID` | Telegram API ID (for custom Bot API server) | +| `TELEGRAM_API_HASH` | Telegram API hash (for custom Bot API server) | ### Retry Configuration (NEW) | Variable | Default | Description | @@ -527,10 +535,11 @@ sequenceDiagram ### Performance | Variable | Default | Description | |----------|---------|-------------| -| `THREAD_POOL_SIZE` | 128 | ThreadPoolExecutor workers | -| `MAX_USER_QUEUE_SIZE` | 3 | Max concurrent per user | -| `MAX_CONCURRENT_IMAGES` | 20 | Max parallel image downloads | -| `MAX_VIDEO_DURATION` | 1800 | Max video duration (seconds, 0=unlimited) | +| `MAX_USER_QUEUE_SIZE` | 0 | Max concurrent per user (0=unlimited) | +| `MAX_VIDEO_DURATION` | 0 | Max video duration (seconds, 0=unlimited) | +| `STREAMING_DURATION_THRESHOLD` | 300 | Stream videos longer than this (seconds) | | `LOG_LEVEL` | INFO | Logging level | +**Note:** Thread pool (500 workers) and curl_cffi connections (10,000) are hardcoded for maximum throughput. + See `.env.example` for complete list. diff --git a/tiktok_api/client.py b/tiktok_api/client.py index c36aab5..ecb7f76 100644 --- a/tiktok_api/client.py +++ b/tiktok_api/client.py @@ -848,76 +848,28 @@ def _extract_with_context_sync( try: # Use yt-dlp's internal method to get raw webpage data # This also sets up all necessary cookies - # NOTE: When using a proxy, yt-dlp's impersonate=True feature - # doesn't work correctly. We need to download without impersonate. + # NOTE: yt-dlp's impersonate feature doesn't work through HTTP proxies. + # Always use a direct connection for extraction; the proxy is used for downloads. 
+ saved_proxy = None # Will store proxy for download context if self.proxy_manager and self.proxy_manager.has_proxies(): - # Download webpage without impersonate to avoid proxy issues - res = ie._download_webpage_handle( - normalized_url, video_id, fatal=False, impersonate=False + # Download webpage without proxy but with impersonate + # Save current proxy setting and temporarily disable it + saved_proxy = ydl_opts.get("proxy") + if "proxy" in ydl_opts: + del ydl_opts["proxy"] + # Recreate YDL without proxy for extraction + # Create new instance first to ensure we have a valid ydl + # even if something goes wrong during recreation + old_ydl = ydl + ydl = yt_dlp.YoutubeDL(ydl_opts) + old_ydl.close() # Close old instance after new one is ready + ie = ydl.get_info_extractor("TikTok") + ie.set_downloader(ydl) + + # Use standard extraction with impersonate (no proxy) + video_data, status = ie._extract_web_data_and_status( + normalized_url, video_id ) - if res is False: - raise TikTokExtractionError( - f"Failed to download webpage for video {video_id}" - ) - - webpage, urlh = res - - # Check for login redirect - import urllib.parse - - if urllib.parse.urlparse(urlh.url).path == "/login": - raise TikTokExtractionError( - "TikTok is requiring login for access to this content" - ) - - # Extract data manually using yt-dlp's helper methods - video_data = None - status = -1 - - # Try universal data first - if universal_data := ie._get_universal_data(webpage, video_id): - from yt_dlp.utils import traverse_obj - - status = ( - traverse_obj( - universal_data, - ("webapp.video-detail", "statusCode", {int}), - ) - or 0 - ) - video_data = traverse_obj( - universal_data, - ("webapp.video-detail", "itemInfo", "itemStruct", {dict}), - ) - - # Try sigi state data - elif sigi_data := ie._get_sigi_state(webpage, video_id): - from yt_dlp.utils import traverse_obj - - status = ( - traverse_obj(sigi_data, ("VideoPage", "statusCode", {int})) - or 0 - ) - video_data = traverse_obj( - 
sigi_data, ("ItemModule", video_id, {dict}) - ) - - # Try next.js data - elif next_data := ie._search_nextjs_data( - webpage, video_id, default={} - ): - from yt_dlp.utils import traverse_obj - - status = ( - traverse_obj( - next_data, ("props", "pageProps", "statusCode", {int}) - ) - or 0 - ) - video_data = traverse_obj( - next_data, - ("props", "pageProps", "itemInfo", "itemStruct", {dict}), - ) # Check TikTok status codes for errors # 10204 = Video not found / deleted @@ -930,10 +882,10 @@ def _extract_with_context_sync( elif status == 10216: return None, "deleted", None # Treat under review as deleted + # Validate that we got video data if not video_data: - raise TikTokExtractionError( - f"Unable to extract webpage video data (status: {status})" - ) + logger.error(f"No video data returned for {video_id} (status={status})") + return None, "extraction", None else: # No proxy, use the standard method with impersonate video_data, status = ie._extract_web_data_and_status( @@ -947,6 +899,11 @@ def _extract_with_context_sync( return None, "private", None elif status == 10216: return None, "deleted", None # Treat under review as deleted + + # Validate that we got video data + if not video_data: + logger.error(f"No video data returned for {video_id} (status={status})") + return None, "extraction", None except AttributeError as e: logger.error( f"Failed to call yt-dlp internal method: {e}. 
" @@ -958,11 +915,18 @@ def _extract_with_context_sync( ) from e # Create download context with the live instances + # For proxy path, use the saved_proxy (extraction was without proxy, downloads use proxy) + # For non-proxy path, use request_proxy as before + context_proxy = ( + saved_proxy + if self.proxy_manager and self.proxy_manager.has_proxies() + else request_proxy + ) download_context = { "ydl": ydl, "ie": ie, "referer_url": url, - "proxy": request_proxy, # Store proxy for per-request assignment + "proxy": context_proxy, # Store proxy for per-request assignment } # Success - transfer ownership of ydl to caller via download_context