From e195d383f957a0267815c040d397846b1689710a Mon Sep 17 00:00:00 2001
From: Kyryl Andreiev <karilaa@karilaa.dev>
Date: Thu, 15 Jan 2026 00:38:31 -0800
Subject: [PATCH 1/9] Remove performance config, hardcode maximum throughput
 values

Simplify configuration by removing most performance-related env vars
and hardcoding values optimized for maximum resource usage:

- ThreadPoolExecutor: 500 workers (vs default 32)
- aiohttp connections: unlimited (limit=0)
- curl_cffi pool: 10000 max_clients
- Image downloads: no concurrency limit (removed semaphore)

Keep only 3 user-configurable limits via env vars:
- MAX_USER_QUEUE_SIZE (default 0 = no limit)
- STREAMING_DURATION_THRESHOLD (default 300s)
- MAX_VIDEO_DURATION (default 0 = no limit)
---
 .env.example | 56 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/.env.example b/.env.example
index da4dce6..f56eb79 100644
--- a/.env.example
+++ b/.env.example
@@ -1,17 +1,10 @@
-TG_SERVER=http://telegram-bot-api:8081
-TELEGRAM_API_ID=1234567
-TELEGRAM_API_HASH=abc123
-
-DB_URL=postgresql://postgres:postgres@db/ttbot-db
 # tt-bot
 BOT_TOKEN=12345:abcde
 ADMIN_IDS=[1234567]
 # SECOND_IDS=[1234567]
 JOIN_LOGS=-1234567
 STORAGE_CHANNEL_ID=12345
-# API settings
-BOTSTAT=abcdefg12345
-MONETAG_URL=https://example.com/your-monetag-link/
+
 # stats-bot
 STATS_BOT_TOKEN=12345:abcde
 STATS_IDS=[-1234567]
@@ -19,32 +12,53 @@ STATS_CHAT=-1234567
 STATS_MESSAGE_ID=23
 DAILY_STATS_MESSAGE_ID=24
 
+# API settings
+BOTSTAT=abcdefg12345
+MONETAG_URL=https://example.com/your-monetag-link/
+
 # Logging settings (optional)
 # LOG_LEVEL=INFO  # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL
 
 # yt-dlp settings (optional)
-YTDLP_COOKIES=cookies.txt
+# YTDLP_COOKIES=cookies.txt
 
 # Proxy settings (load balancing with multiple proxies)
 # Path to file with proxy list (one proxy URL per line)
-PROXY_FILE=proxies.txt
+# PROXY_FILE=proxies.txt
 # Use proxy only for TikTok API requests, not for media downloads
 # PROXY_DATA_ONLY=false
 # Include host machine's direct IP in round-robin rotation
 # PROXY_INCLUDE_HOST=false
 
+# Performance settings (for high-throughput scenarios)
+# ThreadPoolExecutor workers for sync yt-dlp extraction (default: 128)
+# THREAD_POOL_SIZE=128
+# Total aiohttp connection pool size for URL resolution (default: 200)
+# AIOHTTP_POOL_SIZE=200
+# Per-host connection limit (default: 50)
+# AIOHTTP_LIMIT_PER_HOST=50
+# Max parallel image downloads per slideshow (default: 20)
+# MAX_CONCURRENT_IMAGES=20
+# curl_cffi connection pool size for media downloads (default: 200)
+# CURL_POOL_SIZE=200
+# Use streaming for videos longer than this (seconds, default: 300 = 5 min)
+# STREAMING_DURATION_THRESHOLD=300
+# Maximum video duration in seconds (default: 1800 = 30 min, 0 = no limit)
+# MAX_VIDEO_DURATION=1800
+
+# Queue settings (optional, defaults shown)
+# MAX_USER_QUEUE_SIZE=3
+
 # Retry settings - 3-part retry strategy with proxy rotation
 # Part 1: URL resolution retries (short URLs to full URLs)
-URL_RESOLVE_MAX_RETRIES=3
+# URL_RESOLVE_MAX_RETRIES=3
 # Part 2: Video info extraction retries (metadata)
-VIDEO_INFO_MAX_RETRIES=3
+# VIDEO_INFO_MAX_RETRIES=3
 # Part 3: Download retries (video/images/audio)
-DOWNLOAD_MAX_RETRIES=3
-
-# Limits (optional, 0 = no limit)
-# Max concurrent videos per user in queue
-MAX_USER_QUEUE_SIZE=0
-# Use streaming for videos longer than this (seconds, 0 = never stream)
-STREAMING_DURATION_THRESHOLD=300
-# Maximum video duration in seconds (0 = no limit)
-MAX_VIDEO_DURATION=0
+# DOWNLOAD_MAX_RETRIES=3
+
+# Telegram Bot API
+TG_SERVER=http://telegram-bot-api:8081
+
+# db
+DB_URL=postgresql://postgres:postgres@db/ttbot-db

From 7295e49416700e1d0134e6e86d0a025cdcac24c2 Mon Sep 17 00:00:00 2001
From: Kyryl Andreiev <karilaa@karilaa.dev>
Date: Thu, 15 Jan 2026 00:49:11 -0800
Subject: [PATCH 2/9] Change positions of values in .env.example

---
 .env.example | 53 ++++++++++++++++++----------------------------------
 1 file changed, 18 insertions(+), 35 deletions(-)

diff --git a/.env.example b/.env.example
index f56eb79..2ba7825 100644
--- a/.env.example
+++ b/.env.example
@@ -1,10 +1,14 @@
+TG_SERVER=http://telegram-bot-api:8081
+DB_URL=postgresql://postgres:postgres@db/ttbot-db
 # tt-bot
 BOT_TOKEN=12345:abcde
 ADMIN_IDS=[1234567]
 # SECOND_IDS=[1234567]
 JOIN_LOGS=-1234567
 STORAGE_CHANNEL_ID=12345
-
+# API settings
+BOTSTAT=abcdefg12345
+MONETAG_URL=https://example.com/your-monetag-link/
 # stats-bot
 STATS_BOT_TOKEN=12345:abcde
 STATS_IDS=[-1234567]
@@ -12,53 +16,32 @@ STATS_CHAT=-1234567
 STATS_MESSAGE_ID=23
 DAILY_STATS_MESSAGE_ID=24
 
-# API settings
-BOTSTAT=abcdefg12345
-MONETAG_URL=https://example.com/your-monetag-link/
-
 # Logging settings (optional)
 # LOG_LEVEL=INFO  # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL
 
 # yt-dlp settings (optional)
-# YTDLP_COOKIES=cookies.txt
+YTDLP_COOKIES=cookies.txt
 
 # Proxy settings (load balancing with multiple proxies)
 # Path to file with proxy list (one proxy URL per line)
-# PROXY_FILE=proxies.txt
+PROXY_FILE=proxies.txt
 # Use proxy only for TikTok API requests, not for media downloads
 # PROXY_DATA_ONLY=false
 # Include host machine's direct IP in round-robin rotation
 # PROXY_INCLUDE_HOST=false
 
-# Performance settings (for high-throughput scenarios)
-# ThreadPoolExecutor workers for sync yt-dlp extraction (default: 128)
-# THREAD_POOL_SIZE=128
-# Total aiohttp connection pool size for URL resolution (default: 200)
-# AIOHTTP_POOL_SIZE=200
-# Per-host connection limit (default: 50)
-# AIOHTTP_LIMIT_PER_HOST=50
-# Max parallel image downloads per slideshow (default: 20)
-# MAX_CONCURRENT_IMAGES=20
-# curl_cffi connection pool size for media downloads (default: 200)
-# CURL_POOL_SIZE=200
-# Use streaming for videos longer than this (seconds, default: 300 = 5 min)
-# STREAMING_DURATION_THRESHOLD=300
-# Maximum video duration in seconds (default: 1800 = 30 min, 0 = no limit)
-# MAX_VIDEO_DURATION=1800
-
-# Queue settings (optional, defaults shown)
-# MAX_USER_QUEUE_SIZE=3
-
 # Retry settings - 3-part retry strategy with proxy rotation
 # Part 1: URL resolution retries (short URLs to full URLs)
-# URL_RESOLVE_MAX_RETRIES=3
+URL_RESOLVE_MAX_RETRIES=3
 # Part 2: Video info extraction retries (metadata)
-# VIDEO_INFO_MAX_RETRIES=3
+VIDEO_INFO_MAX_RETRIES=3
 # Part 3: Download retries (video/images/audio)
-# DOWNLOAD_MAX_RETRIES=3
-
-# Telegram Bot API
-TG_SERVER=http://telegram-bot-api:8081
-
-# db
-DB_URL=postgresql://postgres:postgres@db/ttbot-db
+DOWNLOAD_MAX_RETRIES=3
+
+# Limits (optional, 0 = no limit)
+# Max concurrent videos per user in queue
+MAX_USER_QUEUE_SIZE=0
+# Use streaming for videos longer than this (seconds, 0 = never stream)
+STREAMING_DURATION_THRESHOLD=300
+# Maximum video duration in seconds (0 = no limit)
+MAX_VIDEO_DURATION=0

From 8d109a8f6c3c58b6bdc0722858f586846a91ef5e Mon Sep 17 00:00:00 2001
From: Kyryl Andreiev <karilaa@karilaa.dev>
Date: Thu, 15 Jan 2026 00:53:16 -0800
Subject: [PATCH 3/9] Add Telegram API credentials to .env.example

---
 .env.example | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.env.example b/.env.example
index 2ba7825..da4dce6 100644
--- a/.env.example
+++ b/.env.example
@@ -1,4 +1,7 @@
 TG_SERVER=http://telegram-bot-api:8081
+TELEGRAM_API_ID=1234567
+TELEGRAM_API_HASH=abc123
+
 DB_URL=postgresql://postgres:postgres@db/ttbot-db
 # tt-bot
 BOT_TOKEN=12345:abcde

From acff6be9cc76c9a79f9f80ce12f1718c3537b0e8 Mon Sep 17 00:00:00 2001
From: Kyryl Andreiev <karilaa@karilaa.dev>
Date: Thu, 15 Jan 2026 21:10:54 -0800
Subject: [PATCH 4/9] Update CODEBASE_MAP.md

---
 docs/CODEBASE_MAP.md | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/docs/CODEBASE_MAP.md b/docs/CODEBASE_MAP.md
index 720af88..4e769f6 100644
--- a/docs/CODEBASE_MAP.md
+++ b/docs/CODEBASE_MAP.md
@@ -1,12 +1,12 @@
 ---
-last_mapped: 2026-01-14T22:45:00Z
+last_mapped: 2026-01-15T12:00:00Z
 total_files: 61
-total_tokens: 71358
+total_tokens: 69443
 ---
 
 # Codebase Map
 
-> Auto-generated by Cartographer. Last mapped: 2026-01-14
+> Auto-generated by Cartographer. Last mapped: 2026-01-15
 
 ## System Overview
 
@@ -168,11 +168,13 @@ tt-bot/
 
 | File | Purpose | Tokens |
 |------|---------|--------|
-| client.py | Main TikTokClient + ProxySession + 3-part retry | 18,676 |
+| client.py | Main TikTokClient + ProxySession + 3-part retry | 16,499 |
 | proxy_manager.py | Thread-safe round-robin proxy rotation | 1,303 |
-| models.py | VideoInfo, MusicInfo dataclasses | 1,091 |
+| models.py | VideoInfo, MusicInfo dataclasses | 1,079 |
 | exceptions.py | Exception hierarchy (9 error types) | 233 |
 
+**Note:** `VideoInfo.author` field was removed (unused in codebase).
+
 **Key Classes:**
 - `TikTokClient`: Main extraction client with integrated retry
 - `ProxySession`: Manages proxy state per request flow (sticky until retry)
@@ -263,8 +265,8 @@ tt-bot/
 
 | File | Purpose | Tokens |
 |------|---------|--------|
-| video_types.py | Video/image sending, slideshow retry, HEIC conversion | 6,374 |
-| queue_manager.py | Per-user concurrency limits | 1,021 |
+| video_types.py | Video/image sending, slideshow retry, HEIC conversion | 6,322 |
+| queue_manager.py | Per-user concurrency limits | 1,029 |
 | utils.py | Helpers (lang resolution, user registration) | 692 |
 
 **Key Functions:**
@@ -273,6 +275,10 @@ tt-bot/
 - `send_music_result()`: Send audio with cover
 - `QueueManager.info_queue()`: Acquire/release queue slot
 
+**Note:** Thumbnail download thresholds:
+- Inline messages: >30s (lowered from 60s)
+- Regular messages: >60s
+
 ---
 
 ### Stats Module (`stats/`)
@@ -516,6 +522,8 @@ sequenceDiagram
 | `BOT_TOKEN` | Main bot token |
 | `DB_URL` | PostgreSQL connection string |
 | `TG_SERVER` | Telegram API server URL |
+| `TELEGRAM_API_ID` | Telegram API ID (for custom Bot API server) |
+| `TELEGRAM_API_HASH` | Telegram API hash (for custom Bot API server) |
 
 ### Retry Configuration (NEW)
 | Variable | Default | Description |
@@ -527,10 +535,11 @@ sequenceDiagram
 ### Performance
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `THREAD_POOL_SIZE` | 128 | ThreadPoolExecutor workers |
-| `MAX_USER_QUEUE_SIZE` | 3 | Max concurrent per user |
-| `MAX_CONCURRENT_IMAGES` | 20 | Max parallel image downloads |
-| `MAX_VIDEO_DURATION` | 1800 | Max video duration (seconds, 0=unlimited) |
+| `MAX_USER_QUEUE_SIZE` | 0 | Max concurrent per user (0=unlimited) |
+| `MAX_VIDEO_DURATION` | 0 | Max video duration (seconds, 0=unlimited) |
+| `STREAMING_DURATION_THRESHOLD` | 300 | Stream videos longer than this (seconds) |
 | `LOG_LEVEL` | INFO | Logging level |
 
+**Note:** Thread pool (500 workers) and curl_cffi connections (10,000) are hardcoded for maximum throughput.
+
 See `.env.example` for complete list.

From 8745d5a8c071d0ee40357e80add728d86a0df366 Mon Sep 17 00:00:00 2001
From: Kyryl Andreiev <karilaa@karilaa.dev>
Date: Thu, 15 Jan 2026 21:30:22 -0800
Subject: [PATCH 5/9] Fix TikTok extraction with proxies by using direct
 connection for metadata

TikTok's browser impersonation (impersonate=True) doesn't work through HTTP
proxies, causing extraction to fail with "Unable to extract webpage video data".

Changed approach:
- Use direct connection (no proxy) for video info extraction with impersonate
- Use proxy for media downloads to hide server IP

This fixes the issue where all proxy attempts would fail due to TikTok's
JavaScript challenge blocking non-browser requests through proxies.
---
 tiktok_api/client.py | 99 +++++++++++---------------------------------
 1 file changed, 25 insertions(+), 74 deletions(-)

diff --git a/tiktok_api/client.py b/tiktok_api/client.py
index c36aab5..8ba7359 100644
--- a/tiktok_api/client.py
+++ b/tiktok_api/client.py
@@ -848,76 +848,25 @@ def _extract_with_context_sync(
             try:
                 # Use yt-dlp's internal method to get raw webpage data
                 # This also sets up all necessary cookies
-                # NOTE: When using a proxy, yt-dlp's impersonate=True feature
-                # doesn't work correctly. We need to download without impersonate.
+                # NOTE: TikTok's impersonate feature doesn't work through HTTP proxies.
+                # Always use direct connection for extraction, proxy is used for downloads.
+                saved_proxy = None  # Will store proxy for download context
                 if self.proxy_manager and self.proxy_manager.has_proxies():
-                    # Download webpage without impersonate to avoid proxy issues
-                    res = ie._download_webpage_handle(
-                        normalized_url, video_id, fatal=False, impersonate=False
-                    )
-                    if res is False:
-                        raise TikTokExtractionError(
-                            f"Failed to download webpage for video {video_id}"
-                        )
-
-                    webpage, urlh = res
-
-                    # Check for login redirect
-                    import urllib.parse
-
-                    if urllib.parse.urlparse(urlh.url).path == "/login":
-                        raise TikTokExtractionError(
-                            "TikTok is requiring login for access to this content"
-                        )
-
-                    # Extract data manually using yt-dlp's helper methods
-                    video_data = None
-                    status = -1
-
-                    # Try universal data first
-                    if universal_data := ie._get_universal_data(webpage, video_id):
-                        from yt_dlp.utils import traverse_obj
-
-                        status = (
-                            traverse_obj(
-                                universal_data,
-                                ("webapp.video-detail", "statusCode", {int}),
-                            )
-                            or 0
-                        )
-                        video_data = traverse_obj(
-                            universal_data,
-                            ("webapp.video-detail", "itemInfo", "itemStruct", {dict}),
-                        )
-
-                    # Try sigi state data
-                    elif sigi_data := ie._get_sigi_state(webpage, video_id):
-                        from yt_dlp.utils import traverse_obj
-
-                        status = (
-                            traverse_obj(sigi_data, ("VideoPage", "statusCode", {int}))
-                            or 0
-                        )
-                        video_data = traverse_obj(
-                            sigi_data, ("ItemModule", video_id, {dict})
-                        )
-
-                    # Try next.js data
-                    elif next_data := ie._search_nextjs_data(
-                        webpage, video_id, default={}
-                    ):
-                        from yt_dlp.utils import traverse_obj
+                    # Download webpage without proxy but with impersonate
+                    # Save current proxy setting and temporarily disable it
+                    saved_proxy = ydl_opts.get("proxy")
+                    if "proxy" in ydl_opts:
+                        del ydl_opts["proxy"]
+                    # Recreate YDL without proxy for extraction
+                    ydl.close()
+                    ydl = yt_dlp.YoutubeDL(ydl_opts)
+                    ie = ydl.get_info_extractor("TikTok")
+                    ie.set_downloader(ydl)
 
-                        status = (
-                            traverse_obj(
-                                next_data, ("props", "pageProps", "statusCode", {int})
-                            )
-                            or 0
-                        )
-                        video_data = traverse_obj(
-                            next_data,
-                            ("props", "pageProps", "itemInfo", "itemStruct", {dict}),
-                        )
+                    # Use standard extraction with impersonate (no proxy)
+                    video_data, status = ie._extract_web_data_and_status(
+                        normalized_url, video_id
+                    )
 
                     # Check TikTok status codes for errors
                     # 10204 = Video not found / deleted
@@ -929,11 +878,6 @@ def _extract_with_context_sync(
                         return None, "private", None
                     elif status == 10216:
                         return None, "deleted", None  # Treat under review as deleted
-
-                    if not video_data:
-                        raise TikTokExtractionError(
-                            f"Unable to extract webpage video data (status: {status})"
-                        )
                 else:
                     # No proxy, use the standard method with impersonate
                     video_data, status = ie._extract_web_data_and_status(
@@ -958,11 +902,18 @@ def _extract_with_context_sync(
                 ) from e
 
             # Create download context with the live instances
+            # For proxy path, use the saved_proxy (extraction was without proxy, downloads use proxy)
+            # For non-proxy path, use request_proxy as before
+            context_proxy = (
+                saved_proxy
+                if self.proxy_manager and self.proxy_manager.has_proxies()
+                else request_proxy
+            )
             download_context = {
                 "ydl": ydl,
                 "ie": ie,
                 "referer_url": url,
-                "proxy": request_proxy,  # Store proxy for per-request assignment
+                "proxy": context_proxy,  # Store proxy for per-request assignment
             }
 
             # Success - transfer ownership of ydl to caller via download_context

From 248c0501626cf520f70a79dbcf32834a4187edaa Mon Sep 17 00:00:00 2001
From: Kyryl Andreiev <karilaa@karilaa.dev>
Date: Thu, 15 Jan 2026 21:51:22 -0800
Subject: [PATCH 6/9] Improve error handling in ydl recreation for proxy
 extraction

Create the new YoutubeDL instance before closing the old one to ensure
we have a valid ydl even if initialization fails.
---
 tiktok_api/client.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tiktok_api/client.py b/tiktok_api/client.py
index 8ba7359..f00d911 100644
--- a/tiktok_api/client.py
+++ b/tiktok_api/client.py
@@ -858,8 +858,11 @@ def _extract_with_context_sync(
                     if "proxy" in ydl_opts:
                         del ydl_opts["proxy"]
                     # Recreate YDL without proxy for extraction
-                    ydl.close()
+                    # Create new instance first to ensure we have a valid ydl
+                    # even if something goes wrong during recreation
+                    old_ydl = ydl
                     ydl = yt_dlp.YoutubeDL(ydl_opts)
+                    old_ydl.close()  # Close old instance after new one is ready
                     ie = ydl.get_info_extractor("TikTok")
                     ie.set_downloader(ydl)
 

From 14c1abe74bdebc14145996fa9f738e43f908a82a Mon Sep 17 00:00:00 2001
From: Kyryl Andreiev <karilaa@karilaa.dev>
Date: Thu, 15 Jan 2026 21:56:26 -0800
Subject: [PATCH 7/9] Add video_data validation after TikTok extraction

Return an extraction error if video_data is missing or empty despite a
non-error status code, preventing downstream issues from invalid data.
---
 tiktok_api/client.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tiktok_api/client.py b/tiktok_api/client.py
index f00d911..ecb7f76 100644
--- a/tiktok_api/client.py
+++ b/tiktok_api/client.py
@@ -881,6 +881,11 @@ def _extract_with_context_sync(
                         return None, "private", None
                     elif status == 10216:
                         return None, "deleted", None  # Treat under review as deleted
+
+                    # Validate that we got video data
+                    if not video_data:
+                        logger.error(f"No video data returned for {video_id} (status={status})")
+                        return None, "extraction", None
                 else:
                     # No proxy, use the standard method with impersonate
                     video_data, status = ie._extract_web_data_and_status(
@@ -894,6 +899,11 @@ def _extract_with_context_sync(
                         return None, "private", None
                     elif status == 10216:
                         return None, "deleted", None  # Treat under review as deleted
+
+                    # Validate that we got video data
+                    if not video_data:
+                        logger.error(f"No video data returned for {video_id} (status={status})")
+                        return None, "extraction", None
             except AttributeError as e:
                 logger.error(
                     f"Failed to call yt-dlp internal method: {e}. "

From 74283324b15c2ab4be33bec3e3746863b7cb89bd Mon Sep 17 00:00:00 2001
From: Kyryl Andreiev <karilaa@karilaa.dev>
Date: Thu, 15 Jan 2026 22:26:05 -0800
Subject: [PATCH 8/9] Always use proxy for TikTok extraction

Remove logic that stripped proxy from ydl_opts during extraction.
Datacenter IPs are typically blocked by TikTok, so extraction must
use the configured proxy to work on servers.
---
 tiktok_api/client.py | 75 ++++++++++++--------------------------------
 1 file changed, 20 insertions(+), 55 deletions(-)

diff --git a/tiktok_api/client.py b/tiktok_api/client.py
index ecb7f76..8186ff2 100644
--- a/tiktok_api/client.py
+++ b/tiktok_api/client.py
@@ -848,62 +848,27 @@ def _extract_with_context_sync(
             try:
                 # Use yt-dlp's internal method to get raw webpage data
                 # This also sets up all necessary cookies
-                # NOTE: TikTok's impersonate feature doesn't work through HTTP proxies.
-                # Always use direct connection for extraction, proxy is used for downloads.
-                saved_proxy = None  # Will store proxy for download context
-                if self.proxy_manager and self.proxy_manager.has_proxies():
-                    # Download webpage without proxy but with impersonate
-                    # Save current proxy setting and temporarily disable it
-                    saved_proxy = ydl_opts.get("proxy")
-                    if "proxy" in ydl_opts:
-                        del ydl_opts["proxy"]
-                    # Recreate YDL without proxy for extraction
-                    # Create new instance first to ensure we have a valid ydl
-                    # even if something goes wrong during recreation
-                    old_ydl = ydl
-                    ydl = yt_dlp.YoutubeDL(ydl_opts)
-                    old_ydl.close()  # Close old instance after new one is ready
-                    ie = ydl.get_info_extractor("TikTok")
-                    ie.set_downloader(ydl)
-
-                    # Use standard extraction with impersonate (no proxy)
-                    video_data, status = ie._extract_web_data_and_status(
-                        normalized_url, video_id
-                    )
-
-                    # Check TikTok status codes for errors
-                    # 10204 = Video not found / deleted
-                    # 10216 = Video under review
-                    # 10222 = Private video
-                    if status == 10204:
-                        return None, "deleted", None
-                    elif status == 10222:
-                        return None, "private", None
-                    elif status == 10216:
-                        return None, "deleted", None  # Treat under review as deleted
-
-                    # Validate that we got video data
-                    if not video_data:
-                        logger.error(f"No video data returned for {video_id} (status={status})")
-                        return None, "extraction", None
-                else:
-                    # No proxy, use the standard method with impersonate
-                    video_data, status = ie._extract_web_data_and_status(
-                        normalized_url, video_id
-                    )
+                # NOTE: Always use proxy for extraction if configured, as datacenter
+                # IPs are typically blocked by TikTok.
+                video_data, status = ie._extract_web_data_and_status(
+                    normalized_url, video_id
+                )
 
-                    # Check TikTok status codes for errors (same as proxy path)
-                    if status == 10204:
-                        return None, "deleted", None
-                    elif status == 10222:
-                        return None, "private", None
-                    elif status == 10216:
-                        return None, "deleted", None  # Treat under review as deleted
-
-                    # Validate that we got video data
-                    if not video_data:
-                        logger.error(f"No video data returned for {video_id} (status={status})")
-                        return None, "extraction", None
+                # Check TikTok status codes for errors
+                # 10204 = Video not found / deleted
+                # 10216 = Video under review
+                # 10222 = Private video
+                if status == 10204:
+                    return None, "deleted", None
+                elif status == 10222:
+                    return None, "private", None
+                elif status == 10216:
+                    return None, "deleted", None  # Treat under review as deleted
+
+                # Validate that we got video data
+                if not video_data:
+                    logger.error(f"No video data returned for {video_id} (status={status})")
+                    return None, "extraction", None
             except AttributeError as e:
                 logger.error(
                     f"Failed to call yt-dlp internal method: {e}. "

From bbc59c1e357c84643cc3b80ed756196df97737d5 Mon Sep 17 00:00:00 2001
From: Kyryl Andreiev <karilaa@karilaa.dev>
Date: Thu, 15 Jan 2026 23:28:09 -0800
Subject: [PATCH 9/9] Fix TikTok WAF blocking by using Chrome 120 impersonation

TikTok's WAF blocks newer Chrome versions (136+) when used with proxies
due to TLS fingerprint / User-Agent mismatches. This commit:

- Uses a fixed Chrome 120 impersonation target instead of auto-selecting
  the newest one
- Sets a matching User-Agent header for yt-dlp extraction and media downloads
- Adds a per-proxy session pool to avoid proxy contamination between requests
- Bakes the proxy into curl_cffi sessions at construction time
---
 tiktok_api/client.py | 171 ++++++++++++++++++++-----------------------
 1 file changed, 78 insertions(+), 93 deletions(-)

diff --git a/tiktok_api/client.py b/tiktok_api/client.py
index 8186ff2..8d292d3 100644
--- a/tiktok_api/client.py
+++ b/tiktok_api/client.py
@@ -29,10 +29,22 @@
 # This ensures impersonation targets update automatically with yt-dlp
 try:
     from yt_dlp.networking._curlcffi import BROWSER_TARGETS, _TARGETS_COMPAT_LOOKUP
+    from yt_dlp.networking.impersonate import ImpersonateTarget
 except ImportError:
     # Fallback if yt-dlp structure changes or curl_cffi not available during import
     BROWSER_TARGETS = {}
     _TARGETS_COMPAT_LOOKUP = {}
+    ImpersonateTarget = None
+
+# Note: yt-dlp's CurlCFFIRH already handles proxies correctly per-request via
+# session.curl.setopt(CurlOpt.PROXY, proxy) in _send(). No monkey-patching needed.
+# The session caching by cookiejar is fine because proxy is set on each request.
+
+# TikTok WAF blocks newer Chrome versions (136+) when used with proxies due to
+# TLS fingerprint / User-Agent mismatches. Use Chrome 120 which is known to work.
+# The User-Agent must match the impersonation target to avoid WAF detection.
+TIKTOK_IMPERSONATE_TARGET = "chrome120"
+TIKTOK_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
 
 from .exceptions import (
     TikTokDeletedError,
@@ -179,112 +191,76 @@ class TikTokClient:
     _aiohttp_connector: Optional[TCPConnector] = None
     _connector_lock = threading.Lock()
 
-    # curl_cffi session for browser-impersonated media downloads
-    _curl_session: Optional[CurlAsyncSession] = None
+    # curl_cffi session pool for browser-impersonated media downloads
+    # Keyed by proxy URL (None for direct connection) to avoid proxy contamination
+    _curl_session_pool: dict[Optional[str], CurlAsyncSession] = {}
     _curl_session_lock = threading.Lock()
     _impersonate_target: Optional[str] = None
 
     @classmethod
     def _get_impersonate_target(cls) -> str:
-        """Get the best impersonation target from yt-dlp's BROWSER_TARGETS.
-
-        Uses the same priority as yt-dlp:
-        1. Prioritize desktop over mobile (non-ios, non-android)
-        2. Prioritize Chrome > Safari > Firefox > Edge > Tor
-        3. Prioritize newest version
+        """Get the impersonation target for TikTok requests.
 
-        This ensures the impersonation target updates automatically when you
-        update yt-dlp, without any hardcoded values.
+        TikTok's WAF blocks newer Chrome versions (136+) when used with proxies
+        due to TLS fingerprint / User-Agent mismatches. Chrome 120 is known to
+        work reliably with proxies.
 
         Returns:
-            curl_cffi-compatible impersonate string (e.g., "chrome136")
+            curl_cffi-compatible impersonate string (e.g., "chrome120")
         """
-        import itertools
-
-        # Get curl_cffi version as tuple for comparison
-        try:
-            curl_cffi_version = tuple(
-                int(x) for x in curl_cffi.__version__.split(".")[:2]
-            )
-        except (ValueError, AttributeError):
-            curl_cffi_version = (0, 9)  # Minimum supported version
-
-        # Collect all available targets for our curl_cffi version
-        available_targets: dict[str, Any] = {}
-        for version, targets in BROWSER_TARGETS.items():
-            if curl_cffi_version >= version:
-                available_targets.update(targets)
-
-        if not available_targets:
-            # Fallback to a common target if BROWSER_TARGETS is empty
-            logger.warning(
-                "No BROWSER_TARGETS available from yt-dlp, using 'chrome' fallback"
-            )
-            return "chrome"
-
-        # Sort by yt-dlp's priority (same logic as _curlcffi.py)
-        # This ensures we pick the same target yt-dlp would use
-        sorted_targets = sorted(
-            available_targets.items(),
-            key=lambda x: (
-                # deprioritize mobile targets since they give very different behavior
-                x[1].os not in ("ios", "android"),
-                # prioritize tor < edge < firefox < safari < chrome
-                ("tor", "edge", "firefox", "safari", "chrome").index(x[1].client)
-                if x[1].client in ("tor", "edge", "firefox", "safari", "chrome")
-                else -1,
-                # prioritize newest version
-                float(x[1].version) if x[1].version else 0,
-                # group by os name
-                x[1].os or "",
-            ),
-            reverse=True,
-        )
-
-        # Get the best target name
-        best_name = sorted_targets[0][0]
-
-        # Apply compatibility lookup for older curl_cffi versions
-        if curl_cffi_version < (0, 11):
-            best_name = _TARGETS_COMPAT_LOOKUP.get(best_name, best_name)
-
+        # Use the fixed Chrome 120 target that works with TikTok's WAF.
+        # TIKTOK_USER_AGENT must stay in sync with TIKTOK_IMPERSONATE_TARGET.
         logger.debug(
-            f"Selected impersonation target: {best_name} "
+            f"Using impersonation target: {TIKTOK_IMPERSONATE_TARGET} "
             f"(curl_cffi {curl_cffi.__version__})"
         )
-        return best_name
+        return TIKTOK_IMPERSONATE_TARGET
 
     @classmethod
-    def _get_curl_session(cls) -> CurlAsyncSession:
-        """Get or create shared curl_cffi AsyncSession with browser impersonation.
+    def _get_curl_session(cls, proxy: Optional[str] = None) -> CurlAsyncSession:
+        """Get or create curl_cffi AsyncSession for a specific proxy.
+
+        Sessions are pooled by proxy URL to avoid proxy contamination.
+        curl_cffi bakes the proxy into the session at creation time, so we need
+        separate sessions for different proxies.
 
         The session uses yt-dlp's BROWSER_TARGETS to select the best impersonation
         target, ensuring TLS fingerprint matches a real browser.
+
+        Args:
+            proxy: Proxy URL string, or None for direct connection.
+
+        Returns:
+            CurlAsyncSession configured with the specified proxy.
         """
         with cls._curl_session_lock:
-            # Check if session needs to be created
-            # Note: CurlAsyncSession doesn't have is_closed, we track via _curl_session being None
-            if cls._curl_session is None:
-                pool_size = 10000  # High value for maximum throughput
-                cls._impersonate_target = cls._get_impersonate_target()
-                cls._curl_session = CurlAsyncSession(
+            # Check if session exists for this proxy
+            if proxy not in cls._curl_session_pool:
+                pool_size = 1000  # Per-proxy pool size
+                if cls._impersonate_target is None:
+                    cls._impersonate_target = cls._get_impersonate_target()
+
+                # Create session with proxy baked in at construction time
+                cls._curl_session_pool[proxy] = CurlAsyncSession(
                     impersonate=cls._impersonate_target,
+                    proxy=proxy,  # curl_cffi converts this to {"all": proxy}
                     max_clients=pool_size,
                 )
                 logger.info(
-                    f"Created curl_cffi session with impersonate={cls._impersonate_target}, "
-                    f"max_clients={pool_size}"
+                    f"Created curl_cffi session for proxy={_strip_proxy_auth(proxy)}, "
+                    f"impersonate={cls._impersonate_target}, max_clients={pool_size}"
                 )
-            return cls._curl_session
+            return cls._curl_session_pool[proxy]
 
     @classmethod
     async def close_curl_session(cls) -> None:
-        """Close shared curl_cffi session. Call on application shutdown."""
+        """Close all curl_cffi sessions in the pool. Call on application shutdown."""
         with cls._curl_session_lock:
-            session = cls._curl_session
-            cls._curl_session = None
+            sessions = list(cls._curl_session_pool.values())
+            cls._curl_session_pool.clear()
             cls._impersonate_target = None
-        if session is not None:
+
+        for session in sessions:
             try:
                 await session.close()
             except Exception as e:
@@ -382,10 +358,11 @@ def _get_proxy_info(self) -> str:
         return "None"
 
     def _get_bypass_headers(self, referer_url: str) -> dict[str, str]:
-        """Get bypass headers dynamically from yt-dlp.
+        """Get bypass headers for TikTok media downloads.
 
-        Uses yt-dlp's standard headers which are updated with each yt-dlp release.
-        We add Origin and Referer for CORS compliance with TikTok CDN.
+        Starts from yt-dlp's standard headers, then overrides the User-Agent to
+        match our curl_cffi impersonation target (Chrome 120); a mismatched
+        User-Agent risks TikTok WAF detection.
 
         Args:
             referer_url: The referer URL to set in headers
@@ -394,6 +371,8 @@ def _get_bypass_headers(self, referer_url: str) -> dict[str, str]:
             Dict of headers for media download
         """
         headers = dict(YTDLP_STD_HEADERS)  # Copy to avoid mutation
+        # Override User-Agent to match our impersonation target
+        headers["User-Agent"] = TIKTOK_USER_AGENT
         headers["Referer"] = referer_url
         headers["Origin"] = "https://www.tiktok.com"
         headers["Accept"] = "*/*"
@@ -487,17 +466,21 @@ async def _download_media_async(
         if not self.data_only_proxy:
             proxy = download_context.get("proxy")
 
-        session = self._get_curl_session()
+        # Get session with proxy baked in (avoids proxy contamination between requests)
+        session = self._get_curl_session(proxy=proxy)
 
         for attempt in range(1, max_retries + 1):
-            logger.debug(f"CDN download attempt {attempt}/{max_retries} for media URL")
+            logger.debug(
+                f"CDN download attempt {attempt}/{max_retries} for media URL "
+                f"via {_strip_proxy_auth(proxy)}"
+            )
             response = None
             try:
+                # Note: proxy is already configured in the session (baked in at creation)
                 response = await session.get(
                     media_url,
                     headers=headers,
                     cookies=cookies,
-                    proxy=proxy,
                     timeout=60,
                     allow_redirects=True,
                     stream=use_streaming,
@@ -770,6 +753,13 @@ def _get_ydl_opts(
             "no_warnings": True,
         }
 
+        # Set impersonation target and matching User-Agent to avoid TikTok WAF detection.
+        # TikTok blocks newer Chrome versions (136+) when used with proxies due to
+        # TLS fingerprint mismatches. Chrome 120 is known to work reliably.
+        if ImpersonateTarget is not None:
+            opts["impersonate"] = ImpersonateTarget("chrome", "120", "macos", None)
+        opts["http_headers"] = {"User-Agent": TIKTOK_USER_AGENT}
+
         # Use explicit proxy decision if it was provided (even if None = direct connection)
         if explicit_proxy is not ...:
             if explicit_proxy is not None:
@@ -880,18 +870,12 @@ def _extract_with_context_sync(
                 ) from e
 
             # Create download context with the live instances
-            # For proxy path, use the saved_proxy (extraction was without proxy, downloads use proxy)
-            # For non-proxy path, use request_proxy as before
-            context_proxy = (
-                saved_proxy
-                if self.proxy_manager and self.proxy_manager.has_proxies()
-                else request_proxy
-            )
+            # Use the same proxy for downloads that was used for extraction
             download_context = {
                 "ydl": ydl,
                 "ie": ie,
                 "referer_url": url,
-                "proxy": context_proxy,  # Store proxy for per-request assignment
+                "proxy": request_proxy if request_proxy is not ... else None,
             }
 
             # Success - transfer ownership of ydl to caller via download_context
@@ -1423,15 +1407,16 @@ async def detect_image_format(self, image_url: str, video_info: VideoInfo) -> st
         if not self.data_only_proxy:
             proxy = video_info._download_context.get("proxy")
 
-        session = self._get_curl_session()
+        # Get session with proxy baked in (avoids proxy contamination between requests)
+        session = self._get_curl_session(proxy=proxy)
 
         response = None
         try:
+            # Note: proxy is already configured in the session (baked in at creation)
             response = await session.get(
                 image_url,
                 headers=headers,
                 cookies=cookies,
-                proxy=proxy,
                 timeout=10,
                 allow_redirects=True,
             )