Skip to content
246 changes: 98 additions & 148 deletions tiktok_api/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,22 @@
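# NOTE(review): the TikTok impersonation target is now pinned via TIKTOK_IMPERSONATE_TARGET
# (defined below), so these imports no longer make the target track yt-dlp updates
# automatically - confirm whether BROWSER_TARGETS / _TARGETS_COMPAT_LOOKUP are still needed.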
try:
from yt_dlp.networking._curlcffi import BROWSER_TARGETS, _TARGETS_COMPAT_LOOKUP
from yt_dlp.networking.impersonate import ImpersonateTarget
except ImportError:
# Fallback if yt-dlp structure changes or curl_cffi not available during import
BROWSER_TARGETS = {}
_TARGETS_COMPAT_LOOKUP = {}
ImpersonateTarget = None

# Note: yt-dlp's CurlCFFIRH already handles proxies correctly per-request via
# session.curl.setopt(CurlOpt.PROXY, proxy) in _send(). No monkey-patching needed.
# The session caching by cookiejar is fine because proxy is set on each request.

# TikTok WAF blocks newer Chrome versions (136+) when used with proxies due to
# TLS fingerprint / User-Agent mismatches. Use Chrome 120 which is known to work.
# The User-Agent must match the impersonation target to avoid WAF detection.
TIKTOK_IMPERSONATE_TARGET = "chrome120"
TIKTOK_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

from .exceptions import (
TikTokDeletedError,
Expand Down Expand Up @@ -179,112 +191,76 @@ class TikTokClient:
_aiohttp_connector: Optional[TCPConnector] = None
_connector_lock = threading.Lock()

# curl_cffi session for browser-impersonated media downloads
_curl_session: Optional[CurlAsyncSession] = None
# curl_cffi session pool for browser-impersonated media downloads
# Keyed by proxy URL (None for direct connection) to avoid proxy contamination
_curl_session_pool: dict[Optional[str], CurlAsyncSession] = {}
_curl_session_lock = threading.Lock()
_impersonate_target: Optional[str] = None

@classmethod
def _get_impersonate_target(cls) -> str:
"""Get the best impersonation target from yt-dlp's BROWSER_TARGETS.
"""Get the impersonation target for TikTok requests.

Uses the same priority as yt-dlp:
1. Prioritize desktop over mobile (non-ios, non-android)
2. Prioritize Chrome > Safari > Firefox > Edge > Tor
3. Prioritize newest version

This ensures the impersonation target updates automatically when you
update yt-dlp, without any hardcoded values.
TikTok's WAF blocks newer Chrome versions (136+) when used with proxies
due to TLS fingerprint / User-Agent mismatches. Chrome 120 is known to
work reliably with proxies.

Returns:
curl_cffi-compatible impersonate string (e.g., "chrome136")
curl_cffi-compatible impersonate string (e.g., "chrome120")
"""
import itertools

# Get curl_cffi version as tuple for comparison
try:
curl_cffi_version = tuple(
int(x) for x in curl_cffi.__version__.split(".")[:2]
)
except (ValueError, AttributeError):
curl_cffi_version = (0, 9) # Minimum supported version

# Collect all available targets for our curl_cffi version
available_targets: dict[str, Any] = {}
for version, targets in BROWSER_TARGETS.items():
if curl_cffi_version >= version:
available_targets.update(targets)

if not available_targets:
# Fallback to a common target if BROWSER_TARGETS is empty
logger.warning(
"No BROWSER_TARGETS available from yt-dlp, using 'chrome' fallback"
)
return "chrome"

# Sort by yt-dlp's priority (same logic as _curlcffi.py)
# This ensures we pick the same target yt-dlp would use
sorted_targets = sorted(
available_targets.items(),
key=lambda x: (
# deprioritize mobile targets since they give very different behavior
x[1].os not in ("ios", "android"),
# prioritize tor < edge < firefox < safari < chrome
("tor", "edge", "firefox", "safari", "chrome").index(x[1].client)
if x[1].client in ("tor", "edge", "firefox", "safari", "chrome")
else -1,
# prioritize newest version
float(x[1].version) if x[1].version else 0,
# group by os name
x[1].os or "",
),
reverse=True,
)

# Get the best target name
best_name = sorted_targets[0][0]

# Apply compatibility lookup for older curl_cffi versions
if curl_cffi_version < (0, 11):
best_name = _TARGETS_COMPAT_LOOKUP.get(best_name, best_name)

# Return the pinned Chrome 120 target, which works with TikTok's WAF.
# TIKTOK_USER_AGENT must be kept in sync with TIKTOK_IMPERSONATE_TARGET.
logger.debug(
f"Selected impersonation target: {best_name} "
f"Using impersonation target: {TIKTOK_IMPERSONATE_TARGET} "
f"(curl_cffi {curl_cffi.__version__})"
)
return best_name
return TIKTOK_IMPERSONATE_TARGET

@classmethod
def _get_curl_session(cls) -> CurlAsyncSession:
"""Get or create shared curl_cffi AsyncSession with browser impersonation.
def _get_curl_session(cls, proxy: Optional[str] = None) -> CurlAsyncSession:
"""Get or create curl_cffi AsyncSession for a specific proxy.

Sessions are pooled by proxy URL to avoid proxy contamination.
curl_cffi bakes the proxy into the session at creation time, so we need
separate sessions for different proxies.

The session impersonates the browser target returned by
_get_impersonate_target(), so its TLS fingerprint matches a real browser.

Args:
proxy: Proxy URL string, or None for direct connection.

Returns:
CurlAsyncSession configured with the specified proxy.
"""
with cls._curl_session_lock:
# Check if session needs to be created
# Note: CurlAsyncSession doesn't have is_closed, we track via _curl_session being None
if cls._curl_session is None:
pool_size = 10000 # High value for maximum throughput
cls._impersonate_target = cls._get_impersonate_target()
cls._curl_session = CurlAsyncSession(
# Check if session exists for this proxy
if proxy not in cls._curl_session_pool:
pool_size = 1000 # Per-proxy pool size
if cls._impersonate_target is None:
cls._impersonate_target = cls._get_impersonate_target()

# Create session with proxy baked in at construction time
cls._curl_session_pool[proxy] = CurlAsyncSession(
impersonate=cls._impersonate_target,
proxy=proxy, # curl_cffi converts this to {"all": proxy}
max_clients=pool_size,
)
logger.info(
f"Created curl_cffi session with impersonate={cls._impersonate_target}, "
f"max_clients={pool_size}"
f"Created curl_cffi session for proxy={_strip_proxy_auth(proxy)}, "
f"impersonate={cls._impersonate_target}, max_clients={pool_size}"
)
return cls._curl_session
return cls._curl_session_pool[proxy]

@classmethod
async def close_curl_session(cls) -> None:
"""Close shared curl_cffi session. Call on application shutdown."""
"""Close all curl_cffi sessions in the pool. Call on application shutdown."""
with cls._curl_session_lock:
session = cls._curl_session
cls._curl_session = None
sessions = list(cls._curl_session_pool.values())
cls._curl_session_pool.clear()
cls._impersonate_target = None
if session is not None:

for session in sessions:
try:
await session.close()
except Exception as e:
Expand Down Expand Up @@ -382,10 +358,11 @@ def _get_proxy_info(self) -> str:
return "None"

def _get_bypass_headers(self, referer_url: str) -> dict[str, str]:
"""Get bypass headers dynamically from yt-dlp.
"""Get bypass headers for TikTok media downloads.

Uses yt-dlp's standard headers which are updated with each yt-dlp release.
We add Origin and Referer for CORS compliance with TikTok CDN.
Uses headers matching our impersonation target (Chrome 120) to avoid
TikTok WAF detection. The User-Agent must match the curl_cffi
impersonation target.

Args:
referer_url: The referer URL to set in headers
Expand All @@ -394,6 +371,8 @@ def _get_bypass_headers(self, referer_url: str) -> dict[str, str]:
Dict of headers for media download
"""
headers = dict(YTDLP_STD_HEADERS) # Copy to avoid mutation
# Override User-Agent to match our impersonation target
headers["User-Agent"] = TIKTOK_USER_AGENT
headers["Referer"] = referer_url
headers["Origin"] = "https://www.tiktok.com"
headers["Accept"] = "*/*"
Expand Down Expand Up @@ -487,17 +466,21 @@ async def _download_media_async(
if not self.data_only_proxy:
proxy = download_context.get("proxy")

session = self._get_curl_session()
# Get session with proxy baked in (avoids proxy contamination between requests)
session = self._get_curl_session(proxy=proxy)

for attempt in range(1, max_retries + 1):
logger.debug(f"CDN download attempt {attempt}/{max_retries} for media URL")
logger.debug(
f"CDN download attempt {attempt}/{max_retries} for media URL "
f"via {_strip_proxy_auth(proxy)}"
)
response = None
try:
# Note: proxy is already configured in the session (baked in at creation)
response = await session.get(
media_url,
headers=headers,
cookies=cookies,
proxy=proxy,
timeout=60,
allow_redirects=True,
stream=use_streaming,
Expand Down Expand Up @@ -770,6 +753,13 @@ def _get_ydl_opts(
"no_warnings": True,
}

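# Set impersonation target and matching User-Agent to avoid TikTok WAF detection.
# TikTok blocks newer Chrome versions (136+) when used with proxies due to
# TLS fingerprint / User-Agent mismatches. Chrome 120 is known to work reliably.
# The ImpersonateTarget below is spelled out separately from TIKTOK_IMPERSONATE_TARGET;
# keep it (and TIKTOK_USER_AGENT) in sync when changing the pinned browser version.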
if ImpersonateTarget is not None:
opts["impersonate"] = ImpersonateTarget("chrome", "120", "macos", None)
opts["http_headers"] = {"User-Agent": TIKTOK_USER_AGENT}

# Use explicit proxy decision if it was provided (even if None = direct connection)
if explicit_proxy is not ...:
if explicit_proxy is not None:
Expand Down Expand Up @@ -848,62 +838,27 @@ def _extract_with_context_sync(
try:
# Use yt-dlp's internal method to get raw webpage data
# This also sets up all necessary cookies
# NOTE: TikTok's impersonate feature doesn't work through HTTP proxies.
# Always use direct connection for extraction, proxy is used for downloads.
saved_proxy = None # Will store proxy for download context
if self.proxy_manager and self.proxy_manager.has_proxies():
# Download webpage without proxy but with impersonate
# Save current proxy setting and temporarily disable it
saved_proxy = ydl_opts.get("proxy")
if "proxy" in ydl_opts:
del ydl_opts["proxy"]
# Recreate YDL without proxy for extraction
# Create new instance first to ensure we have a valid ydl
# even if something goes wrong during recreation
old_ydl = ydl
ydl = yt_dlp.YoutubeDL(ydl_opts)
old_ydl.close() # Close old instance after new one is ready
ie = ydl.get_info_extractor("TikTok")
ie.set_downloader(ydl)

# Use standard extraction with impersonate (no proxy)
video_data, status = ie._extract_web_data_and_status(
normalized_url, video_id
)

# Check TikTok status codes for errors
# 10204 = Video not found / deleted
# 10216 = Video under review
# 10222 = Private video
if status == 10204:
return None, "deleted", None
elif status == 10222:
return None, "private", None
elif status == 10216:
return None, "deleted", None # Treat under review as deleted

# Validate that we got video data
if not video_data:
logger.error(f"No video data returned for {video_id} (status={status})")
return None, "extraction", None
else:
# No proxy, use the standard method with impersonate
video_data, status = ie._extract_web_data_and_status(
normalized_url, video_id
)
# NOTE: Always use proxy for extraction if configured, as datacenter
# IPs are typically blocked by TikTok.
video_data, status = ie._extract_web_data_and_status(
normalized_url, video_id
)

# Check TikTok status codes for errors (same as proxy path)
if status == 10204:
return None, "deleted", None
elif status == 10222:
return None, "private", None
elif status == 10216:
return None, "deleted", None # Treat under review as deleted

# Validate that we got video data
if not video_data:
logger.error(f"No video data returned for {video_id} (status={status})")
return None, "extraction", None
# Check TikTok status codes for errors
# 10204 = Video not found / deleted
# 10216 = Video under review
# 10222 = Private video
if status == 10204:
return None, "deleted", None
elif status == 10222:
return None, "private", None
elif status == 10216:
return None, "deleted", None # Treat under review as deleted

# Validate that we got video data
if not video_data:
logger.error(f"No video data returned for {video_id} (status={status})")
return None, "extraction", None
except AttributeError as e:
logger.error(
f"Failed to call yt-dlp internal method: {e}. "
Expand All @@ -915,18 +870,12 @@ def _extract_with_context_sync(
) from e

# Create download context with the live instances
# For proxy path, use the saved_proxy (extraction was without proxy, downloads use proxy)
# For non-proxy path, use request_proxy as before
context_proxy = (
saved_proxy
if self.proxy_manager and self.proxy_manager.has_proxies()
else request_proxy
)
# Use the same proxy for downloads that was used for extraction
download_context = {
"ydl": ydl,
"ie": ie,
"referer_url": url,
"proxy": context_proxy, # Store proxy for per-request assignment
"proxy": request_proxy if request_proxy is not ... else None,
}

# Success - transfer ownership of ydl to caller via download_context
Expand Down Expand Up @@ -1458,15 +1407,16 @@ async def detect_image_format(self, image_url: str, video_info: VideoInfo) -> st
if not self.data_only_proxy:
proxy = video_info._download_context.get("proxy")

session = self._get_curl_session()
# Get session with proxy baked in (avoids proxy contamination between requests)
session = self._get_curl_session(proxy=proxy)

response = None
try:
# Note: proxy is already configured in the session (baked in at creation)
response = await session.get(
image_url,
headers=headers,
cookies=cookies,
proxy=proxy,
timeout=10,
allow_redirects=True,
)
Expand Down