-
Notifications
You must be signed in to change notification settings - Fork 3
fix(discord): prevent rate limit retry storm #1586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,7 @@ import { type DiscordMessage, DiscordMessageSchema } from "./schemas"; | |
|
|
||
| const BASE_URL = "https://discord.com/api/v10"; | ||
| const RATE_LIMIT_TIMEOUT_MS = 30_000; | ||
| const MAX_RETRIES = 5; | ||
|
|
||
| const RateLimitResponseSchema = z.object({ | ||
| message: z.string(), | ||
|
|
@@ -23,6 +24,19 @@ const RateLimitResponseSchema = z.object({ | |
| code: z.number().optional(), | ||
| }); | ||
|
|
||
| // Per-endpoint rate limit tracking: endpoint → timestamp until which requests should wait | ||
| const rateLimitUntil = new Map<string, number>(); | ||
|
|
||
| /** @internal Exported for test cleanup only */ | ||
| export function _resetRateLimitState(): void { | ||
| rateLimitUntil.clear(); | ||
| } | ||
|
|
||
| /** @internal Exported for testing the gate timeout path */ | ||
| export function _setRateLimitGate(endpoint: string, until: number): void { | ||
| rateLimitUntil.set(endpoint, until); | ||
| } | ||
|
|
||
| async function call<T extends z.ZodType>( | ||
| method: string, | ||
| endpoint: string, | ||
|
|
@@ -38,8 +52,26 @@ async function call<T extends z.ZodType>( | |
| } | ||
|
|
||
| const startTime = performance.now(); | ||
| let retries = 0; | ||
|
|
||
| while (true) { | ||
| // Wait for any active rate limit gate on this endpoint | ||
| const waitUntil = rateLimitUntil.get(endpoint); | ||
| if (waitUntil != null) { | ||
| const waitMs = waitUntil - Date.now(); | ||
| if (waitMs > 0) { | ||
| const elapsedMs = performance.now() - startTime; | ||
| if (elapsedMs + waitMs > RATE_LIMIT_TIMEOUT_MS) { | ||
| log.error( | ||
| { endpoint, elapsedMs, waitMs, retries }, | ||
| "Discord rate limit exceeded max wait time", | ||
| ); | ||
| throw new Error(`Discord rate limit exceeded`); | ||
| } | ||
| await new Promise((resolve) => setTimeout(resolve, waitMs)); | ||
| } | ||
| } | ||
|
|
||
| const response = await fetch(url, { | ||
| method, | ||
| body: method === "POST" ? JSON.stringify(params) : undefined, | ||
|
|
@@ -53,6 +85,8 @@ async function call<T extends z.ZodType>( | |
| const json = await response.json(); | ||
|
|
||
| if (response.status === 429) { | ||
| retries++; | ||
|
|
||
| const rateLimit = RateLimitResponseSchema.safeParse(json); | ||
| const retryAfterMs = rateLimit.success | ||
| ? rateLimit.data.retry_after * 1000 | ||
|
|
@@ -61,23 +95,34 @@ async function call<T extends z.ZodType>( | |
|
|
||
| const elapsedMs = performance.now() - startTime; | ||
|
|
||
| if (elapsedMs + retryAfterMs > RATE_LIMIT_TIMEOUT_MS) { | ||
| if ( | ||
| retries > MAX_RETRIES || | ||
| elapsedMs + retryAfterMs > RATE_LIMIT_TIMEOUT_MS | ||
| ) { | ||
| log.error( | ||
| { endpoint, elapsedMs, retryAfterMs, global }, | ||
| { endpoint, elapsedMs, retryAfterMs, global, retries }, | ||
| "Discord rate limit exceeded max wait time", | ||
| ); | ||
| throw new Error(`Discord rate limit exceeded`); | ||
| } | ||
|
|
||
| // Update shared rate limit gate (Math.max avoids overwriting a longer wait) | ||
| const newUntil = Date.now() + retryAfterMs; | ||
| const existing = rateLimitUntil.get(endpoint) ?? 0; | ||
| rateLimitUntil.set(endpoint, Math.max(existing, newUntil)); | ||
|
|
||
| log.warn( | ||
| { endpoint, retryAfterMs, global }, | ||
| { endpoint, retryAfterMs, global, retries }, | ||
| "Discord rate limited, retrying", | ||
| ); | ||
|
|
||
| await new Promise((resolve) => setTimeout(resolve, retryAfterMs)); | ||
| continue; | ||
| } | ||
|
|
||
| // Clear rate limit gate on success | ||
| rateLimitUntil.delete(endpoint); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Deleting the gate on any successful response is correct for the common case, but there's a subtle race: if Request A and Request B both slept on the gate, A succeeds first and deletes the gate, then B (which has already computed and is sleeping through its The only interesting case is a hypothetical Request C that arrives after A's deletion but before B's timer fires: C sees no gate and fires immediately. Since A just got a 200, the rate limit window has passed, so C getting a 200 is correct behaviour. No issue, just documenting the reasoning in case this edge case comes up in a future review. |
||
|
|
||
| if (!response.ok) { | ||
| log.error( | ||
| { endpoint, status: response.status, body: json }, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor naming note:
MAX_RETRIES = 5withretries > MAX_RETRIESallows 6 total API calls (1 initial + 5 retries). Using>=instead would give you exactlyMAX_RETRIEScalls, or renaming the constant toMAX_ATTEMPTS = 6would make the intent self-documenting. The current behaviour is conventional for retry logic, and the test comment at line 882 confirms it's intentional — just worth a quick sanity check that 6 total calls is what you want.