Precise dual rate limiting for LLM APIs (RPM + TPM)
@aid-on/llm-throttle is a high-precision rate limiting library specialized for LLM API calls. It simultaneously controls both RPM (Requests Per Minute) and TPM (Tokens Per Minute) to achieve efficient API usage.
- Dual Rate Limiting: Simultaneously manages both RPM and TPM
- Token Bucket Algorithm: Smoothed rate limiting with burst handling
- Real-time Adjustment: Post-adjustment based on actual token consumption
- Detailed Metrics: Usage visualization and efficiency tracking
- Full TypeScript Support: Type-safe development experience
- Zero Dependencies: Lightweight design with no external library dependencies
```bash
npm install @aid-on/llm-throttle
```
```typescript
import { LLMThrottle } from '@aid-on/llm-throttle';

// Configure rate limits
const limiter = new LLMThrottle({
  rpm: 60,    // 60 requests per minute
  tpm: 10000  // 10,000 tokens per minute
});

// Check before request
const requestId = 'unique-request-id';
const estimatedTokens = 1500;

if (limiter.consume(requestId, estimatedTokens)) {
  // Execute API call
  const response = await callLLMAPI();

  // Adjust with actual token usage
  const actualTokens = response.usage.total_tokens;
  limiter.adjustConsumption(requestId, actualTokens);
} else {
  console.log('Rate limit reached');
}
```
```typescript
const limiter = new LLMThrottle({
  rpm: 60,
  tpm: 10000,
  burstRPM: 120,   // Allow up to 120 requests in short bursts
  burstTPM: 20000  // Allow up to 20,000 tokens in short bursts
});
```
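For intuition, burst limits follow the standard token-bucket model: capacity refills at a steady rate and is capped at a burst ceiling. The sketch below is a minimal illustration of that idea only, not the library's internal implementation.

```typescript
// Minimal token-bucket sketch (illustration only, not the library's internals).
// Capacity refills continuously at `ratePerMinute` and is capped at `burst`.
class TokenBucket {
  private tokens: number;
  private lastRefill = Date.now();

  constructor(private ratePerMinute: number, private burst: number) {
    this.tokens = burst;
  }

  tryTake(amount: number): boolean {
    const now = Date.now();
    const elapsedMinutes = (now - this.lastRefill) / 60_000;
    this.tokens = Math.min(this.burst, this.tokens + elapsedMinutes * this.ratePerMinute);
    this.lastRefill = now;
    if (this.tokens < amount) return false;
    this.tokens -= amount;
    return true;
  }
}

// Two buckets approximate dual limiting: one counts requests, one counts tokens.
const rpmBucket = new TokenBucket(60, 120);      // rpm: 60, burstRPM: 120
const tpmBucket = new TokenBucket(10000, 20000); // tpm: 10000, burstTPM: 20000
```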
```typescript
import { RateLimitError } from '@aid-on/llm-throttle';

try {
  limiter.consumeOrThrow(requestId, estimatedTokens);
  // API call processing
} catch (error) {
  if (error instanceof RateLimitError) {
    console.log(`Limit reason: ${error.reason}`);
    console.log(`Available in: ${error.availableIn}ms`);
  }
}
```
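Because `RateLimitError` reports `availableIn`, a common pattern is to wait that long and retry. The helper below is a hedged sketch: `withThrottle`, its name, and its retry policy are illustrative choices, not part of the library API.

```typescript
import { LLMThrottle, RateLimitError } from '@aid-on/llm-throttle';

// Hypothetical helper: wait out the reported window, then retry.
async function withThrottle<T>(
  limiter: LLMThrottle,
  requestId: string,
  estimatedTokens: number,
  fn: () => Promise<T>,
  maxRetries = 3
): Promise<T> {
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      limiter.consumeOrThrow(requestId, estimatedTokens);
      return await fn();
    } catch (error) {
      if (error instanceof RateLimitError && attempt < maxRetries) {
        // Sleep until the limiter expects capacity to be available again
        await new Promise((resolve) => setTimeout(resolve, error.availableIn));
        continue;
      }
      throw error;
    }
  }
  throw new Error('unreachable');
}
```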
```typescript
const metrics = limiter.getMetrics();

console.log('RPM usage:', metrics.rpm.percentage + '%');
console.log('TPM usage:', metrics.tpm.percentage + '%');
console.log('Average tokens/request:', metrics.consumptionHistory.averageTokensPerRequest);
console.log('Estimation accuracy:', metrics.efficiency);
```
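For ongoing visibility you can sample metrics periodically; the interval and log format below are just one possible setup, reusing the `limiter` instance from the snippets above.

```typescript
// Log usage every 10 seconds (interval and format are illustrative choices).
setInterval(() => {
  const m = limiter.getMetrics();
  console.log(
    `[throttle] RPM ${m.rpm.used}/${m.rpm.limit} (${m.rpm.percentage}%), ` +
    `TPM ${m.tpm.used}/${m.tpm.limit} (${m.tpm.percentage}%), ` +
    `efficiency ${m.efficiency}`
  );
}, 10_000);
```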
```typescript
const check = limiter.canProcess(estimatedTokens);

if (check.allowed) {
  // Can process
  limiter.consume(requestId, estimatedTokens);
} else {
  console.log(`Limit reason: ${check.reason}`);
  console.log(`Available in: ${check.availableIn}ms`);
}
```
```typescript
new LLMThrottle(config: DualRateLimitConfig)
```
- `canProcess(estimatedTokens: number): RateLimitCheckResult` - Check if processing is possible
- `consume(requestId: string, estimatedTokens: number, metadata?: Record<string, unknown>): boolean` - Consume tokens
- `consumeOrThrow(requestId: string, estimatedTokens: number, metadata?: Record<string, unknown>): void` - Throw an error on consumption failure
- `adjustConsumption(requestId: string, actualTokens: number): void` - Adjust with actual consumption
- `getMetrics(): RateLimitMetrics` - Get usage metrics
- `getConsumptionHistory(): ConsumptionRecord[]` - Get consumption history
- `reset(): void` - Reset limit state
- `setHistoryRetention(ms: number): void` - Set history retention period
```typescript
interface DualRateLimitConfig {
  rpm: number;
  tpm: number;
  burstRPM?: number;
  burstTPM?: number;
  clock?: () => number;
}

interface RateLimitCheckResult {
  allowed: boolean;
  reason?: 'rpm_limit' | 'tpm_limit';
  availableIn?: number;
  availableTokens?: {
    rpm: number;
    tpm: number;
  };
}

interface RateLimitMetrics {
  rpm: {
    used: number;
    available: number;
    limit: number;
    percentage: number;
  };
  tpm: {
    used: number;
    available: number;
    limit: number;
    percentage: number;
  };
  efficiency: number;
  consumptionHistory: {
    count: number;
    averageTokensPerRequest: number;
    totalTokens: number;
  };
}
```
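The optional `clock` field lets you inject a time source, which is useful for deterministic tests. A sketch under the assumption that the limiter reads time exclusively through this function (the mutable `now` variable is ours):

```typescript
import { LLMThrottle } from '@aid-on/llm-throttle';

// Sketch: manual clock for deterministic tests.
let now = 0;
const testLimiter = new LLMThrottle({
  rpm: 60,
  tpm: 10000,
  clock: () => now // milliseconds, controlled by the test
});

testLimiter.consume('req-1', 5000);
now += 60_000; // advance one simulated minute; capacity should refill
console.log(testLimiter.canProcess(5000).allowed);
```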
```typescript
import OpenAI from 'openai';
import { LLMThrottle } from '@aid-on/llm-throttle';

const openai = new OpenAI();
const limiter = new LLMThrottle({
  rpm: 500,  // Example OpenAI Tier 1 limits
  tpm: 10000
});

async function chatCompletion(messages: any[], requestId: string) {
  const estimatedTokens = estimateTokens(messages); // Custom estimation logic

  if (!limiter.consume(requestId, estimatedTokens)) {
    throw new Error('Rate limit reached');
  }

  try {
    const response = await openai.chat.completions.create({
      model: 'gpt-3.5-turbo',
      messages
    });

    // Adjust with actual usage
    const actualTokens = response.usage?.total_tokens || estimatedTokens;
    limiter.adjustConsumption(requestId, actualTokens);

    return response;
  } catch (error) {
    // Release the estimated tokens on error
    limiter.adjustConsumption(requestId, 0);
    throw error;
  }
}
```
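The example assumes a custom `estimateTokens` helper. One rough sketch uses the common "about 4 characters per token" heuristic for English text; a real tokenizer (for example tiktoken) gives much better estimates in production.

```typescript
// Hypothetical helper: rough token estimate from message content.
// Uses the ~4 characters per token heuristic; a real tokenizer is more accurate.
function estimateTokens(messages: { role: string; content: string }[]): number {
  const chars = messages.reduce((sum, m) => sum + m.content.length, 0);
  const perMessageOverhead = 4; // rough allowance for role/formatting tokens
  return Math.ceil(chars / 4) + messages.length * perMessageOverhead;
}
```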
```typescript
import { LLMThrottle, RateLimitError } from '@aid-on/llm-throttle';

class APIManager {
  private limiters = new Map<string, LLMThrottle>();

  constructor() {
    // Service-specific limit configuration
    this.limiters.set('openai', new LLMThrottle({ rpm: 500, tpm: 10000 }));
    this.limiters.set('anthropic', new LLMThrottle({ rpm: 1000, tpm: 20000 }));
  }

  async callAPI(service: string, requestId: string, estimatedTokens: number) {
    const limiter = this.limiters.get(service);
    if (!limiter) throw new Error(`Unknown service: ${service}`);

    const check = limiter.canProcess(estimatedTokens);
    if (!check.allowed) {
      throw new RateLimitError(
        `Rate limit exceeded for ${service}: ${check.reason}`,
        check.reason!,
        check.availableIn!
      );
    }

    limiter.consume(requestId, estimatedTokens);
    // API call processing...
  }
}
```
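Usage might then look like the snippet below; the service names match the constructor above, and the request ids are placeholders.

```typescript
const manager = new APIManager();

// Each call is checked against its service-specific limiter.
await manager.callAPI('openai', 'req-001', 1200);
await manager.callAPI('anthropic', 'req-002', 800);
```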
Run the test suite with `npm test`.

MIT License