故障排查指南: 快速定位和解决问题
在开发和运维大语言模型应用时, 难免会遇到各种问题. 本指南汇总了最常见的故障场景和解决方案, 帮助您快速定位问题, 恢复服务.
常见API错误
401 Unauthorized
问题表现
{
"error": {
"message": "Invalid API key provided",
"type": "invalid_request_error",
"code": "invalid_api_key"
}
}
排查步骤
- 检查API密钥是否正确
- 确认密钥是否过期或被禁用
- 验证请求头格式是否正确
解决方案
// 正确的Request Headers格式
const headers = {
'Authorization': 'Bearer YOUR_API_KEY',
'Content-Type': 'application/json'
};
// 环境变量管理
const apiKey = process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error('API key not configured');
}429 Rate Limit Exceeded
问题表现
请求频率超过限制, API返回429错误
解决方案
/**
 * Calls `fn`, retrying with exponential backoff while it fails with HTTP 429.
 *
 * The wait before retry k (0-based) is `baseDelayMs * 2 ** k`. Errors whose
 * `status` is not 429 are rethrown immediately, and a 429 on the final attempt
 * is rethrown as well.
 *
 * @param {() => Promise<*>} fn - Operation to attempt.
 * @param {number} [maxRetries=5] - Maximum number of attempts. Values below 1
 *   (or NaN) are treated as 1 so `fn` is always called at least once.
 * @param {number} [baseDelayMs=1000] - Delay before the first retry, in ms.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws The error from `fn` when it is not a 429 or attempts are exhausted.
 */
async function retryWithBackoff(fn, maxRetries = 5, baseDelayMs = 1000) {
  // `>=` is false for NaN, so invalid input also falls back to one attempt.
  const attempts = maxRetries >= 1 ? maxRetries : 1;

  for (let attempt = 0; attempt < attempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      const isLastAttempt = attempt >= attempts - 1;
      // `?.` keeps a rejected null/undefined from masking the real failure.
      if (error?.status !== 429 || isLastAttempt) {
        throw error;
      }
      const delay = 2 ** attempt * baseDelayMs;
      console.log(`Rate limited, retrying in ${delay}ms`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
// 请求队列管理
class RateLimiter {
constructor(rateLimit = 60, interval = 60000) {
this.queue = [];
this.processing = false;
this.rateLimit = rateLimit;
this.interval = interval;
}
async add(fn) {
return new Promise((resolve, reject) => {
this.queue.push({ fn, resolve, reject });
this.process();
});
}
async process() {
if (this.processing) return;
this.processing = true;
while (this.queue.length > 0) {
const batch = this.queue.splice(0, this.rateLimit);
await Promise.all(batch.map(({ fn, resolve, reject }) =>
fn().then(resolve).catch(reject)
));
if (this.queue.length > 0) {
await new Promise(r => setTimeout(r, this.interval));
}
}
this.processing = false;
}
}500 Internal Server Error
常见原因
- 模型服务暂时不可用
- 请求格式错误导致服务崩溃
- 超长输入导致内存溢出
应急处理
// Failover strategy: models are tried in order of preference.
const models = ['gpt-4', 'gpt-3.5-turbo', 'claude-2'];

/**
 * Sends `prompt` to the first model that answers, falling back to the next
 * model whenever the current one fails with HTTP 500.
 *
 * Every call starts again from the preferred model. The position is kept in a
 * local variable, so one outage does not permanently disable a model and
 * concurrent calls cannot disturb each other.
 *
 * @param {*} prompt - Passed through unchanged to `callAPI`.
 * @returns {Promise<*>} The response from the first model that succeeds.
 * @throws The original error when it is not a 500; otherwise an
 *   `Error('All models failed')` whose `cause` is the last 500 error.
 */
async function callWithFallback(prompt) {
  let lastError;
  for (const model of models) {
    try {
      return await callAPI(model, prompt);
    } catch (error) {
      // Only server-side failures justify switching models.
      if (error?.status !== 500) {
        throw error;
      }
      console.error(`Model ${model} failed, trying next`);
      lastError = error;
    }
  }
  throw new Error('All models failed', { cause: lastError });
}
性能问题排查
响应延迟过高
🔍 诊断工具
// 性能MonitorCode
class PerformanceMonitor {
constructor() {
this.metrics = {
requests: 0,
totalLatency: 0,
errors: 0,
slowRequests: 0
};
}
async track(operation, threshold = 2000) {
const start = Date.now();
this.metrics.requests++;
try {
const result = await operation();
const duration = Date.now() - start;
this.metrics.totalLatency += duration;
if (duration > threshold) {
this.metrics.slowRequests++;
console.warn(`Slow request detected: ${duration}ms`);
this.diagnose(duration);
}
return result;
} catch (error) {
this.metrics.errors++;
throw error;
}
}
diagnose(duration) {
console.log('Performance diagnosis:');
console.log(`- Average latency: ${this.getAverageLatency()}ms`);
console.log(`- Slow request rate: ${this.getSlowRate()}%`);
console.log(`- Error rate: ${this.getErrorRate()}%`);
// Analyze可能原因
if (duration > 5000) {
console.log('Possible causes:');
console.log('- Model overload');
console.log('- Network issues');
console.log('- Large context size');
}
}
getAverageLatency() {
return Math.round(this.metrics.totalLatency / this.metrics.requests);
}
getSlowRate() {
return ((this.metrics.slowRequests / this.metrics.requests) * 100).toFixed(2);
}
getErrorRate() {
return ((this.metrics.errors / this.metrics.requests) * 100).toFixed(2);
}
}⚡ Optimize措施
前端优化
- 实现请求去重
- 添加本地缓存
- 优化请求大小
后端优化
- 启用连接池
- 实现请求合并
- 使用 CDN 加速
内存问题处理
内存泄漏排查
// Node.js内存Monitor
const v8 = require('v8');
const { performance } = require('perf_hooks');
/**
 * Samples process memory usage and flags sustained heap growth.
 *
 * Only the most recent `maxSnapshots` samples are retained, so the monitor
 * does not itself grow without bound when polled for a long time.
 */
class MemoryMonitor {
  /**
   * @param {number} [maxSnapshots=1000] - Number of samples to retain.
   *   Values below 10 (or NaN) are raised to 10 because detectLeak() needs a
   *   10-sample window; pass Infinity to keep everything.
   */
  constructor(maxSnapshots = 1000) {
    this.baseline = process.memoryUsage();
    this.snapshots = [];
    this.maxSnapshots = maxSnapshots >= 10 ? maxSnapshots : 10;
  }

  /**
   * Records and returns the current memory usage, in megabytes.
   * Requests a garbage collection when heap usage exceeds 90% of the limit.
   *
   * @returns {{timestamp: number, rss: number, heapTotal: number,
   *   heapUsed: number, external: number, heapLimit: number}}
   */
  snapshot() {
    const toMB = (bytes) => bytes / 1024 / 1024;
    const usage = process.memoryUsage();
    const heap = v8.getHeapStatistics();
    const snapshot = {
      timestamp: Date.now(),
      rss: toMB(usage.rss),
      heapTotal: toMB(usage.heapTotal),
      heapUsed: toMB(usage.heapUsed),
      external: toMB(usage.external),
      heapLimit: toMB(heap.heap_size_limit),
    };

    this.snapshots.push(snapshot);
    // Drop the oldest samples once the retention cap is exceeded.
    const overflow = this.snapshots.length - this.maxSnapshots;
    if (overflow > 0) {
      this.snapshots.splice(0, overflow);
    }

    if (snapshot.heapUsed > snapshot.heapLimit * 0.9) {
      console.error('Critical: Heap usage > 90%');
      this.triggerGC();
    }
    return snapshot;
  }

  /**
   * Reports whether used heap grew by more than 50% across the last ten
   * snapshots.
   *
   * @returns {boolean} false when fewer than ten snapshots exist.
   */
  detectLeak() {
    if (this.snapshots.length < 10) return false;

    const recent = this.snapshots.slice(-10);
    const first = recent[0].heapUsed;
    const last = recent[recent.length - 1].heapUsed;
    const growthRate = (last - first) / first;
    if (growthRate > 0.5) {
      console.warn('Possible memory leak detected');
      console.warn(`Memory grew by ${(growthRate * 100).toFixed(2)}%`);
      return true;
    }
    return false;
  }

  /**
   * Forces a garbage collection when Node exposes `global.gc`; otherwise
   * warns that the process must be started with --expose-gc.
   */
  triggerGC() {
    if (global.gc) {
      console.log('Triggering garbage collection...');
      global.gc();
    } else {
      console.warn('GC not exposed. Run with --expose-gc');
    }
  }
}
// Usage example
const memMonitor = new MemoryMonitor();

// Periodic monitoring: sample memory once per minute and clean up when a
// leak is suspected.
setInterval(() => {
  const { heapUsed, heapLimit } = memMonitor.snapshot();
  console.log(`Memory usage: ${heapUsed.toFixed(2)}MB / ${heapLimit.toFixed(2)}MB`);

  if (!memMonitor.detectLeak()) {
    return;
  }
  // Release caches first, then ask for a collection.
  clearCaches();
  memMonitor.triggerGC();
}, 60000);
网络问题调试
连接超时处理
诊断脚本
// Network diagnostics tool

/**
 * Opens a TLS connection to the URL's host and resolves with the peer
 * certificate once the handshake completes.
 *
 * @param {URL} url - Parsed endpoint; its port is used when present, else 443.
 * @param {number} [timeoutMs=5000] - Abort when the socket stays idle this long.
 * @returns {Promise<object>} The peer certificate.
 */
function probeTls(url, timeoutMs = 5000) {
  const tls = require('tls');
  return new Promise((resolve, reject) => {
    const socket = tls.connect({
      host: url.hostname,
      port: Number(url.port) || 443,
      servername: url.hostname,
    });
    // Without an 'error' listener a failed handshake would be an unhandled
    // 'error' event and crash the process.
    socket.on('error', reject);
    socket.setTimeout(timeoutMs, () => {
      socket.destroy(new Error(`TLS handshake timed out after ${timeoutMs}ms`));
    });
    socket.once('secureConnect', () => {
      const cert = socket.getPeerCertificate();
      socket.end();
      resolve(cert);
    });
  });
}

/**
 * Runs DNS, connection, TLS and latency checks against `endpoint` and prints
 * the results to the console. Failures of individual checks are logged and do
 * not stop the remaining checks.
 *
 * @param {string} endpoint - Absolute URL to diagnose.
 * @returns {Promise<void>}
 */
async function diagnoseNetwork(endpoint) {
  console.log('Starting network diagnosis...');

  let url;
  try {
    url = new URL(endpoint);
  } catch (error) {
    // Nothing below can work without a parsable URL.
    console.error('✗ Invalid endpoint URL:', error.message);
    return;
  }

  // 1. DNS resolution test
  try {
    const dns = require('dns').promises;
    const start = Date.now();
    const addresses = await dns.resolve4(url.hostname);
    console.log(`✓ DNS resolution: ${Date.now() - start}ms`);
    console.log(` Resolved to: ${addresses.join(', ')}`);
  } catch (error) {
    console.error('✗ DNS resolution failed:', error.message);
  }

  // 2. Connection test
  try {
    const start = Date.now();
    const response = await fetch(endpoint, {
      method: 'HEAD',
      signal: AbortSignal.timeout(5000),
    });
    console.log(`✓ Connection test: ${Date.now() - start}ms`);
    console.log(` Status: ${response.status}`);
  } catch (error) {
    console.error('✗ Connection failed:', error.message);
  }

  // 3. SSL/TLS test, awaited so its output appears before the latency test
  //    and its errors reach the catch block.
  if (url.protocol === 'https:') {
    try {
      const cert = await probeTls(url);
      console.log('✓ SSL/TLS connection established');
      console.log(` Certificate valid until: ${cert.valid_to}`);
    } catch (error) {
      console.error('✗ SSL/TLS error:', error.message);
    }
  }

  // 4. Latency test
  console.log('\nLatency test (5 requests):');
  const latencies = [];
  for (let i = 0; i < 5; i++) {
    try {
      const start = Date.now();
      await fetch(endpoint, {
        method: 'HEAD',
        signal: AbortSignal.timeout(10000),
      });
      const latency = Date.now() - start;
      latencies.push(latency);
      console.log(` Request ${i + 1}: ${latency}ms`);
    } catch (error) {
      console.log(` Request ${i + 1}: Failed`);
    }
  }
  if (latencies.length > 0) {
    const total = latencies.reduce((sum, ms) => sum + ms, 0);
    const avg = total / latencies.length;
    console.log(`\nAverage latency: ${avg.toFixed(2)}ms`);
  }
}
日志分析技巧
结构化日志记录
// 结构化日志系统
const winston = require('winston');
// Application logger: JSON entries with timestamps and error stack traces.
// Errors go to error.log; every entry also goes to combined.log.
const logger = (() => {
  const { format, transports } = winston;

  const jsonWithStacks = format.combine(
    format.timestamp(),
    format.errors({ stack: true }),
    format.json(),
  );
  const errorFile = new transports.File({ filename: 'error.log', level: 'error' });
  const combinedFile = new transports.File({ filename: 'combined.log' });

  return winston.createLogger({
    format: jsonWithStacks,
    transports: [errorFile, combinedFile],
  });
})();
/**
 * Request-tracing middleware.
 *
 * Assigns a fresh UUID to `req.requestId`, logs the incoming request, and
 * once the response emits 'finish' logs its status, duration and content
 * length. Responses that take longer than 3000 ms get an extra warning.
 *
 * @param {object} req - Request; reads method, path, ip and get('user-agent').
 * @param {object} res - Response; reads statusCode and get('content-length').
 * @param {Function} next - Called with no arguments to continue the chain.
 */
function requestLogger(req, res, next) {
  const requestId = crypto.randomUUID();
  req.requestId = requestId;
  const startedAt = Date.now();

  // Log the incoming request.
  logger.info('Request received', {
    requestId,
    method: req.method,
    path: req.path,
    ip: req.ip,
    userAgent: req.get('user-agent'),
  });

  // Log the response once it has been fully handed off.
  const onFinish = () => {
    const duration = Date.now() - startedAt;
    logger.info('Request completed', {
      requestId,
      statusCode: res.statusCode,
      duration,
      contentLength: res.get('content-length'),
    });

    // Slow-request alert.
    if (duration <= 3000) {
      return;
    }
    logger.warn('Slow request detected', { requestId, duration, path: req.path });
  };
  res.on('finish', onFinish);

  next();
}
/**
 * Error-logging middleware: records the error together with the request that
 * caused it, then forwards the same error to the next handler.
 *
 * @param {Error} error - Error raised while handling the request.
 * @param {object} req - Request; reads requestId, method, path, body, query.
 * @param {object} res - Response (unused).
 * @param {Function} next - Called with `error` to continue error handling.
 */
function errorLogger(error, req, res, next) {
  const requestId = req.requestId;
  const errorDetails = {
    message: error.message,
    stack: error.stack,
    type: error.constructor.name,
  };
  // NOTE(review): body and query are logged verbatim; confirm they cannot
  // carry secrets or personal data before shipping these logs anywhere.
  const requestDetails = {
    method: req.method,
    path: req.path,
    body: req.body,
    query: req.query,
  };

  logger.error('Request error', {
    requestId,
    error: errorDetails,
    request: requestDetails,
  });
  next(error);
}
生产环境调试
远程调试配置
调试工具配置
# Node.js 远程调试
# 注意: 0.0.0.0 会在所有网络接口上开放调试端口, 务必通过 SSH 隧道或 IP 白名单加以保护
node --inspect=0.0.0.0:9229 app.js
# Chrome DevTools 连接
chrome://inspect
# VS Code 调试配置
{
"type": "node",
"request": "attach",
"name": "Attach to Remote",
"address": "your-server.com",
"port": 9229,
"localRoot": "${workspaceFolder}",
"remoteRoot": "/app"
}
安全注意事项
- ✓ 使用 SSH 隧道保护调试端口
- ✓ 限制调试访问 IP
- ✓ 调试完成后关闭端口
- ✓ 不在生产环境保留调试代码
应急响应流程
故障处理SOP
🚨 P0故障(服务完全不可用)
- 立即通知所有相关人员
- 切换到备用服务(如有)
- 收集故障现场信息
  - 错误日志
  - 系统指标
  - 最近变更
- 快速定位问题
  - 检查服务状态
  - 验证依赖服务
  - 查看监控告警
- 执行恢复操作
- 验证服务恢复
- 撰写故障报告
📋 故障报告模板
# 故障报告

## 基本信息
- 故障级别: P0
- 发生时间: 2024-01-20 14:30:00
- 恢复时间: 2024-01-20 15:45:00
- 影响范围: 全部用户
- 责任人: 张三

## 故障描述
API 服务返回500错误, 所有请求失败

## 根本原因
数据库连接池耗尽, 新请求无法获取连接

## 处理过程
1. 14:30 - 收到告警
2. 14:35 - 定位到数据库连接问题
3. 14:40 - 扩大连接池大小
4. 14:45 - 重启应用服务
5. 15:00 - 服务逐步恢复
6. 15:45 - 完全恢复

## 改进措施
1. 增加连接池监控
2. 实现连接池自动扩容
3. 添加熔断机制