故障排查指南: 快速定位和解决问题

在开发和运维大语言模型应用时, 难免会遇到各种问题. 本指南汇总了最常见的故障场景和解决方案, 帮助您快速定位问题, 恢复服务.

常见API错误

401 Unauthorized

问题表现

{
  "error": {
    "message": "Invalid API key provided",
    "type": "invalid_request_error",
    "code": "invalid_api_key"
  }
}

排查步骤

  1. 检查API密钥是否正确
  2. 确认密钥是否过期或被禁用
  3. 验证请求头(Request Headers)格式是否正确

解决方案

// Correct request header format: the key is sent as a Bearer token in the
// Authorization header ('YOUR_API_KEY' is a placeholder).
const headers = {
  Authorization: 'Bearer YOUR_API_KEY',
  'Content-Type': 'application/json',
};

// Environment-variable management: take the key from OPENAI_API_KEY and fail
// fast when it is missing or empty.
const { OPENAI_API_KEY: apiKey } = process.env;
if (!apiKey) throw new Error('API key not configured');

429 Rate Limit Exceeded

问题表现

请求频率超过限制, API返回429错误

解决方案

// Exponential-backoff retry
/**
 * Runs `fn` and retries it with exponential backoff while it fails with a
 * rate-limit error (`error.status === 429`).
 *
 * The wait before the k-th retry (k starting at 0) is `2^k * baseDelayMs`.
 * Any error whose `status` is not 429, and a 429 raised on the final attempt,
 * is rethrown unchanged.
 *
 * @param {() => Promise<*>} fn - Operation to attempt.
 * @param {number} [maxRetries=5] - Total number of attempts; must be >= 1.
 * @param {number} [baseDelayMs=1000] - Delay before the first retry, in ms.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws {RangeError} When `maxRetries` is not a number >= 1 (previously the
 *   function returned `undefined` without ever calling `fn`).
 * @throws {*} The error thrown by `fn` when it is not retryable or attempts
 *   are exhausted.
 */
async function retryWithBackoff(fn, maxRetries = 5, baseDelayMs = 1000) {
  // `!(x >= 1)` also rejects NaN and non-numeric values.
  if (!(maxRetries >= 1)) {
    throw new RangeError(`maxRetries must be >= 1, got ${maxRetries}`);
  }

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      const isLastAttempt = attempt >= maxRetries - 1;
      // Optional chaining keeps a thrown null/undefined from being masked by a TypeError.
      if (error?.status !== 429 || isLastAttempt) {
        throw error;
      }

      const delay = 2 ** attempt * baseDelayMs;
      console.log(`Rate limited, retrying in ${delay}ms`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}

// Request queue management
/**
 * Queues tasks and runs them in batches of at most `rateLimit`, pausing
 * `interval` milliseconds between consecutive batches.
 *
 * NOTE: the pause is only inserted between batches of a single drain. Once
 * the queue empties, the next `add()` starts a new batch immediately.
 */
class RateLimiter {
  /**
   * @param {number} [rateLimit=60] - Maximum number of tasks started per batch.
   * @param {number} [interval=60000] - Pause between batches, in milliseconds.
   */
  constructor(rateLimit = 60, interval = 60000) {
    this.queue = [];
    this.processing = false;
    this.rateLimit = rateLimit;
    this.interval = interval;
  }

  /**
   * Enqueues a task and starts draining the queue if it is idle.
   *
   * @param {() => (Promise<*>|*)} fn - Task to run; may be sync or async.
   * @returns {Promise<*>} Settles with the task's result or its error.
   */
  add(fn) {
    return new Promise((resolve, reject) => {
      this.queue.push({ fn, resolve, reject });
      // Not awaited on purpose: the caller waits on its own task, not on the
      // whole drain. process() routes task failures to that task's `reject`.
      this.process();
    });
  }

  /**
   * Drains the queue batch by batch. Calls made while a drain is already
   * running return immediately.
   *
   * @returns {Promise<void>}
   */
  async process() {
    if (this.processing) return;
    this.processing = true;

    try {
      while (this.queue.length > 0) {
        const batch = this.queue.splice(0, this.rateLimit);

        await Promise.all(
          batch.map(async ({ fn, resolve, reject }) => {
            // A per-task try/catch keeps a synchronous throw or a non-promise
            // return value inside the task instead of aborting the drain.
            try {
              resolve(await fn());
            } catch (error) {
              reject(error);
            }
          })
        );

        if (this.queue.length > 0) {
          await new Promise((r) => setTimeout(r, this.interval));
        }
      }
    } finally {
      // Always release the flag, otherwise one failure would block the queue forever.
      this.processing = false;
    }
  }
}

500 Internal Server Error

常见原因

  • 模型服务暂时不可用
  • 请求格式错误导致服务崩溃
  • 超长输入导致内存溢出

应急处理

// Failover strategy
// Models in priority order: the first entry is tried first, the rest are fallbacks.
const models = ['gpt-4', 'gpt-3.5-turbo', 'claude-2'];

/**
 * Sends `prompt` to the first model that does not fail with HTTP 500.
 *
 * Every call walks `models` from the start. The previous version kept the
 * position in a module-level `modelIndex` that was never reset, so a single
 * 500 demoted the primary model for good, and once every model had failed
 * all later calls threw immediately without trying anything.
 *
 * NOTE(review): `callAPI(model, prompt)` is not defined in this snippet; it is
 * assumed to reject with an error carrying a numeric `status` — confirm.
 *
 * @param {*} prompt - Passed through to `callAPI` unchanged.
 * @returns {Promise<*>} The first successful `callAPI` result.
 * @throws {Error} 'All models failed' (with the last 500 error as `cause`)
 *   when every model returned 500.
 * @throws {*} Any error whose `status` is not 500, rethrown immediately.
 */
async function callWithFallback(prompt) {
  let lastError;

  for (const model of models) {
    try {
      // `return await` so a rejection is caught here rather than escaping.
      return await callAPI(model, prompt);
    } catch (error) {
      if (error?.status !== 500) {
        throw error;
      }
      console.error(`Model ${model} failed, trying next`);
      lastError = error;
    }
  }

  throw new Error('All models failed', { cause: lastError });
}

性能问题排查

响应延迟过高

🔍 诊断工具

// Performance monitoring
/**
 * Collects simple latency and error metrics for async operations.
 *
 * `metrics.requests` counts started operations (it is incremented before the
 * operation is awaited). `metrics.totalLatency` accumulates the duration of
 * every finished operation, successful or failed, so the average stays
 * consistent with the request count.
 */
class PerformanceMonitor {
  constructor() {
    this.metrics = {
      requests: 0,
      totalLatency: 0,
      errors: 0,
      slowRequests: 0,
    };
  }

  /**
   * Runs `operation`, recording its duration and outcome.
   *
   * @param {() => Promise<*>} operation - Operation to measure.
   * @param {number} [threshold=2000] - Durations above this (ms) count as slow.
   * @returns {Promise<*>} The operation's result.
   * @throws {*} Whatever `operation` throws, after the failure is recorded.
   */
  async track(operation, threshold = 2000) {
    const start = Date.now();
    this.metrics.requests++;

    let result;
    try {
      result = await operation();
    } catch (error) {
      this.metrics.errors++;
      // Failed requests still took time; leaving them out skewed the average
      // because they were already counted in `requests`.
      this.metrics.totalLatency += Date.now() - start;
      throw error;
    }

    const duration = Date.now() - start;
    this.metrics.totalLatency += duration;

    if (duration > threshold) {
      this.metrics.slowRequests++;
      console.warn(`Slow request detected: ${duration}ms`);
      this.diagnose(duration);
    }

    return result;
  }

  /**
   * Prints the current metrics and, for durations over 5000 ms, a list of
   * likely causes.
   *
   * @param {number} duration - Duration of the slow request, in ms.
   */
  diagnose(duration) {
    console.log('Performance diagnosis:');
    console.log(`- Average latency: ${this.getAverageLatency()}ms`);
    console.log(`- Slow request rate: ${this.getSlowRate()}%`);
    console.log(`- Error rate: ${this.getErrorRate()}%`);

    // Suggest possible causes
    if (duration > 5000) {
      console.log('Possible causes:');
      console.log('- Model overload');
      console.log('- Network issues');
      console.log('- Large context size');
    }
  }

  /** @returns {number} Mean latency in whole ms; 0 before any request. */
  getAverageLatency() {
    const { requests, totalLatency } = this.metrics;
    if (requests === 0) return 0;
    return Math.round(totalLatency / requests);
  }

  /** @returns {string} Slow requests as a percentage with two decimals. */
  getSlowRate() {
    return this.#percentOfRequests(this.metrics.slowRequests);
  }

  /** @returns {string} Failed requests as a percentage with two decimals. */
  getErrorRate() {
    return this.#percentOfRequests(this.metrics.errors);
  }

  // Formats count/requests as a percentage; yields '0.00' rather than 'NaN'
  // when nothing has been tracked yet.
  #percentOfRequests(count) {
    const { requests } = this.metrics;
    const ratio = requests === 0 ? 0 : count / requests;
    return (ratio * 100).toFixed(2);
  }
}

⚡ 优化措施

前端优化

  • 实现请求去重
  • 添加本地缓存
  • 优化请求大小

后端优化

  • 启用连接池
  • 实现请求合并
  • 使用CDN加速

内存问题处理

内存泄漏排查

// Node.js内存Monitor
const v8 = require('v8');
const { performance } = require('perf_hooks');

/**
 * Samples process memory usage and flags sustained heap growth.
 *
 * Snapshot history is capped at `maxSnapshots` entries so that the monitor
 * itself does not grow without bound when sampled on a timer.
 */
class MemoryMonitor {
  /**
   * @param {number} [maxSnapshots=1000] - Maximum snapshots retained. Values
   *   below 10 are raised to 10, the window `detectLeak()` needs.
   */
  constructor(maxSnapshots = 1000) {
    this.baseline = process.memoryUsage();
    this.snapshots = [];
    // `|| 0` maps NaN/non-numeric input to 0, which Math.max then lifts to 10.
    this.maxSnapshots = Math.max(10, Math.floor(maxSnapshots) || 0);
  }

  /**
   * Records the current memory usage (all sizes in MB) and triggers a GC
   * attempt when used heap exceeds 90% of the V8 heap size limit.
   *
   * @returns {{timestamp: number, rss: number, heapTotal: number,
   *   heapUsed: number, external: number, heapLimit: number}} The new snapshot.
   */
  snapshot() {
    const BYTES_PER_MB = 1024 * 1024;
    const usage = process.memoryUsage();
    const heap = v8.getHeapStatistics();

    const snapshot = {
      timestamp: Date.now(),
      rss: usage.rss / BYTES_PER_MB,
      heapTotal: usage.heapTotal / BYTES_PER_MB,
      heapUsed: usage.heapUsed / BYTES_PER_MB,
      external: usage.external / BYTES_PER_MB,
      heapLimit: heap.heap_size_limit / BYTES_PER_MB,
    };

    this.snapshots.push(snapshot);

    // Drop the oldest entries once the cap is exceeded.
    const overflow = this.snapshots.length - this.maxSnapshots;
    if (overflow > 0) {
      this.snapshots.splice(0, overflow);
    }

    // Anomaly detection
    if (snapshot.heapUsed > snapshot.heapLimit * 0.9) {
      console.error('Critical: Heap usage > 90%');
      this.triggerGC();
    }

    return snapshot;
  }

  /**
   * Compares the oldest and newest of the last 10 snapshots and reports a
   * possible leak when used heap grew by more than 50%.
   *
   * @returns {boolean} `true` when growth exceeds 50%; `false` otherwise,
   *   including when fewer than 10 snapshots exist.
   */
  detectLeak() {
    if (this.snapshots.length < 10) return false;

    // Growth trend across the most recent window
    const recent = this.snapshots.slice(-10);
    const first = recent[0].heapUsed;
    const last = recent[recent.length - 1].heapUsed;

    // A zero baseline would turn the ratio into Infinity/NaN.
    if (!(first > 0)) return false;

    const growthRate = (last - first) / first;

    if (growthRate > 0.5) {
      console.warn('Possible memory leak detected');
      console.warn(`Memory grew by ${(growthRate * 100).toFixed(2)}%`);
      return true;
    }

    return false;
  }

  /**
   * Forces a garbage collection when `global.gc` is available (it is only
   * exposed when Node runs with --expose-gc); otherwise logs a hint.
   */
  triggerGC() {
    if (global.gc) {
      console.log('Triggering garbage collection...');
      global.gc();
    } else {
      console.warn('GC not exposed. Run with --expose-gc');
    }
  }
}

// Usage example
const memMonitor = new MemoryMonitor();

// Periodic monitoring: take a snapshot once a minute (60000 ms).
setInterval(() => {
  const { heapUsed, heapLimit } = memMonitor.snapshot();
  console.log(`Memory usage: ${heapUsed.toFixed(2)}MB / ${heapLimit.toFixed(2)}MB`);

  if (!memMonitor.detectLeak()) {
    return;
  }

  // Cleanup on a suspected leak.
  // NOTE(review): clearCaches() is not defined in this snippet — presumably
  // supplied by the application; confirm before use.
  clearCaches();
  memMonitor.triggerGC();
}, 60000);

网络问题调试

连接超时处理

诊断脚本

// Network diagnostics tool

/**
 * Opens a TLS connection to the host of `url` and resolves with the peer
 * certificate once the handshake completes.
 *
 * Socket failures arrive as 'error' events, which a try/catch around
 * `tls.connect()` cannot see, so they are turned into a rejection here.
 *
 * @param {URL} url - Parsed endpoint; its explicit port is used when present.
 * @param {number} [timeoutMs=5000] - Maximum time to wait for the handshake.
 * @returns {Promise<object>} The certificate from `socket.getPeerCertificate()`.
 */
function probeTls(url, timeoutMs = 5000) {
  const tls = require('tls');

  return new Promise((resolve, reject) => {
    const socket = tls.connect({
      host: url.hostname,
      // url.port is '' for the scheme default, which falls back to 443.
      port: Number(url.port) || 443,
      servername: url.hostname,
    });

    // destroy(err) emits 'error', which is routed to reject below.
    const timer = setTimeout(() => {
      socket.destroy(new Error(`TLS handshake timed out after ${timeoutMs}ms`));
    }, timeoutMs);

    socket.once('secureConnect', () => {
      clearTimeout(timer);
      const cert = socket.getPeerCertificate();
      socket.end();
      resolve(cert);
    });

    socket.once('error', (error) => {
      clearTimeout(timer);
      reject(error);
    });
  });
}

/**
 * Runs four diagnostic steps against `endpoint` and logs the outcome of each:
 * DNS resolution, a HEAD connection test, a TLS handshake (https endpoints
 * only) and five sequential HEAD requests to measure latency.
 *
 * Each step catches and logs its own failure, so later steps still run.
 *
 * @param {string} endpoint - Absolute URL to diagnose.
 * @returns {Promise<void>} Resolves when all steps, including the TLS probe,
 *   have finished.
 */
async function diagnoseNetwork(endpoint) {
  console.log('Starting network diagnosis...');

  // 1. DNS resolution test
  try {
    const dns = require('dns').promises;
    const start = Date.now();
    const addresses = await dns.resolve4(new URL(endpoint).hostname);
    console.log(`✓ DNS resolution: ${Date.now() - start}ms`);
    console.log(`  Resolved to: ${addresses.join(', ')}`);
  } catch (error) {
    console.error('✗ DNS resolution failed:', error.message);
  }

  // 2. Connection test
  try {
    const start = Date.now();
    const response = await fetch(endpoint, {
      method: 'HEAD',
      signal: AbortSignal.timeout(5000),
    });
    console.log(`✓ Connection test: ${Date.now() - start}ms`);
    console.log(`  Status: ${response.status}`);
  } catch (error) {
    console.error('✗ Connection failed:', error.message);
  }

  // 3. SSL/TLS test
  if (endpoint.startsWith('https')) {
    try {
      // Awaited so the result is logged before the latency test and so
      // handshake errors land in this catch instead of crashing the process.
      const cert = await probeTls(new URL(endpoint));
      console.log('✓ SSL/TLS connection established');
      console.log(`  Certificate valid until: ${cert.valid_to}`);
    } catch (error) {
      console.error('✗ SSL/TLS error:', error.message);
    }
  }

  // 4. Latency test (sequential on purpose: one request at a time)
  console.log('\nLatency test (5 requests):');
  const latencies = [];

  for (let i = 0; i < 5; i++) {
    try {
      const start = Date.now();
      await fetch(endpoint, {
        method: 'HEAD',
        signal: AbortSignal.timeout(10000),
      });
      const latency = Date.now() - start;
      latencies.push(latency);
      console.log(`  Request ${i + 1}: ${latency}ms`);
    } catch (error) {
      // Report the reason; a bare "Failed" hides whether it was a timeout,
      // a refused connection, etc.
      console.log(`  Request ${i + 1}: Failed (${error.message})`);
    }
  }

  if (latencies.length > 0) {
    const total = latencies.reduce((sum, ms) => sum + ms, 0);
    const avg = total / latencies.length;
    console.log(`\nAverage latency: ${avg.toFixed(2)}ms`);
  }
}

日志分析技巧

结构化日志记录

// 结构化日志系统
const winston = require('winston');

// Log entry format: timestamp, Error handling with `stack: true`, JSON output.
const logFormat = winston.format.combine(
  winston.format.timestamp(),
  winston.format.errors({ stack: true }),
  winston.format.json(),
);

// Two file transports: error.log is restricted to level 'error';
// combined.log sets no level of its own.
const logTransports = [
  new winston.transports.File({ filename: 'error.log', level: 'error' }),
  new winston.transports.File({ filename: 'combined.log' }),
];

const logger = winston.createLogger({
  format: logFormat,
  transports: logTransports,
});

// 请求追踪中间件
function requestLogger(req, res, next) {
  const requestId = crypto.randomUUID();
  req.requestId = requestId;
  
  const start = Date.now();
  
  // 记录请求
  logger.info('Request received', {
    requestId,
    method: req.method,
    path: req.path,
    ip: req.ip,
    userAgent: req.get('user-agent')
  });
  
  // 记录响应
  res.on('finish', () => {
    const duration = Date.now() - start;
    
    logger.info('Request completed', {
      requestId,
      statusCode: res.statusCode,
      duration,
      contentLength: res.get('content-length')
    });
    
    // 慢请求告警
    if (duration > 3000) {
      logger.warn('Slow request detected', {
        requestId,
        duration,
        path: req.path
      });
    }
  });
  
  next();
}

// Enriched error logging
/**
 * Error-handling middleware: logs the error together with the request it
 * belongs to, then forwards the same error to `next`.
 *
 * NOTE(review): `req.body` and `req.query` are logged verbatim — confirm they
 * cannot contain credentials or other sensitive data.
 *
 * @param {Error} error - The error being handled.
 * @param {object} req - Request; `requestId`, `method`, `path`, `body` and
 *   `query` are read.
 * @param {object} res - Unused.
 * @param {Function} next - Receives `error` unchanged.
 */
function errorLogger(error, req, res, next) {
  const { requestId } = req;

  const errorDetails = {
    message: error.message,
    stack: error.stack,
    type: error.constructor.name,
  };

  const requestDetails = {
    method: req.method,
    path: req.path,
    body: req.body,
    query: req.query,
  };

  logger.error('Request error', {
    requestId,
    error: errorDetails,
    request: requestDetails,
  });

  next(error);
}

生产环境调试

远程调试配置

调试工具配置

# Node.js remote debugging: bind the inspector to loopback only.
# Binding to 0.0.0.0 lets anyone who can reach port 9229 run code in the process.
node --inspect=127.0.0.1:9229 app.js

# On your workstation: forward the debug port through an SSH tunnel
ssh -L 9229:127.0.0.1:9229 user@your-server.com

# Connect with Chrome DevTools
chrome://inspect

# VS Code debug configuration (attaches through the local end of the tunnel)
{
  "type": "node",
  "request": "attach",
  "name": "Attach to Remote",
  "address": "localhost",
  "port": 9229,
  "localRoot": "${workspaceFolder}",
  "remoteRoot": "/app"
}

安全注意事项

  • ✓ 使用SSH隧道保护调试端口
  • ✓ 限制调试访问IP
  • ✓ 调试完成后关闭端口
  • ✓ 不在生产环境保留调试代码

应急响应流程

故障处理SOP

🚨 P0故障(服务完全不可用)

  1. 立即通知所有相关人员
  2. 切换到备用服务(如有)
  3. 收集故障现场信息
    • 错误日志
    • 系统指标
    • 最近变更
  4. 快速定位问题
    • 检查服务状态
    • 验证依赖服务
    • 查看监控告警
  5. 执行恢复操作
  6. 验证服务恢复
  7. 撰写故障报告

📋 故障报告模板

# 故障报告

## 基本信息
- 故障级别: P0
- 发生时间: 2024-01-20 14:30:00
- 恢复时间: 2024-01-20 15:45:00
- 影响范围: 全部用户
- 责任人: 张三

## 故障描述
API服务返回500错误, 所有请求失败

## 根本原因
数据库连接池耗尽, 新请求无法获取连接

## 处理过程
1. 14:30 - 收到告警
2. 14:35 - 定位到数据库连接问题
3. 14:40 - 扩大连接池大小
4. 14:45 - 重启应用服务
5. 15:00 - 服务逐步恢复
6. 15:45 - 完全恢复

## 改进措施
1. 增加连接池监控
2. 实现连接池自动扩容
3. 添加熔断机制

构建稳定可靠的AI服务

掌握故障排查技巧, 让您的服务保持高可用, 快速响应各种问题.

获取技术支持