LangChain + LLM API Complete Integration Guide
Build intelligent applications with the LangChain framework, supporting RAG, Agents, Memory, and more.
RAG System
Knowledge retrieval augmentation
Intelligent Agent
Autonomous decision-making and tool calling
Chained Calls
Modular task orchestration
Multi-model Support
Unified interface calls
1. Basic Configuration and Usage
Getting Started
# Install LangChain
pip install langchain langchain-openai chromadb tiktoken
# Basic configuration example
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
# Initialize model
llm = ChatOpenAI(
base_url="https://api.n1n.ai/v1",
api_key="your-api-key",
model="gpt-4o",
temperature=0.7
)
# Create prompt template
prompt = ChatPromptTemplate.from_messages([
("system", "You are a professional {role}"),
("human", "{input}")
])
# Create chain
chain = LLMChain(llm=llm, prompt=prompt)
# Run the chain
response = chain.run(role="translation expert", input="Hello World")
print(response)
Supported models
- • GPT-4o / GPT-3.5-turbo
- • Claude 3.5 Sonnet
- • Gemini Pro
- • Local Llama / Mistral
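Because the service exposes an OpenAI-compatible endpoint, switching models is in principle just a change of the model name; the Claude identifier below is an assumed example, so confirm the exact name against your provider's model list:
# Hypothetical model identifier; confirm the exact name with your gateway
claude_llm = ChatOpenAI(
    base_url="https://api.n1n.ai/v1",
    api_key="your-api-key",
    model="claude-3-5-sonnet"  # assumed name, provider-specific
)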
Core components
- • Prompts - prompt management
- • Models - model interfaces
- • Chains - task chains
- • Memory - contextual memory
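These components also compose directly with the LCEL pipe syntax. The following is a minimal sketch that reuses the prompt and llm from the Getting Started example, adding only the StrOutputParser import:
# LCEL composition: prompt -> model -> output parser (reuses prompt and llm from above)
from langchain_core.output_parsers import StrOutputParser

lcel_chain = prompt | llm | StrOutputParser()
print(lcel_chain.invoke({"role": "translation expert", "input": "Hello World"}))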
2. RAG Retrieval-Augmented Generation
Complete RAG implementation
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
import os
# Configure API
os.environ["OPENAI_API_BASE"] = "https://api.n1n.ai/v1"
os.environ["OPENAI_API_KEY"] = "your-api-key"
# 1. Load the documents
loader = PyPDFLoader("document.pdf")
documents = loader.load()
# 2. Split the documents
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
length_function=len,
separators=["\n\n", "\n", ". ", "! ", "? ", ", ", " ", ""]
)
texts = text_splitter.split_documents(documents)
# 3. Create the vector store
embeddings = OpenAIEmbeddings(
model="text-embedding-3-small",
openai_api_base="https://api.n1n.ai/v1"
)
vectorstore = Chroma.from_documents(
documents=texts,
embedding=embeddings,
persist_directory="./chroma_db"
)
# 4. Create the retrieval chain
llm = ChatOpenAI(
model="gpt-4o",
temperature=0,
openai_api_base="https://api.n1n.ai/v1"
)
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=vectorstore.as_retriever(
search_type="similarity",
search_kwargs={"k": 3}
),
return_source_documents=True
)
# 5. Query
query = "What are the main points mentioned in the document?"
result = qa_chain({"query": query})
print("Answer: ", result["result"])
print("\nRelated documents: ")
for doc in result["source_documents"]:
print(f"- {doc.page_content[:100]}...")💡 RAG Best Practices
- • Chunk size: 500–1500 tokens, adjust by content type
- • Overlap: 10–20% to ensure context continuity
- • Embeddings: text-embedding-3-small is cost-effective
- • Retrieval k: 3–5 related documents is optimal
- • Vector stores: Chroma for local dev, Pinecone for production
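As a rough sketch of these guidelines (the numbers are illustrative, not prescriptive), a token-based splitter and a slightly wider retriever could be configured like this, reusing the documents and vectorstore from the example above:
# Illustrative values only: split by token count, retrieve 4 chunks
token_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",   # tokenizer family used by recent OpenAI models
    chunk_size=800,                # within the 500-1500 token range
    chunk_overlap=120              # roughly 15% overlap
)
token_chunks = token_splitter.split_documents(documents)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4}         # 3-5 documents is usually enough
)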
3. Intelligent Agent System
Tool-calling Agent
from langchain.agents import create_openai_functions_agent, AgentExecutor
from langchain.tools import Tool, StructuredTool
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage
import requests
from pydantic import BaseModel, Field
# Define tools
class WeatherInput(BaseModel):
    city: str = Field(description="City name")

def get_weather(city: str) -> str:
    """Get weather information"""
    # Call a real API in production
    return f"The weather in {city} is sunny, 25°C"

def search_web(query: str) -> str:
    """Search the web for information"""
    return f"Search results: information related to {query}..."
# Create tool list
tools = [
StructuredTool.from_function(
func=get_weather,
name="get_weather",
description="Get the weather for a specified city",
args_schema=WeatherInput
),
Tool(
name="search_web",
func=search_web,
description="Search the web for information"
)
]
# Initialize model
llm = ChatOpenAI(
model="gpt-4o",
temperature=0,
openai_api_base="https://api.n1n.ai/v1",
openai_api_key="your-api-key"
)
# Create prompt template
prompt = ChatPromptTemplate.from_messages([
("system", """You are an intelligent assistant and can use the following tools:
- get_weather: get weather information
- search_web: search the web
Choose appropriate tools based on the user's question."""),
MessagesPlaceholder(variable_name="chat_history", optional=True),
("human", "{input}"),
MessagesPlaceholder(variable_name="agent_scratchpad")
])
# Create agent
agent = create_openai_functions_agent(llm, tools, prompt)
agent_executor = AgentExecutor(
agent=agent,
tools=tools,
verbose=True,
max_iterations=3
)
# Run agent
response = agent_executor.invoke({
"input": "What's the weather like in Beijing today? Is tomorrow good for a trip?"
})
print(response["output"])Common tools
- • 搜索引擎
- • 计算器
- • Data库查询
- • API Call
Agent types
- • ReAct Agent
- • OpenAI Functions
- • Plan-and-Execute
- • Self-Ask
Use cases
- • 客服机器人
- • DataAnalyze
- • 任务自动化
- • 研究助手
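For comparison with the OpenAI Functions agent above, here is a minimal ReAct-style sketch. It reuses the llm and tools defined earlier; the prompt wording is an illustrative version of the standard ReAct format:
from langchain.agents import create_react_agent
from langchain.prompts import PromptTemplate

react_prompt = PromptTemplate.from_template(
    """Answer the question using the tools below.

{tools}

Use this format:
Question: the input question
Thought: reason about what to do next
Action: one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (Thought/Action/Action Input/Observation can repeat)
Thought: I now know the final answer
Final Answer: the answer to the original question

Question: {input}
Thought: {agent_scratchpad}"""
)

react_agent = create_react_agent(llm, tools, react_prompt)
react_executor = AgentExecutor(
    agent=react_agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True  # ReAct output parsing can be brittle
)
print(react_executor.invoke({"input": "What's the weather in Shanghai?"})["output"])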
4. Memory System
Multiple memory types
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import ChatMessageHistory
# 1. Basic conversational memory
memory = ConversationBufferMemory()
llm = ChatOpenAI(
model="gpt-4o",
openai_api_base="https://api.n1n.ai/v1",
openai_api_key="your-api-key"
)
conversation = ConversationChain(
llm=llm,
memory=memory,
verbose=True
)
# Conversation example
print(conversation.run("My name is Xiaoming"))
print(conversation.run("What name did I just say?"))
# 2. Summary-buffer memory (keeps a rolling summary for long conversations)
summary_memory = ConversationSummaryBufferMemory(
    llm=llm,
    max_token_limit=100
)
summary_chain = ConversationChain(
llm=llm,
memory=summary_memory,
verbose=True
)
# 3. Vector memory (retrieve related history)
from langchain.memory import VectorStoreRetrieverMemory
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
embeddings = OpenAIEmbeddings(
openai_api_base="https://api.n1n.ai/v1"
)
vectorstore = Chroma(embedding_function=embeddings)
retriever = vectorstore.as_retriever(search_kwargs=dict(k=2))
vector_memory = VectorStoreRetrieverMemory(
retriever=retriever,
memory_key="history",
input_key="input"
)
# 4. Persistent memory
from langchain.memory import SQLiteEntityStore
from langchain.memory import ConversationEntityMemory
entity_store = SQLiteEntityStore(db_file="memory.db")
entity_memory = ConversationEntityMemory(
llm=llm,
entity_store=entity_store
)
5. Streaming Output
Real-time streaming
from langchain_openai import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import ChatPromptTemplate
import asyncio
# 1. Synchronous streaming output
llm = ChatOpenAI(
model="gpt-4o",
streaming=True,
callbacks=[StreamingStdOutCallbackHandler()],
openai_api_base="https://api.n1n.ai/v1",
openai_api_key="your-api-key"
)
# Stream output to console
response = llm.invoke("Write a story about AI")
# 2. Asynchronous streaming
async def async_stream():
    llm = ChatOpenAI(
        model="gpt-4o",
        streaming=True,
        openai_api_base="https://api.n1n.ai/v1",
        openai_api_key="your-api-key"
    )
    async for chunk in llm.astream("Tell a joke"):
        print(chunk.content, end="", flush=True)
# 3. Custom streaming callback
from langchain.callbacks.base import BaseCallbackHandler
from typing import Any, Dict, List
class CustomStreamHandler(BaseCallbackHandler):
    def __init__(self):
        self.tokens = []

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Handle new tokens"""
        self.tokens.append(token)
        # Can be sent to WebSocket, SSE, etc.
        print(f"Token: {token}", end="")

    def on_llm_end(self, response: Any, **kwargs) -> None:
        """Handle end of LLM response"""
        full_response = "".join(self.tokens)
        print(f"\nFull response length: {len(full_response)}")
# Using custom handler
custom_llm = ChatOpenAI(
model="gpt-4o",
streaming=True,
callbacks=[CustomStreamHandler()],
openai_api_base="https://api.n1n.ai/v1"
)
response = custom_llm.invoke("Explain quantum computing")
6. Performance Optimization and Cost Control
Optimization tips
from langchain.cache import SQLiteCache, InMemoryCache
from langchain.globals import set_llm_cache
from langchain_openai import ChatOpenAI
import time
# 1. Set cache
set_llm_cache(SQLiteCache(database_path=".langchain.db"))
# Or use in-memory cache
# set_llm_cache(InMemoryCache())
llm = ChatOpenAI(
model="gpt-3.5-turbo", # Use a more cost-effective model
openai_api_base="https://api.n1n.ai/v1",
openai_api_key="your-api-key"
)
# First call (will be cached)
start = time.time()
response1 = llm.invoke("What is machine learning?")
print(f"First call took: {time.time() - start:.2f}s")
# Second call (reads from cache)
start = time.time()
response2 = llm.invoke("What is machine learning?")
print(f"Second call took: {time.time() - start:.2f}s")
# 2. Batch optimization
from langchain.callbacks import get_openai_callback
prompts = ["Explain Python", "Explain JavaScript", "Explain Rust"]
# Use callback to track costs
with get_openai_callback() as cb:
    # Batch processing
    responses = llm.batch(prompts, config={"max_concurrency": 3})
    print(f"Total tokens: {cb.total_tokens}")
    print(f"Estimated cost: ${cb.total_cost:.4f}")
# 3. Prompt compression
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.prompts.example_selector import LengthBasedExampleSelector
examples = [
{"input": "2+2", "output": "4"},
{"input": "5*3", "output": "15"},
{"input": "10/2", "output": "5"}
]
example_selector = LengthBasedExampleSelector(
    examples=examples,
    example_prompt=PromptTemplate.from_template("Input: {input}\nOutput: {output}"),
    max_length=50  # Limit example length
)
# 4. Model fallback strategy: try the primary model, fall back on failure
primary_llm = ChatOpenAI(
model="gpt-4o",
openai_api_base="https://api.n1n.ai/v1"
)
fallback_llm = ChatOpenAI(
model="gpt-3.5-turbo",
openai_api_base="https://api.n1n.ai/v1"
)
# Automatically fall back if primary model fails
try:
    response = primary_llm.invoke("A complex question...")
except Exception:
    response = fallback_llm.invoke("A complex question...")
🎯 Optimization strategies
- ✅ Enable cache to reduce duplicate calls
- ✅ Use batching to increase throughput
- ✅ Compress prompts to reduce token usage
- ✅ Use model fallback for simple tasks
- ✅ Use async concurrency
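The async-concurrency point can be sketched as follows; the question list is arbitrary, and llm is the instance configured earlier in this section:
import asyncio

async def answer_concurrently(questions):
    # ainvoke requests run concurrently; llm.abatch(questions) is an equivalent shortcut
    return await asyncio.gather(*(llm.ainvoke(q) for q in questions))

answers = asyncio.run(answer_concurrently(["Explain Python", "Explain Rust"]))
for a in answers:
    print(a.content[:80])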
💰 Cost Control
- ✅ Use GPT-3.5 for routine tasks
- ✅ Use GPT-4o for complex reasoning
- ✅ Use local models for private data
- ✅ Monitor token usage
- ✅ Set cost alerts
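To illustrate the last two points, the token and cost counters from get_openai_callback can feed a simple alert check; the threshold and alerting channel below are placeholders:
from langchain.callbacks import get_openai_callback

COST_ALERT_THRESHOLD = 0.05  # hypothetical per-request budget in USD

with get_openai_callback() as cb:
    answer = llm.invoke("Summarize this quarter's support tickets")
    if cb.total_cost > COST_ALERT_THRESHOLD:
        # Swap print for a real alerting channel (logging, email, webhook, ...)
        print(f"Cost alert: ${cb.total_cost:.4f} exceeded the ${COST_ALERT_THRESHOLD} budget")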