Token Calculation and Optimization Complete Guide
Deeply understand tokens, master counting methods and optimization techniques, and effectively reduce API usage costs.
Token counting
Accurately measure usage
Prompt compression
Reduce input length
Cost control
Reduce usage cost
Usage analysis
Optimize usage patterns
1. What is a token?
Token definition
A token is the basic unit of text a language model processes; in English, one token averages roughly 4 characters, or about 0.75 words.
"Hello world" → 2 tokens
"你好世界" → 4 tokens
"ChatGPT is amazing!" → 5 tokens
Billing
- • Input tokens: the content you send
- • Output tokens: the content the AI generates
- • Billing unit: per 1K tokens
- • Price difference: output tokens typically cost 2-4x as much as input tokens
💡 Context window limits
GPT-4o: 128K tokens
Claude 3.5: 200K tokens
GPT-3.5: 16K tokens
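Before sending a request, it is worth checking that the prompt plus the expected reply fits within the model's context window. A minimal sketch, where the limits simply mirror the numbers above and cl100k_base is used as a fallback encoding:

import tiktoken

# Context window sizes (tokens), mirroring the list above
CONTEXT_LIMITS = {"gpt-4o": 128_000, "gpt-3.5-turbo": 16_000}

def fits_context(prompt: str, model: str = "gpt-4o", reserve_for_output: int = 1_000) -> bool:
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: approximate with cl100k_base
        encoding = tiktoken.get_encoding("cl100k_base")
    used = len(encoding.encode(prompt))
    # Leave room for the model's reply
    return used + reserve_for_output <= CONTEXT_LIMITS[model]

print(fits_context("Summarize the attached report.", "gpt-4o"))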
2. Precise token counting
Counting tools
import tiktoken

class TokenCounter:
    def __init__(self):
        self.encoders = {
            "gpt-4": tiktoken.encoding_for_model("gpt-4"),
            "gpt-3.5-turbo": tiktoken.encoding_for_model("gpt-3.5-turbo")
        }

    def _get_encoding(self, model: str):
        # Fall back to cl100k_base for models that are not preloaded
        return self.encoders.get(model, tiktoken.get_encoding("cl100k_base"))

    def count_tokens(self, text: str, model: str = "gpt-4") -> int:
        return len(self._get_encoding(model).encode(text))

    def count_messages_tokens(self, messages: list, model: str = "gpt-4") -> int:
        encoding = self._get_encoding(model)
        tokens_per_message = 3  # per-message overhead for OpenAI chat models
        num_tokens = 0
        for message in messages:
            num_tokens += tokens_per_message
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
        num_tokens += 3  # end marker: every reply is primed with assistant tokens
        return num_tokens

# Usage example
counter = TokenCounter()
text = "Hello, how can I help you?"
tokens = counter.count_tokens(text)
print(f"Token count: {tokens}")
# Cost estimation
def estimate_cost(input_tokens: int, output_tokens: int, model: str = "gpt-4o"):
    # Prices in USD per 1K tokens
    pricing = {
        "gpt-4o": {"input": 0.0025, "output": 0.01},
        "gpt-4o-mini": {"input": 0.00015, "output": 0.0006}
    }
    cost = (input_tokens * pricing[model]["input"] +
            output_tokens * pricing[model]["output"]) / 1000
    return cost

💰 Cost reference
| Model | Input price | Output price |
|---|---|---|
| GPT-4o | $2.5/M | $10/M |
| GPT-4o mini | $0.15/M | $0.6/M |
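As a quick sanity check, a request with 10,000 input tokens and 2,000 output tokens works out like this with the estimate_cost helper above (prices as in the table; check the current price list before relying on these figures):

# GPT-4o: 10,000 * $0.0025/1K + 2,000 * $0.01/1K = $0.025 + $0.02
print(estimate_cost(10_000, 2_000, "gpt-4o"))       # ≈ 0.045
# GPT-4o mini: 10,000 * $0.00015/1K + 2,000 * $0.0006/1K
print(estimate_cost(10_000, 2_000, "gpt-4o-mini"))  # ≈ 0.0027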
3. Prompt optimization
Smart compression
import re

class PromptOptimizer:
    def compress_prompt(self, prompt: str, max_tokens: int = 1000) -> str:
        # 1. Remove redundant politeness phrases (case-insensitive)
        redundant_words = ["please", "could you", "I would like"]
        for word in redundant_words:
            prompt = re.sub(re.escape(word), "", prompt, flags=re.IGNORECASE)
        # 2. Use abbreviations
        abbreviations = {
            "for example": "e.g.",
            "that is": "i.e.",
            "et cetera": "etc."
        }
        for full, abbr in abbreviations.items():
            prompt = prompt.replace(full, abbr)
        # 3. Simplify directives: drop verbose qualifiers like "in (great) detail"
        prompt = re.sub(r"\bin (great )?detail\b", "", prompt, flags=re.IGNORECASE)
        # Collapse the extra whitespace left behind by the removals
        return re.sub(r"\s+", " ", prompt).strip()

# Optimization example
original = "Please could you analyze this text in detail"
optimized = PromptOptimizer().compress_prompt(original)
# Result: "analyze this text"

Optimization tips
- • Remove polite filler
- • Use abbreviations
- • Simplify directives
- • Remove redundancy
Optimization impact
Saves 30-50% of tokens on average
Preserves semantics
Improves response speed
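To check the savings on your own prompts, compare token counts before and after compression with the TokenCounter from section 2 (a small sketch reusing the counter, original, and optimized variables defined above; the percentages are averages, so measure on real prompts):

before = counter.count_tokens(original)
after = counter.count_tokens(optimized)
# Report how many tokens the compression removed
print(f"{before} -> {after} tokens ({(1 - after / before) * 100:.0f}% saved)")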
4. Caching strategies
Smart caching
import hashlib
from datetime import datetime, timedelta

class TokenCache:
    def __init__(self):
        self.cache = {}
        self.stats = {"hits": 0, "misses": 0, "tokens_saved": 0}

    def get_key(self, prompt: str, model: str) -> str:
        content = f"{prompt}:{model}"
        return hashlib.md5(content.encode()).hexdigest()

    def get(self, prompt: str, model: str):
        key = self.get_key(prompt, model)
        if key in self.cache:
            entry = self.cache[key]
            if datetime.now() < entry["expires"]:
                self.stats["hits"] += 1
                self.stats["tokens_saved"] += entry["tokens"]
                return entry["response"]
            del self.cache[key]  # drop expired entries
        self.stats["misses"] += 1
        return None

    def set(self, prompt: str, model: str, response: str, tokens: int):
        key = self.get_key(prompt, model)
        self.cache[key] = {
            "response": response,
            "tokens": tokens,
            "expires": datetime.now() + timedelta(hours=24)  # 24-hour TTL
        }

5. Smart chunking
Document chunking
def smart_chunk_text(text: str, max_tokens: int = 1500):
    """Split text into chunks under a token budget, breaking on sentence boundaries."""
    sentences = text.split('.')
    chunks = []
    current_chunk = []
    current_tokens = 0
    for sentence in sentences:
        sentence_tokens = counter.count_tokens(sentence + '.')
        if current_chunk and current_tokens + sentence_tokens > max_tokens:
            # Current chunk is full: flush it and start a new one
            chunks.append('.'.join(current_chunk) + '.')
            current_chunk = [sentence]
            current_tokens = sentence_tokens
        else:
            current_chunk.append(sentence)
            current_tokens += sentence_tokens
    if current_chunk:
        chunks.append('.'.join(current_chunk) + '.')
    return chunks

6. Real-world application
Document processing
# Real-world use: document summarization optimization
# Assumes the helpers defined earlier; process_with_api and merge_summaries
# are placeholders for your own API call and merge step.
optimizer = PromptOptimizer()
cache = TokenCache()

def optimized_document_summary(document: str):
    # 1. Check cache
    cached = cache.get(document[:100], "summary")
    if cached:
        return cached
    # 2. Count tokens
    tokens = counter.count_tokens(document)
    if tokens < 2000:
        # Process directly
        summary = process_with_api(document)
    else:
        # Process in chunks
        chunks = smart_chunk_text(document)
        summaries = []
        for chunk in chunks:
            # Optimize the prompt per chunk
            prompt = f"Summarize: {chunk}"
            optimized = optimizer.compress_prompt(prompt)
            summaries.append(process_with_api(optimized))
        # Merge summaries
        summary = merge_summaries(summaries)
    # 3. Cache the result
    cache.set(document[:100], "summary", summary, tokens)
    return summary

7. Optimization strategy summary
🎯 Input optimization
- ✅ Use concise directives
- ✅ Remove redundant content
- ✅ Use abbreviations
- ✅ Preprocess text
- ✅ Smart chunking
💰 Cost control
- ✅ Implement caching
- ✅ Batch processing
- ✅ Choose the right model
- ✅ Monitor usage
- ✅ Budget limits (see the sketch below)
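A minimal sketch of the last two points, combining usage monitoring with a hard budget cap on top of the estimate_cost helper from section 2 (the BudgetTracker class and the budget figures are hypothetical, not part of any provider API):

class BudgetTracker:
    def __init__(self, monthly_budget_usd: float):
        self.budget = monthly_budget_usd
        self.spent = 0.0

    def record(self, input_tokens: int, output_tokens: int, model: str = "gpt-4o"):
        # Log the cost of a completed request
        self.spent += estimate_cost(input_tokens, output_tokens, model)

    def allow_request(self) -> bool:
        # Refuse new requests once the monthly budget is exhausted
        return self.spent < self.budget

tracker = BudgetTracker(monthly_budget_usd=100.0)
tracker.record(10_000, 2_000)
print(f"Spent ${tracker.spent:.3f}, allowed: {tracker.allow_request()}")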
📊 Optimization impact
Prompt optimization: saves 30-50% of input tokens
Caching strategy: saves 40-60% on repeated requests
Batch processing: 5-10x higher throughput (see the sketch below)
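"Batch processing" here mainly means issuing independent requests concurrently instead of one at a time, so total wall-clock time approaches that of the slowest single request. A minimal asyncio sketch, where process_with_api_async is a hypothetical async stand-in for the process_with_api placeholder used in section 6:

import asyncio

async def process_with_api_async(prompt: str) -> str:
    # Placeholder: call your provider's async chat-completions client here
    ...

async def summarize_chunks(chunks: list) -> list:
    # Launch all chunk requests at once and wait for them together
    tasks = [process_with_api_async(f"Summarize: {chunk}") for chunk in chunks]
    return await asyncio.gather(*tasks)

# summaries = asyncio.run(summarize_chunks(smart_chunk_text(long_document)))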