Embeddings: Complete Guide to Text Vectorization

Convert text into high-dimensional vectors and build on them: semantic search, similarity calculation, clustering analysis, and other advanced features.

  • Semantic Search: understand query intent
  • Vector Database: efficient storage and retrieval
  • Similarity Calculation: precise content matching
  • RAG System: knowledge-enhanced generation

1. Basic Text Vectorization

Getting Started

import openai
import numpy as np

# Configure API
openai.api_base = "https://api.n1n.ai/v1"
openai.api_key = "your-api-key"

# 1. Basic text vectorization
def get_embedding(text, model="text-embedding-3-small"):
    """Get vector representation of text"""
    response = openai.Embedding.create(
        model=model,
        input=text
    )
    return response['data'][0]['embedding']

# Single text vectorization
text = "Machine Learning is a branch of Artificial Intelligence"
embedding = get_embedding(text)
print(f"Vector dimensions: {len(embedding)}")  # 1536 dimensions

# Batch text vectorization
texts = [
    "Deep Learning is a subset of Machine Learning",
    "Neural networks simulate brain structure",
    "Natural language processing enables machines to understand human language"
]

response = openai.Embedding.create(
    model="text-embedding-3-small",
    input=texts
)

embeddings = [item['embedding'] for item in response['data']]
print(f"Processed {len(embeddings)} texts")

Model Selection

  • text-embedding-3-small (1536 dims)
  • text-embedding-3-large (3072 dims)
  • text-embedding-ada-002 (1536 dims, previous generation)

Cost Comparison (a rough estimate sketch follows these lists)

  • Small: $0.02/1M tokens
  • Large: $0.13/1M tokens
  • Ada-002: $0.10/1M tokens

Application Scenarios

  • Q&A systems
  • Recommendation systems
  • Content deduplication
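
To make the cost comparison concrete, here is a rough estimate sketch. The characters-divided-by-4 token approximation is a crude assumption; use tiktoken for exact counts:

# Rough cost estimate (prices per 1M tokens, from the comparison above)
PRICE_PER_1M_TOKENS = {
    "text-embedding-3-small": 0.02,
    "text-embedding-3-large": 0.13,
    "text-embedding-ada-002": 0.10,
}

def estimate_cost(texts, model="text-embedding-3-small"):
    # chars/4 is a rough token heuristic, not an exact count
    approx_tokens = sum(len(t) // 4 for t in texts)
    return approx_tokens / 1_000_000 * PRICE_PER_1M_TOKENS[model]

corpus = ["some document text"] * 10_000
print(f"Estimated cost: ${estimate_cost(corpus):.4f}")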

2. Semantic Similarity Calculation

Similarity Search

import numpy as np
from scipy.spatial.distance import cosine
import openai

openai.api_base = "https://api.n1n.ai/v1"
openai.api_key = "your-api-key"

def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity"""
    return 1 - cosine(vec1, vec2)

def get_embeddings(texts):
    """Batch get text vectors"""
    response = openai.Embedding.create(
        model="text-embedding-3-small",
        input=texts
    )
    return [item['embedding'] for item in response['data']]

# Semantic similarity calculation example
documents = [
    "Python is a popular programming language",
    "JavaScript is used for frontend development",
    "Machine Learning requires large amounts of data",
    "Deep Learning is an important AI technology",
    "Cats are cute pets"
]

query = "The relationship between Artificial Intelligence and Deep Learning"

# Get all vectors
doc_embeddings = get_embeddings(documents)
query_embedding = get_embeddings([query])[0]

# Calculate similarity
similarities = []
for i, doc_emb in enumerate(doc_embeddings):
    sim = cosine_similarity(query_embedding, doc_emb)
    similarities.append((documents[i], sim))

# Sort and display results
similarities.sort(key=lambda x: x[1], reverse=True)

print(f"Query: {query}\n")
print("Similarity ranking:")
for doc, sim in similarities:
    print(f"  {sim:.4f} - {doc}")

# Output example:
# Similarity ranking:
#   0.8234 - Deep Learning is an important AI technology
#   0.7891 - Machine Learning requires large amounts of data
#   0.4523 - Python is a popular programming language
#   0.3912 - JavaScript is used for frontend development
#   0.1234 - Cats are cute pets
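
When comparing one query against many documents, the per-pair loop above can be collapsed into a single matrix operation. A minimal vectorized sketch with NumPy, relying on the vectors being unit-normalized:

# Vectorized alternative: one matrix-vector product replaces the loop
E = np.array(doc_embeddings)   # shape: (n_docs, 1536)
q = np.array(query_embedding)  # shape: (1536,)
sims = E @ q                   # cosine similarities, since vectors are unit-length

for idx in sims.argsort()[::-1]:  # best match first
    print(f"  {sims[idx]:.4f} - {documents[idx]}")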

💡 Similarity Threshold Reference

  • > 0.9: Nearly identical content
  • 0.8-0.9: Highly relevant
  • 0.7-0.8: Clearly relevant
  • 0.6-0.7: Somewhat relevant
  • < 0.6: Low relevance

3. Vector Database Integration

Chroma Vector Store

import chromadb
import openai

# Initialize a persistent Chroma client (Chroma 0.4+ API, matching the RAG section below)
client = chromadb.PersistentClient(path="./chroma_db")

# Create or get collection
collection = client.get_or_create_collection(
    name="documents",
    metadata={"hnsw:space": "cosine"}  # Use cosine similarity
)

# Configure OpenAI
openai.api_base = "https://api.n1n.ai/v1"
openai.api_key = "your-api-key"

def get_embedding(text):
    response = openai.Embedding.create(
        model="text-embedding-3-small",
        input=text
    )
    return response['data'][0]['embedding']

# 1. Add documents to vector database
documents = [
    {"id": "doc1", "text": "Vue.js is a progressive JavaScript framework", "metadata": {"category": "frontend"}},
    {"id": "doc2", "text": "React uses virtual DOM to improve performance", "metadata": {"category": "frontend"}},
    {"id": "doc3", "text": "Django is a Python web framework", "metadata": {"category": "backend"}},
    {"id": "doc4", "text": "FastAPI provides automatic documentation generation", "metadata": {"category": "backend"}},
    {"id": "doc5", "text": "PostgreSQL is a relational database", "metadata": {"category": "database"}}
]

# Batch add: embed everything first, then insert in a single call
embeddings = [get_embedding(doc["text"]) for doc in documents]
collection.add(
    embeddings=embeddings,
    documents=[doc["text"] for doc in documents],
    metadatas=[doc["metadata"] for doc in documents],
    ids=[doc["id"] for doc in documents]
)

print(f"Added {len(documents)} documents to vector database")

# 2. Semantic search
query = "Frontend framework performance optimization"
query_embedding = get_embedding(query)

# Search for most similar documents
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3,
    where={"category": "frontend"}  # Optional: metadata filtering
)

print(f"\nQuery: {query}")
print("Search results:")
for i, (doc, dist) in enumerate(zip(results['documents'][0], results['distances'][0])):
    print(f"  {i+1}. Similarity: {1-dist:.4f}")
    print(f"     Documentation: {doc}")

# 3. Update and delete
# Update a document
collection.update(
    ids=["doc1"],
    embeddings=[get_embedding("Vue 3 provides composition API")],
    documents=["Vue 3 provides composition API"]
)

# Delete documents
collection.delete(ids=["doc5"])

# 4. Persistence
# PersistentClient writes to disk automatically; no explicit persist() call is needed

Open-source Solutions

  • Chroma - Lightweight and easy to use
  • Weaviate - Feature-rich
  • Milvus - High performance
  • Qdrant - Rust implementation

Cloud Services

  • Pinecone - Fully managed
  • Zilliz Cloud - Managed Milvus
  • Supabase Vector
  • MongoDB Atlas Vector Search

Selection Suggestions

  • Development & testing: Chroma
  • Production environment: Pinecone
  • Large scale: Milvus
  • Hybrid search: Weaviate

4. RAG System Implementation

Complete RAG Workflow

import openai
from typing import List, Dict
import chromadb
from chromadb.utils import embedding_functions

class RAGSystem:
    def __init__(self, api_key: str, collection_name: str = "knowledge_base"):
        # Configure OpenAI
        openai.api_key = api_key
        openai.api_base = "https://api.n1n.ai/v1"
        
        # Initialize vector database
        self.client = chromadb.PersistentClient(path="./rag_db")
        
        # Use OpenAI embedding function
        self.embedding_function = embedding_functions.OpenAIEmbeddingFunction(
            api_key=api_key,
            api_base="https://api.n1n.ai/v1",
            model_name="text-embedding-3-small"
        )
        
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function
        )
    
    def add_documents(self, documents: List[Dict[str, str]]):
        """Add documents to knowledge base"""
        self.collection.add(
            documents=[doc["content"] for doc in documents],
            metadatas=[{"source": doc.get("source", "unknown")} for doc in documents],
            ids=[doc["id"] for doc in documents]
        )
        return f"Added {len(documents)} documents"
    
    def search(self, query: str, n_results: int = 3) -> List[Dict]:
        """Semantic search for related documents"""
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results
        )
        
        return [{
            "content": doc,
            "distance": dist,
            "metadata": meta
        } for doc, dist, meta in zip(
            results['documents'][0],
            results['distances'][0],
            results['metadatas'][0]
        )]
    
    def generate_answer(self, query: str, context_docs: List[str]) -> str:
        """Generate answers based on retrieved documents"""
        context = "\n\n".join(context_docs)
        
        prompt = f"""Answer the question based on the following documents. If there is no relevant information in the documents, please say "I cannot answer this question based on the provided documents". 

Document content:
{context}

Question: {query}

Answer:"""
        
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a document-based Q&A assistant"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3
        )
        
        return response.choices[0].message.content
    
    def ask(self, question: str) -> Dict:
        """Complete RAG workflow"""
        # 1. Retrieve relevant documents
        relevant_docs = self.search(question, n_results=3)
        
        # 2. Generate answer
        doc_contents = [doc["content"] for doc in relevant_docs]
        answer = self.generate_answer(question, doc_contents)
        
        return {
            "question": question,
            "answer": answer,
            "sources": relevant_docs
        }

# Usage example
rag = RAGSystem(api_key="your-api-key")

# Add knowledge documents
documents = [
    {"id": "1", "content": "Python 3.12 introduced better error messages", "source": "python_docs"},
    {"id": "2", "content": "TypeScript 5.0 supports decorators", "source": "ts_docs"},
    {"id": "3", "content": "React 18 introduced concurrent features", "source": "react_docs"}
]

rag.add_documents(documents)

# Ask question
result = rag.ask("What are the new features in the latest Python version?")
print(f"Question: {result['question']}")
print(f"Answer: {result['answer']}")
print(f"Sources: {[s['metadata']['source'] for s in result['sources']]}")

5. Text Clustering and Visualization

Clustering Analysis

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import openai

openai.api_base = "https://api.n1n.ai/v1"
openai.api_key = "your-api-key"

def get_embeddings(texts):
    response = openai.Embedding.create(
        model="text-embedding-3-small",
        input=texts
    )
    return np.array([item['embedding'] for item in response['data']])

# Text data
texts = [
    # Technology category
    "Python is a programming language",
    "JavaScript is used for web development",
    "Machine learning algorithms are powerful",
    "Deep learning models need training",
    
    # Animal category
    "Cats are independent pets",
    "Dogs are loyal friends",
    "Birds can fly",
    "Fish swim in water",
    
    # Food category
    "Pizza is Italian cuisine",
    "Sushi comes from Japan",
    "Burgers are fast food",
    "Salads are healthy"
]

# Get vectors
embeddings = get_embeddings(texts)

# K-Means clustering
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# Print clustering results
for i in range(n_clusters):
    print(f"\nCluster {i+1}:")
    cluster_texts = [texts[j] for j, c in enumerate(clusters) if c == i]
    for text in cluster_texts:
        print(f"  - {text}")

# Dimensionality reduction visualization
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings)

# Plot clustering graph
plt.figure(figsize=(10, 8))
colors = ['red', 'blue', 'green']

for i in range(n_clusters):
    cluster_points = embeddings_2d[clusters == i]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], 
               c=colors[i], label=f'Cluster {i+1}', alpha=0.6)

# Add text labels
for i, txt in enumerate(texts):
    plt.annotate(txt, (embeddings_2d[i, 0], embeddings_2d[i, 1]),
                fontsize=8, alpha=0.7)

plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('Text Vector Clustering Visualization')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
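
The number of clusters is hard-coded to 3 above because the data is known to cover three topics. When the grouping is unknown, a common heuristic is to scan several values of k and keep the one with the best silhouette score. A short sketch; the 2-6 search range is an assumption:

# Choose k by silhouette score; the search range is an assumption
from sklearn.metrics import silhouette_score

best_k, best_score = None, -1.0
for k in range(2, 7):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(embeddings)
    score = silhouette_score(embeddings, labels, metric="cosine")
    print(f"k={k}: silhouette={score:.3f}")
    if score > best_score:
        best_k, best_score = k, score

print(f"Best k: {best_k}")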

6. Performance Optimization Tips

Optimization Strategies

import openai
import time
import asyncio
from concurrent.futures import ThreadPoolExecutor
import numpy as np

openai.api_base = "https://api.n1n.ai/v1"
openai.api_key = "your-api-key"

# 1. Batch processing optimization
def batch_embeddings(texts, batch_size=100):
    """Process large volumes of text in batches"""
    all_embeddings = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        
        response = openai.Embedding.create(
            model="text-embedding-3-small",
            input=batch
        )
        
        # Avoid shadowing the enclosing function name
        batch_embs = [item['embedding'] for item in response['data']]
        all_embeddings.extend(batch_embs)
        
        # Avoid rate limits
        if i + batch_size < len(texts):
            time.sleep(0.1)
    
    return all_embeddings

# 2. Concurrent processing
async def async_get_embedding(text, semaphore):
    """Asynchronously get embedding for a single text"""
    async with semaphore:
        response = await openai.Embedding.acreate(
            model="text-embedding-3-small",
            input=text
        )
        return response['data'][0]['embedding']

async def concurrent_embeddings(texts, max_concurrent=5):
    """Concurrently process multiple texts"""
    semaphore = asyncio.Semaphore(max_concurrent)
    tasks = [async_get_embedding(text, semaphore) for text in texts]
    return await asyncio.gather(*tasks)

# 3. Caching strategy
class EmbeddingCache:
    def __init__(self, cache_size=1000):
        self.cache = {}
        self.cache_size = cache_size
    
    def get_embedding(self, text, model="text-embedding-3-small"):
        # Check cache
        cache_key = f"{model}:{text}"
        if cache_key in self.cache:
            return self.cache[cache_key]
        
        # Get new embedding
        response = openai.Embedding.create(
            model=model,
            input=text
        )
        embedding = response['data'][0]['embedding']
        
        # Update cache (simple FIFO eviction, not a true LRU)
        if len(self.cache) >= self.cache_size:
            # Evict the oldest inserted entry
            oldest_key = next(iter(self.cache))
            del self.cache[oldest_key]
        
        self.cache[cache_key] = embedding
        return embedding

# 4. Vector compression (dimensionality reduction)
def compress_embeddings(embeddings, target_dim=256):
    """Use PCA to reduce vector dimensions"""
    from sklearn.decomposition import PCA
    
    pca = PCA(n_components=target_dim)
    compressed = pca.fit_transform(embeddings)
    
    print(f"Compression ratio: {embeddings.shape[1]} -> {compressed.shape[1]}")
    print(f"Explained variance retained: {sum(pca.explained_variance_ratio_):.2%}")
    
    return compressed, pca

# 5. Model selection strategy
def smart_embedding(text, use_large=False):
    """Select model based on text length"""
    text_length = len(text)
    
    if text_length > 8000 or use_large:
        # Use large model for long text or when high precision is required
        model = "text-embedding-3-large"
    else:
        # Use small model for short text, faster and cheaper
        model = "text-embedding-3-small"
    
    response = openai.Embedding.create(
        model=model,
        input=text
    )
    
    return {
        "embedding": response['data'][0]['embedding'],
        "model": model,
        "tokens": response['usage']['total_tokens']
    }

# Usage example
cache = EmbeddingCache()
texts = ["Example text 1", "Example text 2", "Example text 1"]  # Note duplicate

# Processing with cache
for text in texts:
    embedding = cache.get_embedding(text)
    print(f"Processing: {text[:20]}...")

⚡ Performance Optimization

  • ✅ Batch processing to reduce API calls
  • ✅ Concurrent requests to increase throughput
  • ✅ Caching to avoid repeated computation
  • ✅ Vector compression to save storage (native option in the sketch below)
  • ✅ Smart model selection
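
As a native alternative to PCA compression, the text-embedding-3 models accept a dimensions request parameter that returns shorter vectors directly. A hedged sketch; verify that your installed SDK version forwards this parameter to the API:

# Native shortening via the `dimensions` parameter (text-embedding-3 only);
# confirm your SDK version passes it through to the API
response = openai.Embedding.create(
    model="text-embedding-3-small",
    input="Example text",
    dimensions=256
)
print(f"Vector dimensions: {len(response['data'][0]['embedding'])}")  # expected: 256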

💰 Cost Control

  • ✅ Use small models for most tasks
  • ✅ Preprocess text to remove redundancy
  • ✅ Configure a reasonable caching strategy
  • ✅ Periodically clean up unused vectors
  • ✅ Monitor API usage