GPT-4 Vision Image Recognition Complete Guide

Use GPT-4o's visual capabilities for advanced tasks like image understanding, OCR, and object detection

  • Image understanding: deep scene analysis
  • OCR recognition: text extraction and recognition
  • Object detection: identify and locate objects
  • Multi-image analysis: batch image processing

1. Basic Image Analysis

Getting Started

import openai
import base64
import requests
from PIL import Image
import io

# Configure API (this example uses the pre-1.0 openai Python SDK interface)
openai.api_key = "your-api-key"
openai.api_base = "https://api.n1n.ai/v1"

def encode_image(image_path):
    """Encode image to base64"""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def analyze_image(image_path, prompt):
    """Analyze image using GPT-4 Vision"""
    # Encode image
    base64_image = encode_image(image_path)
    
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "high"  # Options: low, high, auto
                        }
                    }
                ]
            }
        ],
        max_tokens=1000
    )
    
    return response.choices[0].message.content

# Usage example
result = analyze_image(
    "photo.jpg",
    "Please describe this image in detail, including scene, objects, colors and atmosphere"
)
print(result)

# Analyze web images
def analyze_url_image(image_url, prompt):
    """Analyze web images"""
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url}
                    }
                ]
            }
        ]
    )
    return response.choices[0].message.content

Supported formats

  • JPEG / JPG
  • PNG
  • WebP
  • GIF (non-animated)

Image detail

  • low: Fast and low-cost
  • high: High-precision analysis
  • auto: Auto selection

Size limits

  • Max 20MB per image
  • Recommended longest side < 2048px
  • Oversized images are scaled down automatically (see the preprocessing sketch below)
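These constraints can be checked locally before uploading. Below is a minimal validation sketch based on the limits listed above; the helper name validate_image_for_vision and the console warning are illustrative assumptions, not part of the API:

import os
from PIL import Image

SUPPORTED_FORMATS = {"JPEG", "PNG", "WEBP", "GIF"}
MAX_FILE_SIZE = 20 * 1024 * 1024  # 20MB limit noted above

def validate_image_for_vision(image_path):
    """Check format and size before sending an image to the API."""
    if os.path.getsize(image_path) > MAX_FILE_SIZE:
        raise ValueError(f"{image_path} exceeds the 20MB limit")

    img = Image.open(image_path)
    if img.format not in SUPPORTED_FORMATS:
        raise ValueError(f"Unsupported format: {img.format}")
    if img.format == "GIF" and getattr(img, "is_animated", False):
        raise ValueError("Animated GIFs are not supported")

    # Warn if the longest side exceeds the recommended 2048px
    if max(img.size) > 2048:
        print(f"Note: {image_path} is {img.size}; it will be scaled down automatically")
    return True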

2. Multi-image Processing

Batch analysis

# Multi-image analysis
def analyze_multiple_images(image_paths, prompt):
    """Analyze multiple images simultaneously"""
    content = [{"type": "text", "text": prompt}]
    
    # Add all images as separate content parts
    for path in image_paths:
        base64_image = encode_image(path)
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}",
                "detail": "low"  # Recommend using low for multiple images to save tokens
            }
        })
    
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": content
        }],
        max_tokens=2000
    )
    
    return response.choices[0].message.content

# Usage example: Compare multiple images
images = ["product1.jpg", "product2.jpg", "product3.jpg"]
comparison = analyze_multiple_images(
    images,
    "Please compare the design features of these three products, pointing out their similarities and differences"
)

# Image sequence analysis (like consecutive frames)
def analyze_image_sequence(frames, task="describe_changes"):
    """Analyze image sequences, like video frames"""
    
    if task == "describe_changes":
        prompt = "These are time-series images. Please describe the changes from the first to the last image."
    elif task == "detect_motion":
        prompt = "Analyze the movement and actions in these consecutive frames."
    elif task == "summarize_story":
        prompt = "Tell a coherent story based on these images."
    else:
        prompt = "Describe each of these images in order."  # Fallback for unrecognized tasks
    
    return analyze_multiple_images(frames, prompt)

💡 Multi-image tips

  • Use low detail mode to save tokens
  • Analyze related images together for better results
  • Note the total context limit of 128K tokens (see the budget check below)
  • Useful for comparison and sequence analysis
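A rough pre-flight check can catch oversized batches before they hit the context limit. This is a minimal sketch assuming low-detail images at the fixed 85 tokens each (see the formula in section 5) and a crude 4-characters-per-token estimate for the prompt:

LOW_DETAIL_TOKENS = 85      # Fixed cost per low-detail image (see section 5)
CONTEXT_WINDOW = 128_000    # Total token limit noted above

def fits_in_context(image_paths, prompt, reserve_for_output=2000):
    """Rough check that a multi-image request fits the context window."""
    prompt_tokens = len(prompt) // 4  # Crude ~4 chars/token estimate
    image_tokens = LOW_DETAIL_TOKENS * len(image_paths)
    return prompt_tokens + image_tokens + reserve_for_output <= CONTEXT_WINDOW

# Example: check before calling analyze_multiple_images
if fits_in_context(["product1.jpg", "product2.jpg"], "Compare these products"):
    print("Request should fit within the context window")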

3. OCR Text Recognition

Text extraction

# OCR text recognition application
def extract_text_from_image(image_path, language="Chinese and English"):
    """Extract text from an image"""
    prompt = f"""Please extract all {language} text from the image.
    Requirements:
    1. Preserve original formatting and layout
    2. Mark uncertain characters
    3. If there are tables, present them in Markdown format
    4. Recognize all visible text, including watermarks and labels
    """
    
    return analyze_image(image_path, prompt)

# Document analysis
def analyze_document(image_path):
    """Analyze a document image"""
    prompt = """Please analyze this document image:
    1. Document type (invoice, contract, report, etc.)
    2. Extract key information (dates, amounts, parties, etc.)
    3. Document structure and layout
    4. Important clauses or key points
    Please output results in a structured format.
    """
    
    result = analyze_image(image_path, prompt)
    
    # Post-process into a structured result
    return {
        "raw_text": extract_text_from_image(image_path),
        "analysis": result
    }

# Handwriting recognition
def recognize_handwriting(image_path):
    """Recognize handwritten content"""
    prompt = """Please recognize the handwritten content in the image:
    1. Transcribe all handwritten text
    2. Mark unclear characters with [?]
    3. Preserve original paragraph structure
    4. Identify any charts or symbols
    """
    
    return analyze_image(image_path, prompt)

# Table data extraction
def extract_table_data(image_path):
    """Extract table data from an image"""
    prompt = """Please convert the table in the image to CSV format:
    1. First row is the header
    2. Use commas as separators
    3. Handle merged cells
    4. Preserve all data
    
    Output format example:
    col1,col2,col3
    data1,data2,data3
    """
    
    csv_data = analyze_image(image_path, prompt)
    
    # Can be further processed into a DataFrame
    import pandas as pd
    from io import StringIO
    
    try:
        df = pd.read_csv(StringIO(csv_data))
        return df
    except Exception:
        # Fall back to the raw text if the model's output is not valid CSV
        return csv_data

OCR capabilities

  • Multilingual text recognition
  • Handwriting recognition
  • Table data extraction
  • Document structure analysis
  • Formula/symbol recognition

Use cases

  • Invoice recognition (see the sketch below)
  • ID document recognition
  • Menu translation
  • Notes digitization
  • Exam paper grading
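As an illustration of the invoice use case, the sketch below asks the model for structured JSON and parses the reply. The field list and the fence-stripping fallback are assumptions for demonstration, not a fixed schema:

import json

def extract_invoice_fields(image_path):
    """Extract key invoice fields as JSON (illustrative field list)."""
    prompt = """Extract the following fields from this invoice image and
    return ONLY a JSON object: invoice_number, date, vendor, total_amount, currency.
    Use null for any field that is not visible."""

    raw = analyze_image(image_path, prompt)

    cleaned = raw.strip()
    if cleaned.startswith("```"):
        # The model may wrap JSON in a code fence; drop the fence lines before parsing
        cleaned = "\n".join(line for line in cleaned.splitlines() if not line.startswith("```"))

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return {"raw_response": raw}  # Fall back to raw text if parsing fails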

4. Advanced Application Scenarios

Professional analysis

# Advanced application scenarios

# 1. Medical image analysis (Note: Educational reference only, not a diagnosis)
def analyze_medical_image(image_path):
    """Analyze a medical image"""
    prompt = """Please analyze this medical image (for educational purposes only):
    1. Describe visible anatomical structures
    2. Point out any obvious abnormalities
    3. Evaluate image quality
    4. Suggest possible further examinations if needed
    
    Disclaimer: This is an AI analysis and not a substitute for professional medical diagnosis.
    """
    
    return analyze_image(image_path, prompt)

# 2. Product quality inspection
def quality_inspection(image_path, product_type="electronic"):
    """Product quality inspection"""
    prompt = f"""Please inspect this {product_type} product image:
    1. Surface defects (scratches, dents, stains, etc.)
    2. Assembly issues (alignment, gaps, looseness, etc.)
    3. Correctness of labels and markings
    4. Overall quality rating (1-10)
    
    Please list all identified issues in detail.
    """
    
    return analyze_image(image_path, prompt)

# 3. Security monitoring analysis
def security_analysis(image_path):
    """Analyze a security monitoring image"""
    prompt = """Analyze this surveillance image:
    1. Number of people in the scene
    2. Activities and behaviors
    3. Suspicious or abnormal situations
    4. Environmental safety hazards
    5. Time and lighting conditions
    """
    
    return analyze_image(image_path, prompt)

# 4. Artwork analysis
def analyze_artwork(image_path):
    """Analyze an artwork"""
    prompt = """Please analyze this artwork:
    1. Artistic style and movement
    2. Techniques and media used
    3. Color usage and composition
    4. Themes and symbolism
    5. Possible period of creation
    6. Artistic value evaluation
    """
    
    return analyze_image(image_path, prompt)

# 5. Real estate evaluation
def analyze_property(image_paths):
    """Analyze real estate images"""
    prompt = """Please evaluate these real estate photos:
    1. Room types and features
    2. Decoration style and quality
    3. Estimated space size
    4. Facilities and furniture conditions
    5. Maintenance status
    6. Improvement suggestions
    """
    
    return analyze_multiple_images(image_paths, prompt)

5. Performance Optimization

Optimization strategies

# Performance optimization strategies
import os
from PIL import Image

class VisionOptimizer:
    """GPT-4 Vision optimization toolkit"""
    
    @staticmethod
    def resize_image(image_path, max_size=(2048, 2048)):
        """Resize image to optimize performance"""
        img = Image.open(image_path)
        img.thumbnail(max_size, Image.Resampling.LANCZOS)
        
        # Flatten transparency onto white, or convert other modes, so the image can be saved as JPEG
        if img.mode in ('RGBA', 'LA'):
            background = Image.new('RGB', img.size, (255, 255, 255))
            background.paste(img, mask=img.split()[-1])
            img = background
        elif img.mode != 'RGB':
            img = img.convert('RGB')
        
        # Save the optimized copy next to the original file
        optimized_path = os.path.join(
            os.path.dirname(image_path),
            f"optimized_{os.path.basename(image_path)}"
        )
        img.save(optimized_path, "JPEG", quality=85)
        
        return optimized_path
    
    @staticmethod
    def calculate_token_cost(image_path, detail="high"):
        """Estimate image token consumption"""
        img = Image.open(image_path)
        width, height = img.size
        
        if detail == "low":
            # Low-detail mode fixed at 85 tokens
            return 85
        
        # High-detail mode calculation
        # First scale within 2048x2048
        if width > 2048 or height > 2048:
            ratio = min(2048/width, 2048/height)
            width = int(width * ratio)
            height = int(height * ratio)
        
        # Then scale so the shortest side is 768px
        if min(width, height) > 768:
            ratio = 768 / min(width, height)
            width = int(width * ratio)
            height = int(height * ratio)
        
        # Calculate number of 512x512 tiles
        tiles_width = (width + 511) // 512
        tiles_height = (height + 511) // 512
        total_tiles = tiles_width * tiles_height
        
        # 170 tokens per tile, plus base 85 tokens
        return 170 * total_tiles + 85
    
    @staticmethod
    def batch_process_images(image_paths, prompt, batch_size=5):
        """Batch process images"""
        results = []
        
        for i in range(0, len(image_paths), batch_size):
            batch = image_paths[i:i+batch_size]
            
            # Optimize each image
            optimized_batch = []
            for path in batch:
                optimized_path = VisionOptimizer.resize_image(path)
                optimized_batch.append(optimized_path)
            
            # Batch analysis
            result = analyze_multiple_images(optimized_batch, prompt)
            results.append(result)
            
            # Cleanup temporary files
            for path in optimized_batch:
                os.remove(path)
        
        return results

# Usage example
optimizer = VisionOptimizer()

# 1. Calculate token cost
image = "large_photo.jpg"
tokens = optimizer.calculate_token_cost(image, detail="high")
cost = tokens * 0.01 / 1000  # Example pricing assumption
print(f"Estimated {tokens} tokens, cost about ${cost:.4f}")

# 2. Optimize large images
optimized = optimizer.resize_image("huge_image.png")

# 3. Batch processing
images = ["img1.jpg", "img2.jpg", "img3.jpg", "img4.jpg", "img5.jpg"]
results = optimizer.batch_process_images(
    images,
    "Describe the main content of each image",
    batch_size=3
)

📊 Token calculation formula

Low detail: Fixed 85 tokens

High detail: 170 × tiles + 85 tokens

Where tiles = ⌈width/512⌉ × ⌈height/512⌉
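Worked example: a 1920×1080 photo in high detail already fits within 2048×2048, so it is only scaled until its shortest side is 768px, giving roughly 1365×768. That is ⌈1365/512⌉ × ⌈768/512⌉ = 3 × 2 = 6 tiles, so 170 × 6 + 85 = 1105 tokens.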

6. Streaming

Real-time analysis

# Stream processing for large volumes of images
import asyncio
import json
import aiohttp
from typing import AsyncGenerator

class VisionStreamer:
    """Stream image analysis"""
    
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.n1n.ai/v1"
    
    async def analyze_image_stream(
        self,
        image_path: str,
        prompt: str
    ) -> AsyncGenerator[str, None]:
        """Stream image analysis results"""
        
        base64_image = encode_image(image_path)
        
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{self.base_url}/chat/completions",
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": "gpt-4o",
                    "messages": [{
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                }
                            }
                        ]
                    }],
                    "stream": True
                }
            ) as response:
                async for line in response.content:
                    if not line:
                        continue
                    decoded = line.decode('utf-8').strip()
                    if not decoded.startswith("data: "):
                        continue
                    data = decoded[6:]
                    if data == "[DONE]":
                        break
                    try:
                        chunk = json.loads(data)
                        content = chunk['choices'][0]['delta'].get('content', '')
                        if content:
                            yield content
                    except (json.JSONDecodeError, KeyError, IndexError):
                        continue  # Skip malformed or empty chunks

# Usage: streaming processing
async def main():
    streamer = VisionStreamer("your-api-key")
    
    # Output analysis results in real time
    async for chunk in streamer.analyze_image_stream(
        "complex_image.jpg",
        "Analyze all elements in this image in detail"
    ):
        print(chunk, end='', flush=True)

# Run the streaming analysis
asyncio.run(main())

7. Best Practices

🎯 Improve accuracy

  • ✅ Provide clear, high-quality images
  • ✅ Use specific, detailed prompts
  • ✅ Ensure good lighting for text recognition
  • ✅ Avoid excessive image compression
  • ✅ Provide contextual information

💰 Cost control

  • ✅ Preprocess to adjust image size
  • ✅ Choose detail level appropriately
  • ✅ Batch similar tasks
  • ✅ Cache common analysis results (see the sketch below)
  • ✅ Monitor token usage
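A minimal sketch of the caching idea above, keyed on image bytes plus prompt; the .vision_cache directory and the JSON file format are illustrative choices, not a required layout:

import hashlib
import json
import os

CACHE_DIR = ".vision_cache"  # Illustrative local cache location

def cached_analyze_image(image_path, prompt):
    """Return a cached analysis if the same image + prompt was seen before."""
    os.makedirs(CACHE_DIR, exist_ok=True)

    # Key on the image bytes and the prompt, so a change to either invalidates the cache
    with open(image_path, "rb") as f:
        key = hashlib.sha256(f.read() + prompt.encode("utf-8")).hexdigest()
    cache_file = os.path.join(CACHE_DIR, f"{key}.json")

    if os.path.exists(cache_file):
        with open(cache_file) as f:
            return json.load(f)["result"]

    result = analyze_image(image_path, prompt)  # Defined in section 1
    with open(cache_file, "w") as f:
        json.dump({"result": result}, f)
    return result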