LLM Fine-tuning: Make AI Understand Your Business

Fine-tuning adapts a pre-trained LLM to specific tasks. By continuing training on domain data, you can significantly improve performance on specialized tasks while preserving general capabilities.

Fine-tuning Methods Comparison

Method | Params Trained | VRAM Requirement | Training Speed | Quality | Use Case
Full-parameter FT | 100% | Very high | Slow | Best | Ample resources
LoRA | 0.1–1% | Low | Fast | Great | General recommendation
QLoRA | 0.1–1% | Very low | Medium | Good | Resource-constrained
Prefix Tuning | 0.01% | Very low | Very fast | Moderate | Simple tasks
Adapter | 1–5% | Medium | Fast | Good | Multi-task
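
To see where LoRA's "0.1–1%" figure comes from, here is a rough back-of-the-envelope calculation (a sketch assuming standard Llama-2-7B dimensions: 32 layers, hidden size 4096, LoRA applied to the four attention projections):

# Rough estimate of LoRA trainable parameters (illustrative only)
def lora_params(rank, in_dim, out_dim):
    # Each adapted weight matrix gains two low-rank factors: A (in_dim x r) and B (r x out_dim)
    return rank * (in_dim + out_dim)

layers, hidden, adapted_modules_per_layer, rank = 32, 4096, 4, 16
trainable = layers * adapted_modules_per_layer * lora_params(rank, hidden, hidden)
total = 7_000_000_000  # ~7B base parameters

print(f"trainable: {trainable:,} (~{trainable / total:.2%} of the base model)")
# ~16.8M trainable parameters, roughly 0.24% of a 7B model at rank 16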

LoRA Fine-tuning in Practice

Most popular parameter-efficient method

# LoRA fine-tuning example
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Load base model
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers ship without a pad token

# LoRA configuration
lora_config = LoraConfig(
    r=16,                      # LoRA rank
    lora_alpha=32,            # LoRA scaling factor
    target_modules=[          # Target modules
        "q_proj",
        "k_proj", 
        "v_proj",
        "o_proj"
    ],
    lora_dropout=0.1,         # Dropout rate
    bias="none",              # Bias handling
    task_type=TaskType.CAUSAL_LM
)

# Apply LoRA
model = get_peft_model(model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()
# Example output for this config (r=16 on the q/k/v/o projections):
# trainable params ≈ 16.8M || all params ≈ 6.7B || trainable% ≈ 0.25

# Prepare training data
def prepare_dataset(examples):
    # Build instruction format
    texts = []
    for instruction, output in zip(examples['instruction'], examples['output']):
        text = f"""### Instruction:
{instruction}

### Response:
{output}"""
        texts.append(text)
    
    # Tokenize
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt'
    )
    
    return encodings

# Training configuration
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./lora_model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="epoch"
)

# Start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # tokenized instruction data (see Data Preparation below)
    eval_dataset=eval_dataset,    # held-out split for evaluation
    tokenizer=tokenizer
)

trainer.train()

# Save LoRA weights
model.save_pretrained("./lora_weights")

# Load for inference
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)
model = PeftModel.from_pretrained(base_model, "./lora_weights")
model = model.merge_and_unload()         # Merge LoRA weights into the base model
model.save_pretrained("./merged_model")  # Save for deployment (used in the serving section below)
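
Once merged, the model is prompted with the same instruction format used during training. A minimal usage sketch (the instruction text and sampling settings are arbitrary examples):

# Generate with the merged model using the training-time prompt template
prompt = """### Instruction:
Summarize the key points of the following text: Large language models are pre-trained on massive text corpora...

### Response:
"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.9)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))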

QLoRA: 4-bit Fine-tuning

Fine-tune LLMs on consumer GPUs

# QLoRA fine-tuning — very low VRAM usage
from transformers import BitsAndBytesConfig
import torch

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load quantized model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-hf",  # 13B model
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# QLoRA configuration
qlora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Prepare the quantized model for k-bit training, then apply QLoRA
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, qlora_config)

# Rough VRAM comparison (actual usage varies with batch size and sequence length):
# FP16 13B weights alone: ~26GB (full fine-tuning needs far more for gradients and optimizer states)
# QLoRA 13B: ~7GB for the 4-bit weights, plus a small overhead for LoRA gradients and activations

# Optionally use DeepSpeed for further optimization
deepspeed_config = {
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 2e-4,
            "betas": [0.9, 0.999],
            "eps": 1e-8
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": 2e-4,
            "warmup_num_steps": 100
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8
    }
}
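
To wire this config into the Hugging Face Trainer, pass it to TrainingArguments via the deepspeed argument (the integration accepts either a dict or a path to a JSON file) and launch with the deepspeed launcher. A minimal sketch; the output path, batch sizes, and script name are placeholders:

# Hook the DeepSpeed config into the Trainer via TrainingArguments
training_args = TrainingArguments(
    output_dir="./qlora_model",          # placeholder path
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    fp16=True,                           # matches the "auto" fp16 block above
    logging_steps=10,
    deepspeed=deepspeed_config,          # dict or path to a JSON file
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)
trainer.train()
# Launch with the deepspeed launcher, e.g.: deepspeed train.py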

Data Preparation Best Practices

High-quality data is critical

Data format example

[
  {
    "instruction": "Translate the following Chinese text into English",
    "input": "机器学习是人工智能的一个分支",
    "output": "Machine learning is a branch of artificial intelligence"
  },
  {
    "instruction": "Summarize the key points of the following text",
    "input": "Large language models are pre-trained on massive text corpora...",
    "output": "Key points: 1. Pretraining 2. Data scale 3. Application capabilities"
  },
  {
    "instruction": "Write a Python function implementing quicksort",
    "input": "",
    "output": "def quicksort(arr):\n    if len(arr) <= 1:..."
  }
]
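
A minimal sketch of turning a file in this format into the train_dataset and eval_dataset used by the Trainer above (the file name data.json and the 10% eval split are assumptions):

from datasets import load_dataset

# Load instruction data saved as data.json in the format shown above
raw = load_dataset("json", data_files="data.json", split="train")
splits = raw.train_test_split(test_size=0.1, seed=42)

# Tokenize with the prepare_dataset function defined in the LoRA section
train_dataset = splits["train"].map(prepare_dataset, batched=True, remove_columns=splits["train"].column_names)
eval_dataset = splits["test"].map(prepare_dataset, batched=True, remove_columns=splits["test"].column_names)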

Data quality requirements

  • Accuracy: Ensure labels are correct
  • Diversity: Cover scenarios and edge cases
  • Consistency: Maintain uniform formatting and style
  • Representativeness: Reflect real usage scenarios
  • Right scale: Typically 1k–10k high-quality samples (a quick check like the sketch below helps verify these points)
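
A quick sanity check over an instruction dataset (field names follow the format above; this is an illustrative sketch, not a substitute for manual review):

from collections import Counter

def dataset_report(data):
    """Quick sanity check: scale, duplication, and output-length spread."""
    outputs = [item["output"] for item in data]
    lengths = sorted(len(o) for o in outputs)
    duplicate_outputs = sum(c - 1 for c in Counter(outputs).values() if c > 1)
    empty_instructions = sum(1 for item in data if not item["instruction"].strip())

    print(f"samples: {len(data)}")
    print(f"duplicate outputs: {duplicate_outputs}")
    print(f"empty instructions: {empty_instructions}")
    print(f"output length min/median/max: {lengths[0]} / {lengths[len(lengths) // 2]} / {lengths[-1]}")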

Data cleaning flow

def clean_dataset(data):
    cleaned = []
    seen_texts = set()  # track seen (instruction + output) strings for deduplication
    for item in data:
        # Remove empty items
        if not item['instruction'] or not item['output']:
            continue
        
        # Length filter
        if len(item['output']) < 10 or len(item['output']) > 2000:
            continue
        
        # Deduplicate
        text = item['instruction'] + item['output']
        if text in seen_texts:
            continue
        seen_texts.add(text)
        
        # Normalize format
        item['instruction'] = item['instruction'].strip()
        item['output'] = item['output'].strip()
        
        cleaned.append(item)
    
    return cleaned
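
Hypothetical usage of the cleaning step, reading raw examples from a JSON file and writing the cleaned set back out (the file names are placeholders):

import json

# Clean the raw instruction data before building the training dataset
with open("raw_data.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

cleaned_data = clean_dataset(raw_data)
print(f"kept {len(cleaned_data)} of {len(raw_data)} samples")

with open("data.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, ensure_ascii=False, indent=2)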

Evaluating Fine-tuned Models

How to evaluate fine-tuned model performance

Automated metrics

  • Perplexity: Language model uncertainty
  • BLEU: Translation quality
  • ROUGE: Summarization quality
  • Accuracy: Classification accuracy
  • F1 Score: Harmonic mean of precision and recall (a scoring sketch follows this list)
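
A minimal sketch of computing two of these metrics, reusing the trainer from the LoRA section for perplexity and the evaluate library for ROUGE (the example predictions and references are placeholders):

import math
import evaluate

# Perplexity is the exponential of the average cross-entropy loss on held-out data
eval_metrics = trainer.evaluate()
perplexity = math.exp(eval_metrics["eval_loss"])
print(f"perplexity: {perplexity:.2f}")

# ROUGE for summarization-style outputs (predictions/references are lists of strings)
rouge = evaluate.load("rouge")
scores = rouge.compute(
    predictions=["Key points: pretraining, data scale, applications"],
    references=["Key points: 1. Pretraining 2. Data scale 3. Application capabilities"],
)
print(scores)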

Human evaluation

  • Relevance: Is the answer on-topic?
  • Correctness: Is the information accurate?
  • Fluency: Is the text natural?
  • Completeness: Is the answer comprehensive?
  • Consistency: Is style consistent?

A/B Test Example

# Compare base vs fine-tuned models
# generate() and calculate_score() are assumed to be user-defined helpers
import numpy as np

def compare_models(base_model, finetuned_model, test_cases):
    results = []
    
    for case in test_cases:
        base_output = generate(base_model, case['input'])
        ft_output = generate(finetuned_model, case['input'])
        
        # Auto-scoring
        base_score = calculate_score(base_output, case['expected'])
        ft_score = calculate_score(ft_output, case['expected'])
        
        results.append({
            'input': case['input'],
            'base_output': base_output,
            'ft_output': ft_output,
            'base_score': base_score,
            'ft_score': ft_score,
            'improvement': ft_score - base_score
        })
    
    # Aggregate stats
    avg_improvement = np.mean([r['improvement'] for r in results])
    win_rate = sum(1 for r in results if r['ft_score'] > r['base_score']) / len(results)
    
    print(f"Average improvement: {avg_improvement:.2%}")
    print(f"Win rate: {win_rate:.2%}")

Serving and Deployment

Bring fine-tuned models to production

# Deploy a fine-tuned model with vLLM
from vllm import LLM, SamplingParams

# Load merged model
llm = LLM(
    model="./merged_model",
    tensor_parallel_size=1,
    dtype="half",  # FP16 inference
    max_model_len=2048
)

# API service
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class GenerationRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9

@app.post("/generate")
async def generate(request: GenerationRequest):
    sampling_params = SamplingParams(
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens
    )
    
    outputs = llm.generate([request.prompt], sampling_params)
    
    return {
        "generated_text": outputs[0].outputs[0].text,
        "usage": {
            "prompt_tokens": len(outputs[0].prompt_token_ids),
            "completion_tokens": len(outputs[0].outputs[0].token_ids)
        }
    }

# Run the service
# uvicorn main:app --host 0.0.0.0 --port 8000
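
Once the service is running, it can be called like any HTTP API. A small client sketch (the prompt text is a placeholder and follows the instruction format used during fine-tuning):

# Example client call to the /generate endpoint
import requests

response = requests.post(
    "http://localhost:8000/generate",
    json={
        "prompt": "### Instruction:\nSummarize the key points of the following text\n\n### Response:\n",
        "max_tokens": 128,
        "temperature": 0.7
    }
)
print(response.json()["generated_text"])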

# Docker deployment
dockerfile = """
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04

RUN pip install vllm fastapi uvicorn

COPY ./merged_model /model
COPY ./main.py /app/main.py

WORKDIR /app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
"""

Fine-tuning Cost Estimates

Cost by Model Size

Model size | GPU requirement | Training time | Cloud cost
7B (LoRA) | 1 × A100 40GB | 2–4 hours | $10–20
13B (QLoRA) | 1 × A100 40GB | 4–8 hours | $20–40
30B (LoRA) | 2 × A100 80GB | 8–16 hours | $80–160
70B (QLoRA) | 4 × A100 80GB | 24–48 hours | $500–1000

* Estimates based on roughly 10k training samples and 3 epochs
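
For a rough budget, the figures above follow from a simple formula (a sketch; the ~$5 per GPU-hour rate is an assumption backed out of the table, and real cloud prices vary by provider):

# Back-of-the-envelope cloud cost estimate
def estimate_cost(num_gpus, hours, hourly_rate_per_gpu=5.0):
    return num_gpus * hours * hourly_rate_per_gpu

print(estimate_cost(1, 2), "-", estimate_cost(1, 4))    # 7B LoRA: ~$10-20
print(estimate_cost(4, 24), "-", estimate_cost(4, 48))  # 70B QLoRA: ~$480-960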

Start Fine-tuning Your Own Model

With fine-tuning, you can make an LLM better understand and handle your domain-specific tasks. Combined with LLM API, you can quickly deploy and start using your fine-tuned model.
