# Lab 11: Agent Fine-tuning
⏱️ Estimated completion time: 55 minutes
## Overview
This lab walks through an agent fine-tuning workflow end to end: collecting and filtering training data, simulating a training run, and measuring the before/after performance difference with a simple evaluation harness.
## Learning Objectives
- Understanding agent fine-tuning workflows
- Collecting and preparing training data
- Evaluating fine-tuned agent performance
## Key Concepts

### Fine-tuning Process
- **Data Collection**: Gathering high-quality training examples (see the serialization sketch after this list)
- **Model Training**: Fine-tuning pre-trained models on the collected data
- **Evaluation**: Measuring performance improvements against a baseline
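In practice, the prepared examples are usually serialized to a JSONL file, one record per line, before being handed to a training job. Below is a minimal sketch of that step; the field names mirror the `prepare_training_data` output in the lab code, while the file name and exact record shape are illustrative assumptions (each fine-tuning service defines its own schema).

```python
# Minimal sketch: dump prepared examples to JSONL, one JSON record per line.
# Field names follow this lab's prepare_training_data(); real fine-tuning
# APIs each expect their own schema, so adapt the record shape accordingly.
import json

def write_jsonl(training_data: dict, path: str = "train.jsonl") -> None:
    with open(path, "w", encoding="utf-8") as f:
        for example in training_data["examples"]:
            record = {"input": example["input"], "output": example["output"]}
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
```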
## Lab Code
```python
#!/usr/bin/env python3
"""
Agent Fine-tuning Demo
Demonstrate fine-tuning workflow for agent improvement.
"""

import random
from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass
class TrainingExample:
    """Training example for agent fine-tuning."""
    input_text: str
    expected_output: str
    task_type: str
    quality_score: float


@dataclass
class AgentResponse:
    """Agent response with metadata."""
    text: str
    confidence: float
    reasoning: str


class MockAgent:
    """Mock agent for demonstration purposes."""

    def __init__(self, version="baseline"):
        self.version = version
        self.performance_boost = 0.1 if version == "fine-tuned" else 0.0

    def generate_response(self, input_text: str) -> AgentResponse:
        """Generate response to input text."""
        # Mock response generation
        base_confidence = 0.7 + self.performance_boost
        if "weather" in input_text.lower():
            response = "The weather is sunny and pleasant today."
            reasoning = "Identified weather query and provided current conditions."
        elif "travel" in input_text.lower():
            response = "I can help you plan your travel itinerary."
            reasoning = "Detected travel-related request and offered assistance."
        else:
            response = "I understand your request and will help accordingly."
            reasoning = "General query handling with helpful response."
        return AgentResponse(
            text=response,
            confidence=min(1.0, base_confidence + random.uniform(-0.1, 0.1)),
            reasoning=reasoning,
        )


class DataCollector:
    """Collect and manage training data for fine-tuning."""

    def __init__(self):
        self.examples: List[TrainingExample] = []

    def collect_interaction_data(self, user_input: str, agent_output: str,
                                 human_rating: float, task_type: str) -> None:
        """Collect data from human-agent interactions."""
        example = TrainingExample(
            input_text=user_input,
            expected_output=agent_output,
            task_type=task_type,
            quality_score=human_rating,
        )
        self.examples.append(example)

    def generate_synthetic_data(self) -> None:
        """Generate synthetic training examples."""
        synthetic_examples = [
            ("What's the weather like?", "The weather is sunny with 22°C temperature.", "weather", 0.9),
            ("Plan my trip to Paris", "I'll help you plan a wonderful trip to Paris with hotels and attractions.", "travel", 0.85),
            ("Book a flight", "I can assist you with flight booking options and recommendations.", "travel", 0.8),
            ("Tell me about restaurants", "Here are some excellent restaurant recommendations in your area.", "general", 0.75),
            ("Weather forecast", "The forecast shows sunny weather for the next few days.", "weather", 0.9),
        ]
        for input_text, output, task_type, quality in synthetic_examples:
            self.collect_interaction_data(input_text, output, quality, task_type)

    def filter_high_quality(self, min_score: float = 0.8) -> List[TrainingExample]:
        """Filter examples by quality score."""
        return [ex for ex in self.examples if ex.quality_score >= min_score]

    def prepare_training_data(self) -> Dict[str, Any]:
        """Prepare data for fine-tuning format."""
        high_quality = self.filter_high_quality()
        training_data = {
            "examples": [],
            "metadata": {
                "total_examples": len(high_quality),
                "task_distribution": {},
            },
        }
        for example in high_quality:
            training_data["examples"].append({
                "input": example.input_text,
                "output": example.expected_output,
                "task_type": example.task_type,
            })
            # Track task distribution
            task = example.task_type
            training_data["metadata"]["task_distribution"][task] = \
                training_data["metadata"]["task_distribution"].get(task, 0) + 1
        return training_data


class AgentEvaluator:
    """Evaluate agent performance before and after fine-tuning."""

    def __init__(self):
        self.test_cases = [
            {"input": "What's tomorrow's weather?", "expected_type": "weather"},
            {"input": "Help me book a vacation", "expected_type": "travel"},
            {"input": "Find good restaurants nearby", "expected_type": "general"},
            {"input": "Weather update please", "expected_type": "weather"},
            {"input": "Plan my business trip", "expected_type": "travel"},
        ]

    def evaluate_agent(self, agent: MockAgent) -> Dict[str, float]:
        """Evaluate agent performance."""
        total_confidence = 0.0
        task_accuracy = 0.0
        response_quality = 0.0
        for test_case in self.test_cases:
            response = agent.generate_response(test_case["input"])
            # Aggregate confidence scores
            total_confidence += response.confidence
            # Mock task accuracy (would be more sophisticated in practice)
            expected_keywords = {
                "weather": ["weather", "temperature", "sunny", "forecast"],
                "travel": ["travel", "trip", "plan", "booking"],
                "general": ["help", "assist", "recommend"],
            }
            expected_type = test_case["expected_type"]
            keywords = expected_keywords.get(expected_type, [])
            if any(keyword in response.text.lower() for keyword in keywords):
                task_accuracy += 1.0
            # Mock response quality score
            response_quality += min(1.0, response.confidence + 0.1)
        num_tests = len(self.test_cases)
        return {
            "average_confidence": total_confidence / num_tests,
            "task_accuracy": task_accuracy / num_tests,
            "response_quality": response_quality / num_tests,
            "overall_score": (total_confidence + task_accuracy + response_quality) / (3 * num_tests),
        }


def simulate_fine_tuning(training_data: Dict) -> MockAgent:
    """Simulate the fine-tuning process."""
    print(f"Fine-tuning with {training_data['metadata']['total_examples']} examples...")
    print(f"Task distribution: {training_data['metadata']['task_distribution']}")
    # In practice, this would involve:
    # 1. Loading pre-trained model
    # 2. Preparing data in correct format
    # 3. Training with appropriate hyperparameters
    # 4. Validation and early stopping
    # 5. Model checkpointing
    print("Training process:")
    print("- Epoch 1: Loss = 2.45")
    print("- Epoch 2: Loss = 1.89")
    print("- Epoch 3: Loss = 1.34")
    print("- Epoch 4: Loss = 1.12")
    print("- Epoch 5: Loss = 0.95")
    print("Training completed!")
    # Return "fine-tuned" agent
    return MockAgent(version="fine-tuned")


def main():
    print("=== Agent Fine-tuning Demo ===")

    # Step 1: Data Collection
    print("\n1. Collecting Training Data")
    collector = DataCollector()
    collector.generate_synthetic_data()
    print(f"Collected {len(collector.examples)} training examples")

    # Step 2: Data Preparation
    print("\n2. Preparing Training Data")
    training_data = collector.prepare_training_data()
    print(f"High-quality examples: {training_data['metadata']['total_examples']}")
    print(f"Task distribution: {training_data['metadata']['task_distribution']}")

    # Step 3: Baseline Evaluation
    print("\n3. Evaluating Baseline Agent")
    baseline_agent = MockAgent(version="baseline")
    evaluator = AgentEvaluator()
    baseline_metrics = evaluator.evaluate_agent(baseline_agent)
    print("Baseline Performance:")
    for metric, value in baseline_metrics.items():
        print(f"  {metric}: {value:.3f}")

    # Step 4: Fine-tuning
    print("\n4. Fine-tuning Agent")
    fine_tuned_agent = simulate_fine_tuning(training_data)

    # Step 5: Post-training Evaluation
    print("\n5. Evaluating Fine-tuned Agent")
    fine_tuned_metrics = evaluator.evaluate_agent(fine_tuned_agent)
    print("Fine-tuned Performance:")
    for metric, value in fine_tuned_metrics.items():
        print(f"  {metric}: {value:.3f}")

    # Step 6: Performance Comparison
    print("\n6. Performance Improvement")
    for metric in baseline_metrics:
        improvement = fine_tuned_metrics[metric] - baseline_metrics[metric]
        print(f"  {metric}: {improvement:+.3f} ({improvement / baseline_metrics[metric] * 100:+.1f}%)")

    # Step 7: Example Interactions
    print("\n7. Example Interactions")
    test_inputs = ["What's the weather like?", "Help me plan a trip"]
    for input_text in test_inputs:
        print(f"\nInput: {input_text}")
        baseline_response = baseline_agent.generate_response(input_text)
        fine_tuned_response = fine_tuned_agent.generate_response(input_text)
        print(f"Baseline: {baseline_response.text} (confidence: {baseline_response.confidence:.2f})")
        print(f"Fine-tuned: {fine_tuned_response.text} (confidence: {fine_tuned_response.confidence:.2f})")


if __name__ == "__main__":
    main()
```
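The `simulate_fine_tuning` function above only prints placeholder loss values; the comment inside it lists the real steps. As a point of reference, here is a minimal sketch of those steps using the Hugging Face `transformers` and `datasets` libraries. The model name, prompt format, hyperparameters, and output paths are illustrative assumptions, not part of this lab.

```python
# Minimal sketch, assuming `transformers` and `datasets` are installed.
# Model name, hyperparameters, and paths below are illustrative choices.
from datasets import Dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments)

def fine_tune(training_data: dict, model_name: str = "distilgpt2") -> None:
    # 1. Load a pre-trained model and its tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 family has no pad token
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # 2. Put the lab's prepared examples into prompt/response text and tokenize.
    texts = [f"User: {ex['input']}\nAgent: {ex['output']}"
             for ex in training_data["examples"]]
    dataset = Dataset.from_dict({"text": texts}).map(
        lambda batch: tokenizer(batch["text"], truncation=True, max_length=256),
        batched=True, remove_columns=["text"])

    # 3-5. Train with basic hyperparameters and save a final checkpoint; a real
    # run would also pass an eval set for validation and early stopping.
    args = TrainingArguments(output_dir="checkpoints", num_train_epochs=5,
                             per_device_train_batch_size=2, learning_rate=5e-5)
    trainer = Trainer(model=model, args=args, train_dataset=dataset,
                      data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False))
    trainer.train()
    trainer.save_model("checkpoints/final")
```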
## How to Run

- Save the code as `11_agent_finetuning.py`
- Run: `python 11_agent_finetuning.py`
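If everything is wired up, the opening of the output should match the lines below; the numeric metrics further down differ from run to run because each response's confidence includes random jitter.

```
=== Agent Fine-tuning Demo ===

1. Collecting Training Data
Collected 5 training examples

2. Preparing Training Data
High-quality examples: 4
Task distribution: {'weather': 2, 'travel': 2}
```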
## Key Features

- **Data Collection**: Systematic gathering of training examples
- **Quality Filtering**: Focus on high-quality training data (see the sketch below)
- **Performance Metrics**: Comprehensive evaluation framework
- **Comparison Analysis**: Before/after performance tracking
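The quality threshold in `filter_high_quality` is the main lever behind these features. A quick sketch, reusing the lab's `DataCollector`, of how different `min_score` cutoffs trade data volume against data quality:

```python
# Sketch: vary the min_score cutoff and see how many examples survive.
# Uses the DataCollector class from the lab code above.
collector = DataCollector()
collector.generate_synthetic_data()
for threshold in (0.7, 0.8, 0.9):
    kept = collector.filter_high_quality(min_score=threshold)
    print(f"min_score={threshold}: kept {len(kept)}/{len(collector.examples)} examples")
```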