Basic Training Example¶
This comprehensive example demonstrates how to train a language model from scratch using LLMBuilder. We'll cover data preparation, tokenizer training, model training, and text generation.
🎯 Overview¶
In this example, we'll:
- Prepare training data from various document formats
- Train a tokenizer on our text corpus
- Configure and train a language model
- Generate text with the trained model
- Evaluate performance and iterate
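At a high level, those steps boil down to a handful of LLMBuilder calls. The condensed sketch below previews the flow; fields omitted from the config objects rely on library defaults (an assumption), while the complete script later in this page sets everything explicitly.
import llmbuilder as lb
from llmbuilder.config import Config, ModelConfig, TrainingConfig, TokenizerConfig
from llmbuilder.data import TextDataset
from llmbuilder.tokenizer import TokenizerTrainer

# 1. Train a tokenizer on the cleaned corpus
TokenizerTrainer(config=TokenizerConfig(vocab_size=16000)).train(
    input_file="data/processed/training_data.txt",
    output_dir="output/tokenizer",
    model_prefix="tokenizer",
)

# 2. Build and train the model
config = Config(
    model=ModelConfig(vocab_size=16000, max_seq_length=1024),
    training=TrainingConfig(batch_size=8, num_epochs=10),
)
model = lb.build_model(config.model)
dataset = TextDataset(data_path="data/processed/training_data.txt", block_size=1024)
results = lb.train_model(
    model=model, dataset=dataset, config=config.training, checkpoint_dir="output/model"
)

# 3. Generate text from the trained checkpoint
print(lb.generate_text(
    model_path=results.model_path,
    tokenizer_path="output/tokenizer",
    prompt="Artificial intelligence is",
    max_new_tokens=50,
))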
📁 Project Structure¶
basic_training_example/
├── data/
│ ├── raw/ # Raw documents (PDF, DOCX, TXT)
│ └── processed/ # Cleaned text files
├── config/
│ └── training_config.json # Training configuration
├── output/
│ ├── tokenizer/ # Trained tokenizer
│ ├── model/ # Trained model
│ └── logs/ # Training logs
└── train_model.py # Main training script
🚀 Complete Training Script¶
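Save the script below as train_model.py in the project root and run it with python train_model.py (with LLMBuilder installed in your environment). Each step lives in its own function so you can adapt the pieces individually.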
#!/usr/bin/env python3
"""
Basic Training Example for LLMBuilder
This script demonstrates a complete training pipeline:
1. Data loading and preprocessing
2. Tokenizer training
3. Model training
4. Text generation and evaluation
"""
import os
import sys
from pathlib import Path
import logging
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def main():
"""Main training pipeline."""
# Configuration
project_dir = Path(__file__).parent
raw_data_dir = project_dir / "data" / "raw"
processed_data_dir = project_dir / "data" / "processed"
output_dir = project_dir / "output"
# Create directories
for dir_path in [processed_data_dir, output_dir]:
dir_path.mkdir(parents=True, exist_ok=True)
logger.info("🚀 Starting LLMBuilder training pipeline")
# Step 1: Data Preparation
logger.info("📁 Step 1: Preparing training data")
training_data_path = prepare_training_data(raw_data_dir, processed_data_dir)
# Step 2: Tokenizer Training
logger.info("🔤 Step 2: Training tokenizer")
tokenizer_dir = train_tokenizer(training_data_path, output_dir)
# Step 3: Model Training
logger.info("🧠 Step 3: Training language model")
model_path = train_language_model(training_data_path, tokenizer_dir, output_dir)
# Step 4: Text Generation
logger.info("🎯 Step 4: Testing text generation")
test_text_generation(model_path, tokenizer_dir)
# Step 5: Evaluation
logger.info("📊 Step 5: Evaluating model performance")
evaluate_model(model_path, tokenizer_dir, training_data_path)
logger.info("✅ Training pipeline completed successfully!")
def prepare_training_data(raw_data_dir, processed_data_dir):
"""Load and preprocess training data from various formats."""
from llmbuilder.data import DataLoader, TextCleaner
# Initialize data loader
loader = DataLoader(
min_length=50, # Filter short texts
clean_text=True, # Apply basic cleaning
remove_duplicates=True # Remove duplicate content
)
# Load all supported files
texts = []
supported_extensions = ['.txt', '.pdf', '.docx', '.md', '.html']
logger.info(f"Loading documents from {raw_data_dir}")
if raw_data_dir.exists():
for file_path in raw_data_dir.rglob("*"):
if file_path.suffix.lower() in supported_extensions:
try:
text = loader.load_file(file_path)
if text:
texts.append(text)
logger.info(f" ✅ Loaded {file_path.name}: {len(text):,} characters")
except Exception as e:
logger.warning(f" ❌ Failed to load {file_path.name}: {e}")
# If no files found, create sample data
if not texts:
logger.info("No data files found, creating sample training data")
sample_text = create_sample_data()
texts = [sample_text]
# Combine and clean texts
combined_text = "\n\n".join(texts)
# Advanced text cleaning
cleaner = TextCleaner(
normalize_whitespace=True,
remove_urls=True,
remove_emails=True,
min_sentence_length=20,
remove_duplicates=True,
language_filter="en" # Keep only English text
)
cleaned_text = cleaner.clean(combined_text)
stats = cleaner.get_stats()
logger.info(f"Text cleaning results:")
logger.info(f" Original: {stats.original_length:,} characters")
logger.info(f" Cleaned: {stats.cleaned_length:,} characters")
logger.info(f" Removed: {stats.removal_percentage:.1f}%")
# Save processed data
training_data_path = processed_data_dir / "training_data.txt"
with open(training_data_path, 'w', encoding='utf-8') as f:
f.write(cleaned_text)
logger.info(f"Training data saved to {training_data_path}")
return training_data_path
def train_tokenizer(training_data_path, output_dir):
"""Train a BPE tokenizer on the training data."""
from llmbuilder.tokenizer import TokenizerTrainer
from llmbuilder.config import TokenizerConfig
tokenizer_dir = output_dir / "tokenizer"
tokenizer_dir.mkdir(exist_ok=True)
# Configure tokenizer
config = TokenizerConfig(
vocab_size=16000, # Vocabulary size
model_type="bpe", # Byte-Pair Encoding
character_coverage=1.0, # Cover all characters
max_sentence_length=4096, # Maximum sentence length
special_tokens=[ # Special tokens
"<pad>", "<unk>", "<s>", "</s>", "<mask>"
]
)
# Train tokenizer
trainer = TokenizerTrainer(config=config)
results = trainer.train(
input_file=str(training_data_path),
output_dir=str(tokenizer_dir),
model_prefix="tokenizer"
)
logger.info(f"Tokenizer training completed:")
logger.info(f" Model: {results['model_file']}")
logger.info(f" Vocab: {results['vocab_file']}")
logger.info(f" Training time: {results['training_time']:.1f}s")
# Test tokenizer
from llmbuilder.tokenizer import Tokenizer
tokenizer = Tokenizer.from_pretrained(str(tokenizer_dir))
test_text = "Hello, world! This is a test of the tokenizer."
tokens = tokenizer.encode(test_text)
decoded = tokenizer.decode(tokens)
logger.info(f"Tokenizer test:")
logger.info(f" Original: {test_text}")
logger.info(f" Tokens: {tokens}")
logger.info(f" Decoded: {decoded}")
logger.info(f" Perfect reconstruction: {test_text == decoded}")
return tokenizer_dir
def train_language_model(training_data_path, tokenizer_dir, output_dir):
"""Train the language model."""
import llmbuilder as lb
from llmbuilder.config import Config, ModelConfig, TrainingConfig
from llmbuilder.data import TextDataset
model_dir = output_dir / "model"
model_dir.mkdir(exist_ok=True)
# Create configuration
config = Config(
model=ModelConfig(
vocab_size=16000, # Must match tokenizer
num_layers=8, # Number of transformer layers
num_heads=8, # Number of attention heads
embedding_dim=512, # Embedding dimension
max_seq_length=1024, # Maximum sequence length
dropout=0.1, # Dropout rate
model_type="gpt" # Model architecture
),
training=TrainingConfig(
batch_size=8, # Batch size (adjust for your hardware)
num_epochs=10, # Number of training epochs
learning_rate=3e-4, # Learning rate
warmup_steps=1000, # Warmup steps
weight_decay=0.01, # Weight decay
max_grad_norm=1.0, # Gradient clipping
save_every=1000, # Save checkpoint every N steps
eval_every=500, # Evaluate every N steps
log_every=100 # Log every N steps
)
)
# Save configuration
config_path = model_dir / "config.json"
config.save(str(config_path))
logger.info(f"Configuration saved to {config_path}")
# Build model
model = lb.build_model(config.model)
num_params = sum(p.numel() for p in model.parameters())
logger.info(f"Model built with {num_params:,} parameters")
# Prepare dataset
dataset = TextDataset(
data_path=str(training_data_path),
block_size=config.model.max_seq_length,
stride=config.model.max_seq_length // 2, # 50% overlap
cache_in_memory=True
)
logger.info(f"Dataset prepared: {len(dataset):,} samples")
# Train model
results = lb.train_model(
model=model,
dataset=dataset,
config=config.training,
checkpoint_dir=str(model_dir)
)
logger.info(f"Training completed:")
logger.info(f" Final loss: {results.final_loss:.4f}")
logger.info(f" Training time: {results.training_time}")
logger.info(f" Model saved to: {results.model_path}")
return results.model_path
def test_text_generation(model_path, tokenizer_dir):
"""Test text generation with the trained model."""
import llmbuilder as lb
test_prompts = [
"Artificial intelligence is",
"The future of technology",
"Machine learning can help us",
"In the world of programming",
"The benefits of renewable energy"
]
logger.info("Testing text generation:")
for prompt in test_prompts:
try:
generated_text = lb.generate_text(
model_path=model_path,
tokenizer_path=str(tokenizer_dir),
prompt=prompt,
max_new_tokens=50,
temperature=0.8,
top_k=50,
top_p=0.9
)
logger.info(f" Prompt: {prompt}")
logger.info(f" Generated: {generated_text}")
logger.info("")
except Exception as e:
logger.error(f" Generation failed for '{prompt}': {e}")
def evaluate_model(model_path, tokenizer_dir, training_data_path):
"""Evaluate model performance."""
from llmbuilder.model import load_model
from llmbuilder.tokenizer import Tokenizer
from llmbuilder.data import TextDataset
import torch
# Load model and tokenizer
model = load_model(model_path)
tokenizer = Tokenizer.from_pretrained(str(tokenizer_dir))
# Create evaluation dataset (small sample)
eval_dataset = TextDataset(
data_path=str(training_data_path),
block_size=512,
stride=256,
max_samples=100 # Small sample for quick evaluation
)
# Calculate perplexity
model.eval()
total_loss = 0
total_tokens = 0
with torch.no_grad():
for i, batch in enumerate(eval_dataset):
if i >= 10: # Limit evaluation for speed
break
            input_ids = torch.as_tensor(batch, dtype=torch.long).unsqueeze(0)  # add batch dimension; works for lists or tensors
# Forward pass
outputs = model(input_ids[:, :-1])
logits = outputs.logits if hasattr(outputs, 'logits') else outputs
# Calculate loss
targets = input_ids[:, 1:]
loss = torch.nn.functional.cross_entropy(
logits.view(-1, logits.size(-1)),
targets.view(-1),
ignore_index=tokenizer.pad_token_id
)
total_loss += loss.item()
total_tokens += targets.numel()
avg_loss = total_loss / min(10, len(eval_dataset))
perplexity = torch.exp(torch.tensor(avg_loss)).item()
logger.info(f"Model evaluation:")
logger.info(f" Average loss: {avg_loss:.4f}")
logger.info(f" Perplexity: {perplexity:.2f}")
logger.info(f" Model parameters: {sum(p.numel() for p in model.parameters()):,}")
def create_sample_data():
"""Create sample training data if no files are found."""
return """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.
Machine learning (ML) is a type of artificial intelligence that allows software applications to become more accurate at predicting outcomes without being explicitly programmed to do so. Machine learning algorithms use historical data as input to predict new output values.
Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised.
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.
Computer vision is an interdisciplinary scientific field that deals with how computers can gain high-level understanding from digital images or videos. From the perspective of engineering, it seeks to understand and automate tasks that the human visual system can do.
The future of artificial intelligence holds great promise for solving complex problems in healthcare, transportation, education, and many other fields. As AI systems become more sophisticated, they will continue to transform how we work, learn, and interact with technology.
"""
if __name__ == "__main__":
main()
📊 Expected Output¶
When you run this script, you should see output similar to:
INFO:__main__:🚀 Starting LLMBuilder training pipeline
INFO:__main__:📁 Step 1: Preparing training data
INFO:__main__:No data files found, creating sample training data
INFO:__main__:Text cleaning results:
INFO:__main__: Original: 1,234 characters
INFO:__main__: Cleaned: 1,180 characters
INFO:__main__: Removed: 4.4%
INFO:__main__:Training data saved to basic_training_example/data/processed/training_data.txt
INFO:__main__:🔤 Step 2: Training tokenizer
INFO:__main__:Tokenizer training completed:
INFO:__main__: Model: basic_training_example/output/tokenizer/tokenizer.model
INFO:__main__: Vocab: basic_training_example/output/tokenizer/tokenizer.vocab
INFO:__main__: Training time: 5.2s
INFO:__main__:Tokenizer test:
INFO:__main__: Original: Hello, world! This is a test of the tokenizer.
INFO:__main__: Tokens: [15496, 995, 0, 1188, 374, 264, 1296, 315, 279, 4037, 3213, 13]
INFO:__main__: Decoded: Hello, world! This is a test of the tokenizer.
INFO:__main__: Perfect reconstruction: True
INFO:__main__:🧠 Step 3: Training language model
INFO:__main__:Configuration saved to basic_training_example/output/model/config.json
INFO:__main__:Model built with 42,123,456 parameters
INFO:__main__:Dataset prepared: 156 samples
INFO:__main__:Training completed:
INFO:__main__: Final loss: 2.45
INFO:__main__: Training time: 0:15:23
INFO:__main__: Model saved to: basic_training_example/output/model/model.pt
INFO:__main__:🎯 Step 4: Testing text generation
INFO:__main__:Testing text generation:
INFO:__main__: Prompt: Artificial intelligence is
INFO:__main__: Generated: Artificial intelligence is a rapidly evolving field that encompasses machine learning, deep learning, and neural networks...
INFO:__main__:📊 Step 5: Evaluating model performance
INFO:__main__:Model evaluation:
INFO:__main__: Average loss: 2.52
INFO:__main__: Perplexity: 12.4
INFO:__main__: Model parameters: 42,123,456
INFO:__main__:✅ Training pipeline completed successfully!
🎯 Customization Options¶
1. Adjust Model Size¶
# Smaller model (faster training, less memory)
model_config = ModelConfig(
vocab_size=8000,
num_layers=4,
num_heads=4,
embedding_dim=256,
max_seq_length=512
)
# Larger model (better quality, more resources)
model_config = ModelConfig(
vocab_size=32000,
num_layers=16,
num_heads=16,
embedding_dim=1024,
max_seq_length=2048
)
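Before committing to a configuration, it can help to estimate how many parameters it implies. The sketch below uses a standard back-of-the-envelope formula for GPT-style decoders (4x MLP expansion, untied output projection); the exact count depends on LLMBuilder's implementation, so treat it as an approximation. With these assumptions the default configuration lands near the ~42M parameters reported in the expected output above.
def estimate_params(vocab_size, num_layers, embedding_dim, max_seq_length):
    """Rough parameter count for a GPT-style decoder (approximation only)."""
    embeddings = vocab_size * embedding_dim        # token embedding table
    positions = max_seq_length * embedding_dim     # learned positional embeddings
    per_layer = 12 * embedding_dim ** 2            # ~4*d^2 attention + ~8*d^2 MLP (4x expansion)
    lm_head = vocab_size * embedding_dim           # output projection, assuming untied weights
    return embeddings + positions + num_layers * per_layer + lm_head

print(f"{estimate_params(16000, 8, 512, 1024):,}")    # ~42M, close to the logged default model
print(f"{estimate_params(8000, 4, 256, 512):,}")      # ~7M  (smaller config)
print(f"{estimate_params(32000, 16, 1024, 2048):,}")  # ~269M (larger config)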
2. Modify Training Parameters¶
# Fast training (for testing)
training_config = TrainingConfig(
batch_size=16,
num_epochs=3,
learning_rate=1e-3,
save_every=100
)
# High-quality training (for production)
training_config = TrainingConfig(
batch_size=8,
num_epochs=50,
learning_rate=1e-4,
warmup_steps=2000,
weight_decay=0.01
)
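When you change epochs or batch size, also check that warmup_steps stays well below the total number of optimizer steps. The sketch below assumes one optimizer step per batch (no gradient accumulation) and uses the 156-sample dataset size from the expected output above for illustration.
def total_steps(num_samples, batch_size, num_epochs):
    """Total optimizer steps, assuming one step per batch and no gradient accumulation."""
    steps_per_epoch = (num_samples + batch_size - 1) // batch_size  # ceiling division
    return steps_per_epoch * num_epochs

steps = total_steps(num_samples=156, batch_size=8, num_epochs=50)
print(steps)  # 1000, so warmup_steps=2000 would exceed the entire run on a dataset this small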
3. Different Data Sources¶
# Load from specific file types
loader = DataLoader(
supported_formats=['.txt', '.md'], # Only text and markdown
min_length=100, # Longer minimum length
max_length=10000, # Maximum length limit
clean_text=True
)
# Custom text cleaning
cleaner = TextCleaner(
normalize_whitespace=True,
remove_urls=True,
remove_emails=True,
remove_phone_numbers=True,
min_sentence_length=30,
language_filter="en",
custom_filters=[
lambda text: text.replace("specific_pattern", "replacement")
]
)
🚨 Troubleshooting¶
Common Issues¶
Out of Memory¶
# Reduce batch size
training_config.batch_size = 4
# Enable gradient checkpointing
model_config.gradient_checkpointing = True
# Use gradient accumulation (effective batch size = batch_size * gradient_accumulation_steps)
training_config.gradient_accumulation_steps = 4
Poor Generation Quality¶
# Train for more epochs
training_config.num_epochs = 20
# Use more training data
# Add more text files to data/raw/
# Adjust generation parameters
generated_text = lb.generate_text(
model_path=model_path,
tokenizer_path=str(tokenizer_dir),
prompt=prompt,
max_new_tokens=100,
temperature=0.7, # Lower temperature
top_k=40, # More focused sampling
repetition_penalty=1.1 # Reduce repetition
)
Slow Training¶
# Use GPU if available (make sure the selected device is actually passed to your
# training setup; the exact configuration option depends on the LLMBuilder version)
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
# Reduce sequence length
model_config.max_seq_length = 512
# Increase batch size (if memory allows)
training_config.batch_size = 16
📚 Next Steps¶
After running this basic example:
- Experiment with different model sizes and training parameters
- Add your own training data to the data/raw/ directory
- Try fine-tuning the trained model on specific tasks (see the sketch after this list)
- Export the model for deployment using the export functionality
- Implement evaluation metrics specific to your use case
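For the fine-tuning bullet above, a simple starting point is to continue training the saved checkpoint on task-specific text with a lower learning rate, reusing only the functions already shown in this example. The task data path below is hypothetical, and LLMBuilder may offer dedicated fine-tuning or export helpers; check its documentation before relying on this sketch.
import llmbuilder as lb
from llmbuilder.config import TrainingConfig
from llmbuilder.data import TextDataset
from llmbuilder.model import load_model

# Continue training the saved checkpoint on new, task-specific text
model = load_model("output/model/model.pt")      # path produced by train_model.py
task_dataset = TextDataset(
    data_path="data/processed/task_data.txt",    # hypothetical task-specific file
    block_size=1024,
    stride=512,
)
finetune_config = TrainingConfig(
    batch_size=4,
    num_epochs=3,
    learning_rate=5e-5,                          # lower than the pretraining rate
    warmup_steps=100,
)
results = lb.train_model(
    model=model,
    dataset=task_dataset,
    config=finetune_config,
    checkpoint_dir="output/model_finetuned",
)
print(results.model_path)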
Training Tips
- Start with small models and datasets to verify everything works
- Monitor training loss to ensure the model is learning
- Save checkpoints frequently during long training runs
- Test generation quality throughout training
- Keep track of what configurations work best for your data