
Core API

The core LLMBuilder API provides high-level functions for common tasks. These functions are designed to be simple to use while providing access to the full power of the framework.

🎯 Overview

The core API is accessible through the main llmbuilder module:

import llmbuilder as lb

# High-level functions
config = lb.load_config(preset="cpu_small")
model = lb.build_model(config.model)
text = lb.generate_text(model_path, tokenizer_path, prompt)

📋 Core Functions

Configuration Functions

llmbuilder.load_config(path=None, preset=None)

Load configuration from file or use preset.

Parameters:

| Name   | Type          | Description                                      | Default |
|--------|---------------|--------------------------------------------------|---------|
| path   | Optional[str] | Optional path to a JSON/YAML configuration file. | None    |
| preset | Optional[str] | Optional name of a built-in preset.              | None    |

Returns:

| Type | Description                                                  |
|------|--------------------------------------------------------------|
| Any  | A configuration object suitable for model/training builders. |

Source code in llmbuilder/__init__.py
def load_config(path: Optional[str] = None, preset: Optional[str] = None) -> Any:
    """Load configuration from file or use preset.

    Args:
        path: Optional path to a JSON/YAML configuration file.
        preset: Optional name of a built-in preset.

    Returns:
        A configuration object suitable for model/training builders.
    """
    from .config import load_config as _load_config

    return _load_config(path, preset)
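
For example, the same entry point covers both sources (the preset name and file path below are taken from examples elsewhere on this page):

import llmbuilder as lb

# From a built-in preset
config = lb.load_config(preset="cpu_small")

# From a JSON configuration file on disk
config = lb.load_config("config.json")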

Model Functions

llmbuilder.build_model(config)

Build a model from configuration.

Source code in llmbuilder/__init__.py
def build_model(config: Any) -> Any:
    """Build a model from configuration."""
    from .model import build_model as _build_model

    return _build_model(config)

Training Functions

llmbuilder.train_model(model, dataset, config)

Train a model with the given dataset and configuration.

Source code in llmbuilder/__init__.py
def train_model(model: Any, dataset: Any, config: Any) -> Any:
    """Train a model with the given dataset and configuration."""
    from .training import train_model as _train_model

    return _train_model(model, dataset, config)

llmbuilder.train(data_path, output_dir, config=None, clean=False)

High-level training function that handles the complete training pipeline.

Parameters:

| Name       | Type                                     | Description                                               | Default  |
|------------|------------------------------------------|-----------------------------------------------------------|----------|
| data_path  | Union[str, Path, List[Union[str, Path]]] | Path to input data file(s) or directory                   | required |
| output_dir | Union[str, Path]                         | Directory to save outputs (tokenizer, checkpoints, etc.)  | required |
| config     | Optional[Dict[str, Any]]                 | Optional configuration dictionary                         | None     |
| clean      | bool                                     | If True, clean up previous outputs before starting        | False    |

Returns:

| Name             | Type             | Description                   |
|------------------|------------------|-------------------------------|
| TrainingPipeline | TrainingPipeline | The trained pipeline instance |

Example

import llmbuilder

# Train with default settings
pipeline = llmbuilder.train(
    data_path="./my_data/",
    output_dir="./output/"
)

# Generate text after training
text = pipeline.generate("The future of AI is")

Source code in llmbuilder/__init__.py
def train(
    data_path: Union[str, Path, List[Union[str, Path]]],
    output_dir: Union[str, Path],
    config: Optional[Dict[str, Any]] = None,
    clean: bool = False,
) -> "TrainingPipeline":
    """
    High-level training function that handles the complete training pipeline.

    Args:
        data_path: Path to input data file(s) or directory
        output_dir: Directory to save outputs (tokenizer, checkpoints, etc.)
        config: Optional configuration dictionary
        clean: If True, clean up previous outputs before starting

    Returns:
        TrainingPipeline: The trained pipeline instance

    Example:
        >>> import llmbuilder
        >>>
        >>> # Train with default settings
        >>> pipeline = llmbuilder.train(
        ...     data_path="./my_data/",
        ...     output_dir="./output/"
        ... )
        >>>
        >>> # Generate text after training
        >>> text = pipeline.generate("The future of AI is")
    """
    from .pipeline import train as _train

    return _train(data_path, output_dir, config or {}, clean)

Generation Functions

llmbuilder.generate_text(model_path, tokenizer_path, prompt, **kwargs)

Generate text using a trained model.

Parameters:

| Name           | Type | Description                                                        | Default  |
|----------------|------|--------------------------------------------------------------------|----------|
| model_path     | str  | Path to trained model checkpoint                                   | required |
| tokenizer_path | str  | Path to tokenizer directory                                        | required |
| prompt         | str  | Input text prompt                                                  | required |
| **kwargs       | Any  | Additional generation parameters (temperature, top_k, top_p, etc.) | {}       |

Returns:

| Type | Description           |
|------|-----------------------|
| str  | Generated text string |

Example

import llmbuilder

text = llmbuilder.generate_text(
    model_path="./output/checkpoints/model.pt",
    tokenizer_path="./output/tokenizer/",
    prompt="The future of AI is",
    max_new_tokens=100,
    temperature=0.8
)

Source code in llmbuilder/__init__.py
def generate_text(
    model_path: str, tokenizer_path: str, prompt: str, **kwargs: Any
) -> str:
    """
    Generate text using a trained model.

    Args:
        model_path: Path to trained model checkpoint
        tokenizer_path: Path to tokenizer directory
        prompt: Input text prompt
        **kwargs: Additional generation parameters (temperature, top_k, top_p, etc.)

    Returns:
        Generated text string

    Example:
        >>> import llmbuilder
        >>>
        >>> text = llmbuilder.generate_text(
        ...     model_path="./output/checkpoints/model.pt",
        ...     tokenizer_path="./output/tokenizer/",
        ...     prompt="The future of AI is",
        ...     max_new_tokens=100,
        ...     temperature=0.8
        ... )
    """
    from .inference import generate_text as _generate_text

    return _generate_text(model_path, tokenizer_path, prompt, **kwargs)
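
Because the sampling controls are passed straight through as keyword arguments, they can be combined in a single call. A minimal sketch; the values below are illustrative, and top_k/top_p support is taken from the docstring above:

import llmbuilder as lb

# Illustrative values only; supported parameters are whatever the
# inference module accepts via **kwargs (temperature, top_k, top_p, etc.)
text = lb.generate_text(
    model_path="./output/checkpoints/model.pt",
    tokenizer_path="./output/tokenizer/",
    prompt="The future of AI is",
    max_new_tokens=100,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
)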

llmbuilder.interactive_cli(model_path, tokenizer_path, **kwargs)

Start an interactive CLI for text generation.

Parameters:

| Name           | Type | Description                          | Default  |
|----------------|------|--------------------------------------|----------|
| model_path     | str  | Path to trained model checkpoint     | required |
| tokenizer_path | str  | Path to tokenizer directory          | required |
| **kwargs       | Any  | Additional configuration parameters  | {}       |

Example

import llmbuilder

llmbuilder.interactive_cli(
    model_path="./output/checkpoints/model.pt",
    tokenizer_path="./output/tokenizer/",
    temperature=0.8
)

Source code in llmbuilder/__init__.py
def interactive_cli(model_path: str, tokenizer_path: str, **kwargs: Any) -> None:
    """
    Start an interactive CLI for text generation.

    Args:
        model_path: Path to trained model checkpoint
        tokenizer_path: Path to tokenizer directory
        **kwargs: Additional configuration parameters

    Example:
        >>> import llmbuilder
        >>>
        >>> llmbuilder.interactive_cli(
        ...     model_path="./output/checkpoints/model.pt",
        ...     tokenizer_path="./output/tokenizer/",
        ...     temperature=0.8
        ... )
    """
    from .inference import interactive_cli as _interactive_cli

    _interactive_cli(model_path, tokenizer_path, **kwargs)

Fine-tuning Functions

llmbuilder.finetune_model(model, dataset, config, **kwargs)

Fine-tune a model with the given dataset and configuration.

Source code in llmbuilder/__init__.py
def finetune_model(model: Any, dataset: Any, config: Any, **kwargs: Any) -> Any:
    """Fine-tune a model with the given dataset and configuration."""
    from .finetune import finetune_model as _finetune_model

    return _finetune_model(model, dataset, config, **kwargs)
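
No example is given for this function above. The following is a minimal sketch that mirrors the train_model pattern; reusing config.training as the fine-tuning configuration is an assumption, since the finetune module's expected config section is not documented here, and the data path is hypothetical:

import llmbuilder as lb
from llmbuilder.data import TextDataset

config = lb.load_config(preset="cpu_small")
model = lb.build_model(config.model)

# Domain-specific text to adapt the model to; path is hypothetical
dataset = TextDataset("domain_data.txt", block_size=config.model.max_seq_length)

# Assumption: finetune_model accepts the same training config section
# that train_model does
results = lb.finetune_model(model, dataset, config.training)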

🚀 Quick Examples

Basic Training Pipeline

import llmbuilder as lb

# 1. Load configuration
config = lb.load_config(preset="cpu_small")

# 2. Build model
model = lb.build_model(config.model)

# 3. Prepare dataset
from llmbuilder.data import TextDataset
dataset = TextDataset("training_data.txt", block_size=config.model.max_seq_length)

# 4. Train model
results = lb.train_model(model, dataset, config.training)

# 5. Generate text
text = lb.generate_text(
    model_path="./checkpoints/model.pt",
    tokenizer_path="./tokenizers",
    prompt="Hello world",
    max_new_tokens=50
)

High-Level Training

import llmbuilder as lb

# Complete training pipeline in one function
pipeline = lb.train(
    data_path="./my_data/",
    output_dir="./output/",
    config={
        "model": {"num_layers": 8, "embedding_dim": 512},
        "training": {"num_epochs": 10, "batch_size": 16}
    }
)

# Generate text after training
text = pipeline.generate("The future of AI is")

Interactive Generation

import llmbuilder as lb

# Start interactive text generation
lb.interactive_cli(
    model_path="./model/model.pt",
    tokenizer_path="./tokenizer/",
    temperature=0.8,
    max_new_tokens=100
)

🔧 Advanced Usage

Custom Configuration

import llmbuilder as lb
from llmbuilder.config import Config, ModelConfig, TrainingConfig

# Create custom configuration
config = Config(
    model=ModelConfig(
        vocab_size=32000,
        num_layers=24,
        num_heads=16,
        embedding_dim=1024
    ),
    training=TrainingConfig(
        batch_size=8,
        learning_rate=1e-4,
        num_epochs=20
    )
)

# Use with core functions
model = lb.build_model(config.model)

Error Handling

import llmbuilder as lb
from llmbuilder.utils import ModelError, DataError

try:
    config = lb.load_config("config.json")
    model = lb.build_model(config.model)
except ModelError as e:
    print(f"Model error: {e}")
except DataError as e:
    print(f"Data error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

📊 Return Types

Training Results

results = lb.train_model(model, dataset, config)

# Access training metrics
print(f"Final loss: {results.final_loss}")
print(f"Training time: {results.training_time}")
print(f"Best validation loss: {results.best_val_loss}")
print(f"Model path: {results.model_path}")

Generation Results

# Simple string return
text = lb.generate_text(model_path, tokenizer_path, prompt)

# With detailed results
from llmbuilder.inference import generate_with_details

result = generate_with_details(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    prompt=prompt,
    return_details=True
)

print(f"Generated text: {result.text}")
print(f"Generation time: {result.generation_time}")
print(f"Tokens per second: {result.tokens_per_second}")

🎯 Best Practices

1. Configuration Management

# Use presets as starting points
config = lb.load_config(preset="gpu_medium")

# Modify specific settings
config.model.num_layers = 16
config.training.learning_rate = 5e-5

# Save for reuse
config.save("my_config.json")

2. Resource Management

# Check available resources
from llmbuilder.utils import get_device_info

device_info = get_device_info()
if device_info.has_cuda:
    config = lb.load_config(preset="gpu_medium")
else:
    config = lb.load_config(preset="cpu_small")

3. Error Recovery

# Implement checkpointing
try:
    results = lb.train_model(model, dataset, config)
except KeyboardInterrupt:
    print("Training interrupted, saving checkpoint...")
    # Checkpoint is automatically saved
except Exception as e:
    print(f"Training failed: {e}")
    # Resume from last checkpoint if available
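
Since resume semantics are not spelled out here, one hedged pattern is to retry through the high-level train() entry point with clean=False, which is documented above as leaving previous outputs (including checkpoints) in place:

import llmbuilder as lb

try:
    pipeline = lb.train(data_path="./my_data/", output_dir="./output/")
except Exception as e:
    print(f"Training failed: {e}")
    # Assumption: with clean=False the pipeline can pick up the
    # checkpoints already saved in output_dir rather than starting over
    pipeline = lb.train(data_path="./my_data/", output_dir="./output/", clean=False)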

Core API Tips

  • Start with high-level functions and move to lower-level APIs as needed
  • Use configuration presets as starting points
  • Always handle exceptions appropriately
  • Take advantage of automatic checkpointing for long training runs