Configuration API¶

The configuration module provides a structured, type-safe system for managing Mullama settings. All configuration structs implement Serialize and Deserialize for JSON and YAML file support, with environment variable overrides and comprehensive validation.

MullamaConfig¶

The top-level configuration struct encompassing all settings.

/// Top-level configuration encompassing all Mullama settings.
///
/// `model`, `context` and `sampler` are required sections; `server` and
/// `logging` are optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MullamaConfig {
    /// Model loading and resource configuration.
    pub model: ModelConfig,
    /// Context parameters for inference.
    pub context: ContextConfig,
    /// Sampling strategy configuration.
    pub sampler: SamplerConfig,
    /// Web server configuration (when using the `web` feature).
    pub server: Option<ServerConfig>,
    /// Logging configuration.
    pub logging: Option<LoggingConfig>,
}

Fields¶

Name	Type	Required	Description
`model`	`ModelConfig`	Yes	Model loading and resource configuration
`context`	`ContextConfig`	Yes	Context parameters for inference
`sampler`	`SamplerConfig`	Yes	Sampling strategy configuration
`server`	`Option<ServerConfig>`	No	Web server configuration (when using `web` feature)
`logging`	`Option<LoggingConfig>`	No	Logging configuration

Loading Configuration¶

use mullama::config::MullamaConfig;

// From JSON file
let config = MullamaConfig::from_file("config.json")?;

// From YAML file
let config = MullamaConfig::from_file("config.yaml")?;

// From string. The literal must be complete, valid JSON: `model.path` is
// required, and the `context` and `sampler` sections are declared as
// non-optional, so they are included here as empty objects (all of their
// fields are `Option`s).
let json = r#"{"model": {"path": "model.gguf"}, "context": {}, "sampler": {}}"#;
let config: MullamaConfig = serde_json::from_str(json)?;

// From environment variables (prefix: MULLAMA_)
let config = MullamaConfig::from_env()?;

// With validation
let config = MullamaConfig::from_file("config.json")?.validate()?;

Methods¶

impl MullamaConfig {
    /// Load configuration from a JSON or YAML file (auto-detected by extension)
    ///
    /// The result is not validated; chain `validate()` to check it.
    pub fn from_file(path: impl AsRef<Path>) -> Result<Self, MullamaError>;

    /// Load from environment variables with MULLAMA_ prefix
    ///
    /// Nested fields are addressed with a double underscore, e.g.
    /// `MULLAMA_MODEL__PATH` for `model.path`.
    pub fn from_env() -> Result<Self, MullamaError>;

    /// Merge with another config (other takes precedence)
    ///
    /// Both inputs are borrowed; the merged result is returned as a new value.
    pub fn merge(&self, other: &Self) -> Self;

    /// Validate all fields and return validated config
    ///
    /// Consumes `self` and hands the same config back on success, so it can be
    /// chained directly after a loader. The usage example on this page matches
    /// validation failures as `MullamaError::ConfigError`.
    pub fn validate(self) -> Result<Self, MullamaError>;

    /// Get default configuration
    pub fn default() -> Self;

    /// Write configuration to a file
    ///
    /// NOTE(review): the output format is not stated here; presumably it is
    /// chosen from the file extension like `from_file` — confirm.
    pub fn to_file(&self, path: impl AsRef<Path>) -> Result<(), MullamaError>;
}

ModelConfig¶

Configuration for model loading behavior, corresponding to ModelParams.

/// Configuration for model loading behavior, corresponding to `ModelParams`.
///
/// Only `path` is required; every other field is optional and falls back to
/// the default noted on it when unset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelConfig {
    /// Path to the GGUF model file (required).
    pub path: String,
    /// Number of layers to offload to GPU (default: `0`).
    pub gpu_layers: Option<i32>,
    /// GPU split mode: "none", "layer" or "row" (default: `"none"`).
    pub split_mode: Option<String>,
    /// Primary GPU device index (default: `0`).
    pub main_gpu: Option<i32>,
    /// GPU tensor split proportions (default: `None`).
    pub tensor_split: Option<Vec<f32>>,
    /// Load vocabulary only (default: `false`).
    pub vocab_only: Option<bool>,
    /// Enable memory-mapped loading (default: `true`).
    pub use_mmap: Option<bool>,
    /// Lock model in RAM (default: `false`).
    pub use_mlock: Option<bool>,
    /// Validate tensor integrity on load (default: `true`).
    pub check_tensors: Option<bool>,
}

Fields¶

Name	Type	Default	Description
`path`	`String`	(required)	Path to the GGUF model file
`gpu_layers`	`Option<i32>`	`0`	Number of layers to offload to GPU
`split_mode`	`Option<String>`	`"none"`	GPU split mode: "none", "layer", "row"
`main_gpu`	`Option<i32>`	`0`	Primary GPU device index
`tensor_split`	`Option<Vec<f32>>`	`None`	GPU tensor split proportions
`vocab_only`	`Option<bool>`	`false`	Load vocabulary only
`use_mmap`	`Option<bool>`	`true`	Enable memory-mapped loading
`use_mlock`	`Option<bool>`	`false`	Lock model in RAM
`check_tensors`	`Option<bool>`	`true`	Validate tensor integrity on load

Conversion¶

/// Converts a `ModelConfig` into the `ModelParams` used when loading a model.
impl From<ModelConfig> for ModelParams {
    // Takes the config by value, so the caller's `ModelConfig` is moved.
    fn from(config: ModelConfig) -> Self;
}

// Use in code:
// NOTE(review): `..Default::default()` needs `ModelConfig: Default`, which is
// not in the derive list shown above — confirm the real type implements it.
let config = ModelConfig { path: "model.gguf".to_string(), ..Default::default() };
// `into()` consumes its receiver, so convert a clone: `config.path` is still
// needed for the load call below.
let params: ModelParams = config.clone().into();
let model = Model::load_with_params(&config.path, params)?;

ContextConfig¶

Configuration for inference context parameters.

/// Configuration for inference context parameters.
///
/// Every field is optional; the default noted on each field applies when it is
/// left unset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContextConfig {
    /// Context window size (default: `0`, meaning the model default).
    pub n_ctx: Option<u32>,
    /// Maximum batch size (default: `2048`).
    pub n_batch: Option<u32>,
    /// Generation threads (default: CPU count).
    pub n_threads: Option<i32>,
    /// Batch processing threads (default: CPU count).
    pub n_threads_batch: Option<i32>,
    /// RoPE base frequency (default: `0.0`).
    pub rope_freq_base: Option<f32>,
    /// RoPE scale factor (default: `0.0`).
    pub rope_freq_scale: Option<f32>,
    /// Enable flash attention (default: `false`).
    pub flash_attention: Option<bool>,
    /// Enable embedding output (default: `false`).
    pub embeddings: Option<bool>,
    /// Offload KQV to GPU (default: `true`).
    pub offload_kqv: Option<bool>,
    /// Cache type: "f32", "f16", "bf16", "q8_0" or "q4_0" (default: `"f16"`).
    pub kv_cache_type: Option<String>,
}

Fields¶

Name	Type	Default	Description
`n_ctx`	`Option<u32>`	`0` (model default)	Context window size
`n_batch`	`Option<u32>`	`2048`	Maximum batch size
`n_threads`	`Option<i32>`	CPU count	Generation threads
`n_threads_batch`	`Option<i32>`	CPU count	Batch processing threads
`rope_freq_base`	`Option<f32>`	`0.0`	RoPE base frequency
`rope_freq_scale`	`Option<f32>`	`0.0`	RoPE scale factor
`flash_attention`	`Option<bool>`	`false`	Enable flash attention
`embeddings`	`Option<bool>`	`false`	Enable embedding output
`offload_kqv`	`Option<bool>`	`true`	Offload KQV to GPU
`kv_cache_type`	`Option<String>`	`"f16"`	Cache type: "f32", "f16", "bf16", "q8_0", "q4_0"

Conversion¶

/// Converts a `ContextConfig` into the `ContextParams` used to create a context.
impl From<ContextConfig> for ContextParams {
    // Takes the config by value, so the caller's `ContextConfig` is moved.
    fn from(config: ContextConfig) -> Self;
}

SamplerConfig¶

Configuration for sampling strategies.

/// Configuration for sampling strategies.
///
/// Every field is optional; the default noted on each field applies when it is
/// left unset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SamplerConfig {
    /// Sampling temperature (default: `0.8`).
    pub temperature: Option<f32>,
    /// Top-K sampling limit (default: `40`).
    pub top_k: Option<i32>,
    /// Nucleus sampling threshold (default: `0.95`).
    pub top_p: Option<f32>,
    /// Minimum probability threshold (default: `0.05`).
    pub min_p: Option<f32>,
    /// Typical sampling threshold (default: `1.0`).
    pub typical_p: Option<f32>,
    /// Repetition penalty (default: `1.1`).
    pub repeat_penalty: Option<f32>,
    /// Frequency penalty (default: `0.0`).
    pub frequency_penalty: Option<f32>,
    /// Presence penalty (default: `0.0`).
    pub presence_penalty: Option<f32>,
    /// Penalty window size (default: `64`).
    pub penalty_last_n: Option<i32>,
    /// Random seed (default: random).
    pub seed: Option<u32>,
    /// Mirostat sampling config (default: `None`).
    pub mirostat: Option<MirostatConfig>,
    /// GBNF grammar for constrained output (default: `None`).
    pub grammar: Option<String>,
}

Fields¶

Name	Type	Default	Description
`temperature`	`Option<f32>`	`0.8`	Sampling temperature
`top_k`	`Option<i32>`	`40`	Top-K sampling limit
`top_p`	`Option<f32>`	`0.95`	Nucleus sampling threshold
`min_p`	`Option<f32>`	`0.05`	Minimum probability threshold
`typical_p`	`Option<f32>`	`1.0`	Typical sampling threshold
`repeat_penalty`	`Option<f32>`	`1.1`	Repetition penalty
`frequency_penalty`	`Option<f32>`	`0.0`	Frequency penalty
`presence_penalty`	`Option<f32>`	`0.0`	Presence penalty
`penalty_last_n`	`Option<i32>`	`64`	Penalty window size
`seed`	`Option<u32>`	Random	Random seed
`mirostat`	`Option<MirostatConfig>`	`None`	Mirostat sampling config
`grammar`	`Option<String>`	`None`	GBNF grammar for constrained output

MirostatConfig¶

/// Mirostat sampling configuration, set through `SamplerConfig::mirostat`.
///
/// Unlike the other sampler settings, all three fields are non-optional once
/// this section is present.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MirostatConfig {
    pub version: u8,       // 1 or 2
    pub tau: f32,          // Target entropy
    pub eta: f32,          // Learning rate
}

Conversion¶

/// Converts a `SamplerConfig` into the `SamplerParams` used during generation.
impl From<SamplerConfig> for SamplerParams {
    // Takes the config by value, so the caller's `SamplerConfig` is moved.
    fn from(config: SamplerConfig) -> Self;
}

Environment Variable Overrides¶

Every configuration field can be overridden via environment variables with the MULLAMA_ prefix. Nested fields use double underscores:

Environment Variable	Config Field	Example
`MULLAMA_MODEL__PATH`	`model.path`	`/path/to/model.gguf`
`MULLAMA_MODEL__GPU_LAYERS`	`model.gpu_layers`	`32`
`MULLAMA_MODEL__USE_MMAP`	`model.use_mmap`	`true`
`MULLAMA_CONTEXT__N_CTX`	`context.n_ctx`	`4096`
`MULLAMA_CONTEXT__N_BATCH`	`context.n_batch`	`512`
`MULLAMA_CONTEXT__N_THREADS`	`context.n_threads`	`8`
`MULLAMA_CONTEXT__FLASH_ATTENTION`	`context.flash_attention`	`true`
`MULLAMA_CONTEXT__KV_CACHE_TYPE`	`context.kv_cache_type`	`q8_0`
`MULLAMA_SAMPLER__TEMPERATURE`	`sampler.temperature`	`0.7`
`MULLAMA_SAMPLER__TOP_K`	`sampler.top_k`	`50`
`MULLAMA_SAMPLER__TOP_P`	`sampler.top_p`	`0.9`
`MULLAMA_SAMPLER__SEED`	`sampler.seed`	`42`

Priority order (highest to lowest):

Environment variables
CLI arguments (if using the daemon)
Configuration file values
Default values

Example:

# Override model path and GPU layers via environment
export MULLAMA_MODEL__PATH="/models/llama-7b-q4.gguf"
export MULLAMA_MODEL__GPU_LAYERS=32
export MULLAMA_CONTEXT__N_CTX=4096
export MULLAMA_SAMPLER__TEMPERATURE=0.7

# Application uses these overrides automatically
cargo run --example simple

Validation¶

The validate() method checks for:

Model path exists and is readable
GPU layer count does not exceed the model's layer count (when known)
Context size is reasonable (0 < n_ctx <= 1048576)
Batch size is positive and not larger than context size
Thread count is positive
Temperature is non-negative
Top-K is non-negative (0 = disabled)
Top-P is in range [0.0, 1.0]
Penalties are in reasonable ranges
KV cache type string is valid

use mullama::config::MullamaConfig;
// The match below names `MullamaError` variants, so the error type must be in scope too.
use mullama::MullamaError;

let config = MullamaConfig::from_file("config.json")?;

// `validate` consumes the config and returns it on success.
match config.validate() {
    Ok(valid_config) => {
        // Use valid_config safely
    }
    // Validation failures surface as `ConfigError` carrying a message.
    Err(MullamaError::ConfigError(msg)) => {
        eprintln!("Configuration error: {}", msg);
    }
    // Any other error variant.
    Err(e) => {
        eprintln!("Unexpected error: {}", e);
    }
}

Example JSON Configuration¶

{
  "model": {
    "path": "models/llama-7b-q4_0.gguf",
    "gpu_layers": 32,
    "use_mmap": true,
    "use_mlock": false,
    "check_tensors": true
  },
  "context": {
    "n_ctx": 4096,
    "n_batch": 512,
    "n_threads": 8,
    "n_threads_batch": 8,
    "flash_attention": true,
    "kv_cache_type": "q8_0",
    "embeddings": false,
    "offload_kqv": true
  },
  "sampler": {
    "temperature": 0.7,
    "top_k": 40,
    "top_p": 0.9,
    "min_p": 0.05,
    "repeat_penalty": 1.1,
    "frequency_penalty": 0.0,
    "presence_penalty": 0.0,
    "penalty_last_n": 64,
    "seed": 42
  },
  "server": {
    "host": "127.0.0.1",
    "port": 8080,
    "cors_origins": ["*"],
    "max_connections": 100,
    "request_timeout_ms": 30000
  },
  "logging": {
    "level": "info",
    "format": "json"
  }
}

Example YAML Configuration¶

model:
  path: models/llama-7b-q4_0.gguf
  gpu_layers: 32
  use_mmap: true
  use_mlock: false

context:
  n_ctx: 4096
  n_batch: 512
  n_threads: 8
  flash_attention: true
  kv_cache_type: q8_0

sampler:
  temperature: 0.7
  top_k: 40
  top_p: 0.9
  min_p: 0.05
  repeat_penalty: 1.1
  penalty_last_n: 64

server:
  host: 127.0.0.1
  port: 8080
  cors_origins:
    - "*"
  max_connections: 100
  request_timeout_ms: 30000

logging:
  level: info
  format: json

ServerConfig¶

Configuration for the web server (requires web feature).

/// Configuration for the web server (requires the `web` feature).
///
/// Only the two TLS fields are optional; every other field is declared
/// without `Option`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServerConfig {
    pub host: String,
    pub port: u16,
    pub cors_origins: Vec<String>,
    pub max_connections: usize,
    pub request_timeout_ms: u64,
    // NOTE(review): presumably file paths to the TLS certificate and key — confirm.
    pub tls_cert: Option<String>,
    pub tls_key: Option<String>,
}

LoggingConfig¶

Configuration for logging output.

/// Configuration for logging output.
///
/// `level` and `format` are plain strings; the accepted values are listed on
/// each field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LoggingConfig {
    pub level: String,      // "trace", "debug", "info", "warn", "error"
    pub format: String,     // "text", "json"
    pub file: Option<String>,  // Optional log file path
}

Complete Example¶

use mullama::{Model, Context, SamplerParams};
use mullama::config::MullamaConfig;
use std::sync::Arc;

fn main() -> Result<(), mullama::MullamaError> {
    // Read the configuration file, then stop early if validation rejects it.
    let unchecked = MullamaConfig::from_file("mullama.json")?;
    let config = unchecked.validate()?;

    // Model: turn the `model` section into load parameters. The section is
    // cloned because its `path` is still borrowed for the load call.
    let load_params = config.model.clone().into();
    let loaded = Model::load_with_params(&config.model.path, load_params)?;
    let model = Arc::new(loaded);

    // Context: created from a second handle to the shared model.
    let context_params = config.context.clone().into();
    let mut ctx = Context::new(Arc::clone(&model), context_params)?;

    // Sampler: the annotation selects the target type of the conversion.
    let sampler_params: SamplerParams = config.sampler.clone().into();

    // Tokenize a prompt, generate up to 100 tokens, and print the result.
    let prompt_tokens = model.tokenize("Hello, world!", true, false)?;
    let generated = ctx.generate_with_params(&prompt_tokens, 100, &sampler_params)?;
    println!("{}", generated);

    Ok(())
}