Go Bindings¶
Idiomatic Go bindings for the Mullama LLM library, built with cgo for direct integration with the C FFI layer. The bindings provide goroutine-safe model sharing, defer-based resource cleanup, and standard Go error patterns.
Installation¶
Prerequisites¶
The Go bindings require the pre-built libmullama_ffi shared library and header file:
# Build the FFI library from the mullama source
cargo build --release -p mullama-ffi
# The library is output to: target/release/libmullama_ffi.so (Linux)
# target/release/libmullama_ffi.dylib (macOS)
Set the appropriate environment variables so cgo can find the library:
export CGO_CFLAGS="-I/path/to/mullama/bindings/ffi/include"
export CGO_LDFLAGS="-L/path/to/mullama/target/release -lmullama_ffi"
export LD_LIBRARY_PATH="/path/to/mullama/target/release:$LD_LIBRARY_PATH"
Requirements
- Go >= 1.21
- cgo enabled (CGO_ENABLED=1, the default)
- libmullama_ffi shared library
- mullama.h header file
Quick Start¶
package main
import (
"fmt"
"log"
"github.com/cognisoc/mullama"
)
// main loads a GGUF model, opens an inference context on it, and prints a
// short completion for a fixed prompt.
func main() {
	// Offload 32 layers to the GPU while loading the model.
	loadOpts := &mullama.ModelParams{NGPULayers: 32}
	model, err := mullama.LoadModel("./model.gguf", loadOpts)
	if err != nil {
		log.Fatal(err)
	}
	defer model.Free()

	// The inference context is created with a context size of 2048.
	ctxOpts := &mullama.ContextParams{NCtx: 2048}
	ctx, err := mullama.NewContext(model, ctxOpts)
	if err != nil {
		log.Fatal(err)
	}
	defer ctx.Free()

	// Generate up to 100 tokens; nil sampler parameters select the defaults.
	story, err := ctx.Generate("Once upon a time", 100, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(story)
}
API Reference¶
Backend Functions¶
BackendInit()¶
Initialize the mullama backend. Called automatically on first model load. Safe to call multiple times (uses sync.Once internally).
BackendFree()¶
Free backend resources. Call before program exit for clean shutdown.
SupportsGPUOffload()¶
Check if GPU offloading is available.
SystemInfo()¶
Get system information about the backend.
MaxDevices()¶
Get the maximum number of compute devices.
Version()¶
Get the library version.
ModelParams¶
Configuration for model loading.
type ModelParams struct {
// NGPULayers is the number of layers to offload to GPU
// 0 = CPU only, -1 = all layers
NGPULayers int32
// UseMmap enables memory mapping for model loading
UseMmap bool
// UseMlock locks the model in memory (prevents swapping)
UseMlock bool
// VocabOnly loads only the vocabulary (for tokenization)
VocabOnly bool
}
DefaultModelParams()¶
Returns sensible defaults for model loading.
func DefaultModelParams() ModelParams
// NGPULayers: 0, UseMmap: true, UseMlock: false, VocabOnly: false
Model¶
The Model type represents a loaded LLM model.
LoadModel(path, params)¶
Load a model from a GGUF file.
| Parameter | Type | Description |
|---|---|---|
path |
string |
Path to the GGUF model file |
params |
*ModelParams |
Loading parameters (nil for defaults) |
Returns: (*Model, error)
model, err := mullama.LoadModel("./model.gguf", &mullama.ModelParams{
NGPULayers: -1, // offload all layers
UseMmap: true,
})
if err != nil {
log.Fatalf("Failed to load model: %v", err)
}
defer model.Free()
Memory Management
Always call model.Free() when done, or use defer model.Free(). The Go garbage collector will also call Free() via a finalizer, but explicit cleanup is recommended for deterministic resource release.
model.Free()¶
Release model resources. Safe to call multiple times.
model.Tokenize(text, addBos, special)¶
Convert text to token IDs.
tokens, err := model.Tokenize("Hello, world!", true, false)
if err != nil {
log.Fatal(err)
}
fmt.Printf("Tokens: %v\n", tokens)
model.Detokenize(tokens, removeSpecial, unparseSpecial)¶
Convert token IDs back to text.
text, err := model.Detokenize(tokens, false, false)
if err != nil {
log.Fatal(err)
}
fmt.Println(text)
Model Properties¶
func (m *Model) NCtxTrain() int32 // Training context size
func (m *Model) NEmbd() int32 // Embedding dimension
func (m *Model) NVocab() int32 // Vocabulary size
func (m *Model) NLayer() int32 // Number of layers
func (m *Model) NHead() int32 // Number of attention heads
func (m *Model) TokenBOS() int32 // BOS token ID
func (m *Model) TokenEOS() int32 // EOS token ID
func (m *Model) Size() uint64 // Model size in bytes
func (m *Model) NParams() uint64 // Number of parameters
func (m *Model) Description() string // Model description
func (m *Model) TokenIsEOG(token int32) bool // Check if token is EOG
fmt.Printf("Model: %s\n", model.Description())
fmt.Printf("Parameters: %d\n", model.NParams())
fmt.Printf("Layers: %d\n", model.NLayer())
fmt.Printf("Embedding dim: %d\n", model.NEmbd())
fmt.Printf("Context size: %d\n", model.NCtxTrain())
ContextParams¶
Configuration for context creation.
type ContextParams struct {
// NCtx is the context size (0 = model default)
NCtx uint32
// NBatch is the batch size for prompt processing
NBatch uint32
// NThreads is the number of threads (0 = auto)
NThreads int32
// Embeddings enables embeddings mode
Embeddings bool
}
DefaultContextParams()¶
Returns sensible defaults for context creation.
func DefaultContextParams() ContextParams
// NCtx: 0, NBatch: 2048, NThreads: runtime.NumCPU(), Embeddings: false
Context¶
The Context type represents an inference context.
NewContext(model, params)¶
Create a new inference context from a model.
ctx, err := mullama.NewContext(model, &mullama.ContextParams{
NCtx: 4096,
NBatch: 512,
NThreads: 8,
})
if err != nil {
log.Fatal(err)
}
defer ctx.Free()
ctx.Free()¶
Release context resources.
ctx.Generate(prompt, maxTokens, params)¶
Generate text from a string prompt.
| Parameter | Type | Description |
|---|---|---|
prompt |
string |
Text prompt |
maxTokens |
int |
Maximum tokens to generate |
params |
*SamplerParams |
Sampling parameters (nil for defaults) |
text, err := ctx.Generate("Hello, AI!", 100, nil)
if err != nil {
log.Fatal(err)
}
fmt.Println(text)
ctx.GenerateFromTokens(tokens, maxTokens, params)¶
Generate text from pre-tokenized input.
func (c *Context) GenerateFromTokens(tokens []int32, maxTokens int, params *SamplerParams) (string, error)
tokens, _ := model.Tokenize("Hello!", true, false)
text, err := ctx.GenerateFromTokens(tokens, 100, nil)
ctx.GenerateStream(prompt, maxTokens, params, callback)¶
Generate text with a streaming callback.
func (c *Context) GenerateStream(prompt string, maxTokens int, params *SamplerParams, callback StreamCallback) error
The StreamCallback type:
type StreamCallback func(token string) bool
Return true from the callback to continue generation, false to stop.
err := ctx.GenerateStream("Once upon a time", 200, nil, func(token string) bool {
fmt.Print(token)
return true // continue
})
if err != nil {
log.Fatal(err)
}
fmt.Println()
ctx.ClearCache()¶
Clear the KV cache.
Context Properties¶
SamplerParams¶
Configuration for text generation sampling.
type SamplerParams struct {
Temperature float32
TopK int32
TopP float32
MinP float32
TypicalP float32
PenaltyRepeat float32
PenaltyFreq float32
PenaltyPresent float32
PenaltyLastN int32
Seed uint32
}
Preset Functions¶
func DefaultSamplerParams() SamplerParams // temperature=0.8, topK=40
func GreedySamplerParams() SamplerParams // temperature=0.0, topK=1
func CreativeSamplerParams() SamplerParams // temperature=1.2, topK=100
func PreciseSamplerParams() SamplerParams // temperature=0.3, topK=20
// Deterministic generation
text, _ := ctx.Generate("2+2=", 10, &mullama.GreedySamplerParams())
// Creative generation
params := mullama.CreativeSamplerParams()
text, _ := ctx.Generate("Write a story:", 300, ¶ms)
// Custom parameters
params := &mullama.SamplerParams{
Temperature: 0.7,
TopK: 50,
TopP: 0.9,
PenaltyRepeat: 1.1,
}
text, _ := ctx.Generate("Hello", 100, params)
EmbeddingGenerator¶
The EmbeddingGenerator type generates text embeddings.
NewEmbeddingGenerator(model, nCtx, normalize)¶
Create a new embedding generator.
| Parameter | Type | Description |
|---|---|---|
model |
*Model |
Model for embeddings |
nCtx |
uint32 |
Context size (0 = 512 default) |
normalize |
bool |
Normalize embeddings to unit length |
eg, err := mullama.NewEmbeddingGenerator(model, 512, true)
if err != nil {
log.Fatal(err)
}
defer eg.Free()
eg.Free()¶
Release embedding generator resources.
eg.Embed(text)¶
Generate an embedding vector for text.
embedding, err := eg.Embed("Hello, world!")
if err != nil {
log.Fatal(err)
}
fmt.Printf("Dimensions: %d\n", len(embedding))
eg.EmbedBatch(texts)¶
Generate embeddings for multiple texts.
texts := []string{"Hello", "World", "Test"}
embeddings, err := eg.EmbedBatch(texts)
if err != nil {
log.Fatal(err)
}
fmt.Printf("Count: %d\n", len(embeddings))
eg.NEmbd()¶
Get the embedding dimension.
Utility Functions¶
CosineSimilarity(a, b)¶
Compute cosine similarity between two vectors.
Returns: Similarity value between -1 and 1, or error if vectors have different lengths.
sim, err := mullama.CosineSimilarity(emb1, emb2)
if err != nil {
log.Fatal(err)
}
fmt.Printf("Similarity: %.4f\n", sim)
Error Variables¶
The package defines sentinel errors for common failure cases:
var (
ErrNullPointer = errors.New("null pointer")
ErrModelLoad = errors.New("failed to load model")
ErrContext = errors.New("failed to create context")
ErrTokenization = errors.New("tokenization failed")
ErrGeneration = errors.New("generation failed")
ErrEmbedding = errors.New("embedding generation failed")
ErrInvalidInput = errors.New("invalid input")
)
Use errors.Is() to check for specific error types:
_, err := mullama.LoadModel("bad_path.gguf", nil)
if err != nil {
	// NOTE(review): assumes load failures wrap ErrModelLoad; confirm against the bindings.
	if errors.Is(err, mullama.ErrModelLoad) {
		fmt.Println("model could not be loaded")
	}
	fmt.Printf("Error: %v\n", err) // detailed message from FFI layer
}
Examples¶
Basic Generation¶
package main
import (
"fmt"
"log"
"github.com/cognisoc/mullama"
)
func main() {
defer mullama.BackendFree()
model, err := mullama.LoadModel("./model.gguf", &mullama.ModelParams{
NGPULayers: -1,
})
if err != nil {
log.Fatal(err)
}
defer model.Free()
ctx, err := mullama.NewContext(model, &mullama.ContextParams{
NCtx: 2048,
})
if err != nil {
log.Fatal(err)
}
defer ctx.Free()
params := mullama.PreciseSamplerParams()
text, err := ctx.Generate("Explain Go concurrency in one paragraph:", 200, ¶ms)
if err != nil {
log.Fatal(err)
}
fmt.Println(text)
}
Streaming Generation¶
package main
import (
"fmt"
"log"
"github.com/cognisoc/mullama"
)
// main streams a completion to standard output, printing each token as it is
// produced.
func main() {
	model, err := mullama.LoadModel("./model.gguf", nil)
	if err != nil {
		log.Fatal(err)
	}
	defer model.Free()

	ctx, err := mullama.NewContext(model, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer ctx.Free()

	// Echo every token immediately; returning true keeps generation going.
	printToken := func(token string) bool {
		fmt.Print(token)
		return true
	}

	fmt.Print("Response: ")
	if err := ctx.GenerateStream("Tell me a joke:", 150, nil, printToken); err != nil {
		log.Fatal(err)
	}
	fmt.Println()
}
Embeddings and Similarity¶
package main
import (
"fmt"
"log"
"sort"
"github.com/cognisoc/mullama"
)
// SearchResult pairs a document with its similarity score against the query.
type SearchResult struct {
// Text is the document's text.
Text string
// Score is the cosine similarity between the query and document embeddings.
Score float32
}
// main embeds a small document set and a query, then prints the documents
// ranked by cosine similarity to the query, best match first.
func main() {
	model, err := mullama.LoadModel("./embedding-model.gguf", nil)
	if err != nil {
		log.Fatal(err)
	}
	defer model.Free()

	eg, err := mullama.NewEmbeddingGenerator(model, 512, true)
	if err != nil {
		log.Fatal(err)
	}
	defer eg.Free()

	// Index documents
	documents := []string{
		"Go is a statically typed language",
		"Python is dynamically typed",
		"Rust emphasizes memory safety",
		"The weather is nice today",
	}
	docEmbeddings, err := eg.EmbedBatch(documents)
	if err != nil {
		log.Fatal(err)
	}

	// Query
	queryEmb, err := eg.Embed("Which language is memory safe?")
	if err != nil {
		log.Fatal(err)
	}

	// Rank results. The result count is known up front, so size the slice once.
	results := make([]SearchResult, 0, len(docEmbeddings))
	for i, docEmb := range docEmbeddings {
		// CosineSimilarity reports an error for vectors of different lengths;
		// surface it instead of ranking on a meaningless zero score.
		score, err := mullama.CosineSimilarity(queryEmb, docEmb)
		if err != nil {
			log.Fatalf("similarity for document %d: %v", i, err)
		}
		results = append(results, SearchResult{
			Text:  documents[i],
			Score: score,
		})
	}
	// Highest score first.
	sort.Slice(results, func(i, j int) bool {
		return results[i].Score > results[j].Score
	})

	fmt.Println("Search results:")
	for _, r := range results {
		fmt.Printf(" [%.4f] %s\n", r.Score, r.Text)
	}
}
Concurrency with Goroutines¶
package main
import (
"fmt"
"log"
"sync"
"github.com/cognisoc/mullama"
)
// main answers several prompts concurrently: one model is shared by all
// goroutines while each goroutine works on a context of its own.
func main() {
	model, err := mullama.LoadModel("./model.gguf", &mullama.ModelParams{
		NGPULayers: -1,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer model.Free()

	prompts := []string{
		"What is Go?",
		"What is Rust?",
		"What is Python?",
	}
	results := make([]string, len(prompts))

	// generate runs one prompt on a private context and stores the output in
	// results[slot]. Each goroutine writes a distinct slot.
	generate := func(slot int, prompt string) {
		ctx, err := mullama.NewContext(model, &mullama.ContextParams{
			NCtx: 1024,
		})
		if err != nil {
			log.Printf("Context error: %v", err)
			return
		}
		defer ctx.Free()

		text, err := ctx.Generate(prompt, 100, nil)
		if err != nil {
			log.Printf("Generation error: %v", err)
			return
		}
		results[slot] = text
	}

	var wg sync.WaitGroup
	wg.Add(len(prompts))
	for i, prompt := range prompts {
		go func(slot int, p string) {
			defer wg.Done()
			generate(slot, p)
		}(i, prompt)
	}
	wg.Wait()

	for i := range prompts {
		fmt.Printf("\n--- %s ---\n%s\n", prompts[i], results[i])
	}
}
Thread Safety
A Model can be shared across goroutines, but each goroutine must create its own Context. Contexts are not thread-safe and must not be shared between goroutines.
Memory Management¶
Go bindings use finalizers for automatic cleanup, but explicit Free() calls are strongly recommended:
model, err := mullama.LoadModel("./model.gguf", nil)
if err != nil {
log.Fatal(err)
}
defer model.Free() // Always defer Free() immediately after creation
ctx, err := mullama.NewContext(model, nil)
if err != nil {
log.Fatal(err)
}
defer ctx.Free()
eg, err := mullama.NewEmbeddingGenerator(model, 512, true)
if err != nil {
log.Fatal(err)
}
defer eg.Free()
Key rules:
- Always defer obj.Free() immediately after successful creation.
- Free() is safe to call multiple times (idempotent).
- Do not use objects after calling Free().
- The GC finalizer will call Free() if you forget, but timing is non-deterministic.
Error Handling¶
Go bindings follow idiomatic Go error handling patterns:
model, err := mullama.LoadModel("./model.gguf", nil)
if err != nil {
// err contains the detailed error message from the FFI layer
log.Fatalf("Model load failed: %v", err)
}
ctx, err := mullama.NewContext(model, nil)
if err != nil {
log.Fatalf("Context creation failed: %v", err)
}
text, err := ctx.Generate("Hello", 100, nil)
if err != nil {
log.Fatalf("Generation failed: %v", err)
}
All errors include descriptive messages from the underlying C FFI layer, making debugging straightforward.
HTTP Server Example¶
package main
import (
"encoding/json"
"fmt"
"log"
"net/http"
"github.com/cognisoc/mullama"
)
// model is the shared model: assigned once at startup and read by both handlers.
var model *mullama.Model
// GenerateRequest is the JSON request body decoded by both handlers.
type GenerateRequest struct {
// Prompt is the text passed to the model.
Prompt string `json:"prompt"`
// MaxTokens is the generation limit; the handlers substitute 200 when it is 0.
MaxTokens int `json:"max_tokens"`
// Temperature is the sampling temperature; generateHandler substitutes 0.8
// when it is 0, and streamHandler does not read it.
Temperature float32 `json:"temperature"`
}
// GenerateResponse is the JSON body written by generateHandler.
type GenerateResponse struct {
// Text is the generated completion.
Text string `json:"text"`
}
// generateHandler handles a POST request carrying a GenerateRequest, runs a
// single generation on a context created for this request, and replies with a
// GenerateResponse as JSON.
//
// It answers 405 for other methods, 400 for an undecodable body, and 500 when
// context creation or generation fails.
func generateHandler(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	var req GenerateRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}
	// Treat a missing or non-positive limit as "use the default".
	if req.MaxTokens <= 0 {
		req.MaxTokens = 200
	}
	// A zero temperature is indistinguishable from an omitted field, so it
	// falls back to the default as well.
	if req.Temperature == 0 {
		req.Temperature = 0.8
	}

	// One context per request, freed when the handler returns.
	ctx, err := mullama.NewContext(model, &mullama.ContextParams{NCtx: 2048})
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer ctx.Free()

	params := &mullama.SamplerParams{
		Temperature: req.Temperature,
		TopK:        40,
		TopP:        0.95,
	}
	text, err := ctx.Generate(req.Prompt, req.MaxTokens, params)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	// The status line is already committed once encoding starts, so a failure
	// here can only be logged.
	if err := json.NewEncoder(w).Encode(GenerateResponse{Text: text}); err != nil {
		log.Printf("Encode error: %v", err)
	}
}
// streamHandler handles a POST request carrying a GenerateRequest and streams
// the generated tokens back as server-sent events: one
// `data: {"token": "..."}` event per token, followed by `data: [DONE]`.
//
// It answers 405 for other methods, 400 for an undecodable body, and 500 when
// the ResponseWriter cannot flush or the context cannot be created.
// Generation stops early once the client disconnects or a write fails.
func streamHandler(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	var req GenerateRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}
	// Treat a missing or non-positive limit as "use the default".
	if req.MaxTokens <= 0 {
		req.MaxTokens = 200
	}

	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, "Streaming not supported", http.StatusInternalServerError)
		return
	}

	ctx, err := mullama.NewContext(model, &mullama.ContextParams{NCtx: 2048})
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer ctx.Free()

	// Set the event-stream headers only after every failure path that replies
	// with http.Error, so error responses do not carry them.
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")

	err = ctx.GenerateStream(req.Prompt, req.MaxTokens, nil, func(token string) bool {
		// The request context is cancelled when the client goes away; stop
		// generating instead of producing tokens nobody will read.
		if r.Context().Err() != nil {
			return false
		}
		data, merr := json.Marshal(map[string]string{"token": token})
		if merr != nil {
			log.Printf("Marshal error: %v", merr)
			return false
		}
		if _, werr := fmt.Fprintf(w, "data: %s\n\n", data); werr != nil {
			return false
		}
		flusher.Flush()
		return true
	})
	if err != nil {
		log.Printf("Stream error: %v", err)
	}

	fmt.Fprintf(w, "data: [DONE]\n\n")
	flusher.Flush()
}
// main runs the server and exits with a non-zero status if it fails.
//
// The work lives in run so that its deferred cleanup executes before
// log.Fatal terminates the process; log.Fatal exits without running deferred
// calls.
func main() {
	if err := run(); err != nil {
		log.Fatal(err)
	}
}

// run loads the shared model, registers the HTTP handlers, and serves on
// :8080 until the listener fails. The model and backend are released on return.
func run() error {
	// Deferred first so it runs last, after the model is freed.
	defer mullama.BackendFree()

	var err error
	model, err = mullama.LoadModel("./model.gguf", &mullama.ModelParams{
		NGPULayers: -1,
	})
	if err != nil {
		return err
	}
	defer model.Free()

	http.HandleFunc("/generate", generateHandler)
	http.HandleFunc("/generate/stream", streamHandler)

	log.Println("Server listening on :8080")
	// ListenAndServe always returns a non-nil error.
	return http.ListenAndServe(":8080", nil)
}
Performance Tips¶
- GPU offloading -- set NGPULayers to -1 for maximum throughput.
- Reuse models -- model loading is expensive. Load once at startup and share across goroutines.
- Per-goroutine contexts -- contexts are not thread-safe. Create a new context per goroutine or per request.
- Defer Free() -- always defer obj.Free() immediately after creation to prevent resource leaks.
- Batch embeddings -- use EmbedBatch() for multiple texts rather than calling Embed() in a loop.
- Context pooling -- for high-throughput servers, consider a sync.Pool of pre-created contexts (remember to call ClearCache() between uses).