Version: 0.9

LM Primitives

Low-level access to a loaded model's tokenizer and static metadata. Useful for budgeting prompts against the context window, pre-tokenizing prefixes, building custom samplers, or inspecting which device a model is running on.

These primitives share the same in-process model thread used for inference, so the model is loaded on demand and reused across calls — no separate load step is required.
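
As a consequence of the on-demand loading described above, a primitive call should work without an explicit load_model step; the first call loads the model and later calls reuse it. A small sketch (the model string is the one used in the Usage section below):

import atelico

engine = atelico.Engine()

# No explicit load_model call: the first primitive call loads the model
# on demand; subsequent calls reuse the same in-process model thread.
tokens = engine.tokenize(
    "in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M",
    "Hello there",
)
print(len(tokens))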

note

Logits and hidden activations are intentionally not exposed in this release. They are model-specific, large, and have a high compatibility risk across model architectures and quantization formats.

API

| Method | Returns | Purpose |
| --- | --- | --- |
| tokenize(model, text) | list[int] | Encode text using the model's tokenizer |
| detokenize(model, tokens, skip_special_tokens?) | str | Decode a token sequence back to text |
| model_capabilities(model) | ModelCapabilities (as a JSON string) | Inspect vocab size, context window, dtype, device |

ModelCapabilities fields:

| Field | Type | Description |
| --- | --- | --- |
| model_id | string | The model identifier as loaded |
| vocab_size | int | Tokenizer vocabulary size |
| max_position_embeddings | int | Context window in tokens |
| dtype | string | Active weight dtype (e.g. "BF16", "F16", "Q4_K_M") |
| backend_kind | string | Active device (e.g. "metal", "cuda", "cpu") |
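
Because the Usage example below decodes the result with json.loads, the Python bindings return this struct serialized as JSON. Here is a minimal typed wrapper, assuming that JSON shape; the ModelCapabilities TypedDict and get_capabilities helper are illustrative, not part of the atelico API:

import json
from typing import TypedDict

class ModelCapabilities(TypedDict):
    model_id: str                 # the model identifier as loaded
    vocab_size: int               # tokenizer vocabulary size
    max_position_embeddings: int  # context window in tokens
    dtype: str                    # active weight dtype, e.g. "BF16"
    backend_kind: str             # active device, e.g. "metal"

def get_capabilities(engine, model: str) -> ModelCapabilities:
    # model_capabilities returns a JSON string (see Usage below).
    return json.loads(engine.model_capabilities(model))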

Usage

import json
import atelico

engine = atelico.Engine()

model = "in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M"
engine.load_model(model)

# Inspect the model
caps = json.loads(engine.model_capabilities(model))
print(f"context window = {caps['max_position_embeddings']} tokens, "
      f"running as {caps['dtype']} on {caps['backend_kind']}")

# Budget a prompt against the context window
prompt = "You are a tavern keeper named Boris..."
tokens = engine.tokenize(model, prompt)
budget = caps["max_position_embeddings"] - len(tokens)
print(f"prompt = {len(tokens)} tokens, {budget} tokens remaining for the response")

# Round-trip
text = engine.detokenize(model, tokens, skip_special_tokens=True)
assert text.strip() == prompt

When to reach for these

  • Prompt budgeting — measure a prompt's token cost before sending it; surface a "context full" warning to the user instead of silent truncation (see the first sketch after this list).
  • Prefix design — pair with the Prefix Cache so the cached prefix matches your tokenization exactly.
  • Custom samplers / tooling — drive your own decoding loop or build editor-style token-by-token introspection.
  • Telemetry — log dtype + backend_kind alongside latency so a slow request can be diagnosed (CPU fallback, wrong quantization, etc.); see the second sketch below.
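
A hedged sketch of the prompt-budgeting pattern from the first bullet; check_budget and its reserve_for_response parameter are illustrative names, not part of the atelico API:

import json

def check_budget(engine, model: str, prompt: str,
                 reserve_for_response: int = 512) -> int:
    """Return the remaining token budget, raising instead of silently truncating."""
    caps = json.loads(engine.model_capabilities(model))
    used = len(engine.tokenize(model, prompt))
    remaining = caps["max_position_embeddings"] - used
    if remaining < reserve_for_response:
        # Surface a "context full" warning rather than truncating silently.
        raise ValueError(
            f"prompt uses {used} tokens; only {remaining} remain, "
            f"less than the {reserve_for_response} reserved for the response"
        )
    return remaining

And one for the telemetry bullet. Note that engine.generate here is an assumed inference entry point not documented on this page; substitute whatever call your application actually uses:

import json
import logging
import time

def timed_generate(engine, model: str, prompt: str):
    caps = json.loads(engine.model_capabilities(model))
    start = time.perf_counter()
    result = engine.generate(model, prompt)  # hypothetical inference call
    latency_ms = (time.perf_counter() - start) * 1000
    # Logging dtype + backend_kind next to latency makes a CPU fallback or
    # an unexpected quantization easy to spot when a request is slow.
    logging.info(
        "model=%s dtype=%s backend=%s latency_ms=%.1f",
        caps["model_id"], caps["dtype"], caps["backend_kind"], latency_ms,
    )
    return result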