# LM Primitives
Low-level access to a loaded model's tokenizer and static metadata. Useful for budgeting prompts against the context window, pre-tokenizing prefixes, building custom samplers, or inspecting which device a model is running on.
These primitives share the same in-process model thread used for inference, so the model is loaded on demand and reused across calls — no separate load step is required.
> **Note:** Logits and hidden activations are intentionally not exposed in this release. They are model-specific, large, and carry a high compatibility risk across model architectures and quantization formats.
## API
| Method | Returns | Purpose |
|---|---|---|
| `tokenize(model, text)` | `list[int]` | Encode text using the model's tokenizer |
| `detokenize(model, tokens, skip_special_tokens?)` | `str` | Decode a token sequence back to text |
| `model_capabilities(model)` | `ModelCapabilities` | Inspect vocab size, context window, dtype, and device |
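In the Godot, Unity, Unreal, and C FFI bindings below, these calls exchange JSON strings: `tokenize` responds with `{"tokens": [...]}`, and `detokenize` takes a single request object of the following shape and responds with `{"text": "..."}` (token values copied from the examples; treat them as illustrative):

```json
{
  "model_id": "in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M",
  "tokens": [128000, 9906, 11, 1917, 0],
  "skip_special_tokens": true
}
```

The Python and Rust SDKs expose `tokenize`/`detokenize` as native values (a token list and a string); the Rust SDK additionally returns `model_capabilities` as a typed struct.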
`ModelCapabilities` fields:

| Field | Type | Description |
|---|---|---|
| `model_id` | string | The model identifier as loaded |
| `vocab_size` | int | Tokenizer vocabulary size |
| `max_position_embeddings` | int | Context window in tokens |
| `dtype` | string | Active weight dtype (e.g. `"BF16"`, `"F16"`, `"Q4_K_M"`) |
| `backend_kind` | string | Active device (e.g. `"metal"`, `"cuda"`, `"cpu"`) |
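As an illustrative example (actual values depend on the model build and hardware), the quantized Llama model used throughout this page might report:

```json
{
  "model_id": "in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M",
  "vocab_size": 128256,
  "max_position_embeddings": 131072,
  "dtype": "Q4_K_M",
  "backend_kind": "metal"
}
```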
## Usage
### Python

```python
import json
import atelico
engine = atelico.Engine()
model = "in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M"
engine.load_model(model)  # optional warm-up; the primitives also load on demand

# Inspect the model
caps = json.loads(engine.model_capabilities(model))
print(f"context window = {caps['max_position_embeddings']} tokens, "
f"running as {caps['dtype']} on {caps['backend_kind']}")
# Budget a prompt against the context window
prompt = "You are a tavern keeper named Boris..."
tokens = engine.tokenize(model, prompt)
budget = caps["max_position_embeddings"] - len(tokens)
print(f"prompt = {len(tokens)} tokens, {budget} tokens remaining for the response")
# Round-trip
text = engine.detokenize(model, tokens, skip_special_tokens=True)
assert text.strip() == prompt
```

### Godot (GDScript)

```gdscript
# AtelicoEngineNode is the singleton node exposed by the Godot extension
@onready var engine: AtelicoEngineNode = $AtelicoEngine

func budget_prompt(model: String, prompt: String) -> int:
    var caps_json := engine.llm_model_capabilities(model)
    if caps_json.is_empty():
        push_error("model_capabilities failed for %s" % model)
        return -1
    var caps: Dictionary = JSON.parse_string(caps_json)
    print("device=%s dtype=%s ctx=%d" % [caps.backend_kind, caps.dtype, int(caps.max_position_embeddings)])
    var tokens_json := engine.llm_tokenize(model, prompt)
    var tokens: Array = JSON.parse_string(tokens_json)["tokens"]
    return int(caps.max_position_embeddings) - tokens.size()

func decode_tokens(model: String, tokens: Array) -> String:
    var request := JSON.stringify({
        "model_id": model,
        "tokens": tokens,
        "skip_special_tokens": true
    })
    return JSON.parse_string(engine.llm_detokenize(request))["text"]
```

### Unity (C#)

```csharp
using System.Text.Json;
using UnityEngine;
var engine = new AtelicoEngine();
const string model = "in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M";
// Inspect the model
var capsJson = engine.Llm.ModelCapabilities(model);
using var capsDoc = JsonDocument.Parse(capsJson);
int ctx = capsDoc.RootElement.GetProperty("max_position_embeddings").GetInt32();
string dtype = capsDoc.RootElement.GetProperty("dtype").GetString();
Debug.Log($"context window = {ctx} tokens, dtype = {dtype}");
// Budget a prompt
var tokenizeJson = engine.Llm.Tokenize(model, "You are a tavern keeper named Boris...");
using var tokDoc = JsonDocument.Parse(tokenizeJson);
int promptTokens = tokDoc.RootElement.GetProperty("tokens").GetArrayLength();
int budget = ctx - promptTokens;
Debug.Log($"prompt = {promptTokens} tokens, {budget} remaining for the response");
// Decode
var detokRequest = JsonSerializer.Serialize(new {
    model_id = model,
    tokens = new[] { 128000, 9906, 11, 1917, 0 },
    skip_special_tokens = true,
});
var textJson = engine.Llm.Detokenize(detokRequest);
using var textDoc = JsonDocument.Parse(textJson);
string text = textDoc.RootElement.GetProperty("text").GetString();
```

### Unreal (C++)

```cpp
#include "Dom/JsonObject.h"
#include "Serialization/JsonReader.h"
#include "Serialization/JsonSerializer.h"

auto* Atelico = GEngine->GetEngineSubsystem<UAtelicoAISubsystem>();
const FString Model = TEXT("in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M");
// Inspect
FString CapsJson = Atelico->ModelCapabilities(Model);
TSharedPtr<FJsonObject> CapsObj;
auto Reader = TJsonReaderFactory<>::Create(CapsJson);
FJsonSerializer::Deserialize(Reader, CapsObj);
const int32 ContextWindow = CapsObj->GetIntegerField(TEXT("max_position_embeddings"));
const FString Dtype = CapsObj->GetStringField(TEXT("dtype"));
UE_LOG(LogTemp, Log, TEXT("[Atelico] ctx=%d dtype=%s"), ContextWindow, *Dtype);
// Budget a prompt
FString TokensJson = Atelico->Tokenize(Model, TEXT("You are a tavern keeper named Boris..."));
TSharedPtr<FJsonObject> TokensObj;
auto TokensReader = TJsonReaderFactory<>::Create(TokensJson);
FJsonSerializer::Deserialize(TokensReader, TokensObj);
const int32 PromptTokens = TokensObj->GetArrayField(TEXT("tokens")).Num();
UE_LOG(LogTemp, Log, TEXT("[Atelico] %d tokens, %d remaining"),
PromptTokens, ContextWindow - PromptTokens);
// Decode
const FString DetokRequest = TEXT(R"({
"model_id": "in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M",
"tokens": [128000, 9906, 11, 1917, 0],
"skip_special_tokens": true
})");
FString TextJson = Atelico->Detokenize(DetokRequest);
#include "atelico_ffi.h"
#include <stdio.h>
void inspect_and_budget(struct AtelicoEngine *engine, const char *model,
const char *prompt) {
const char *caps_json = NULL;
if (atelico_llm_model_capabilities(engine, model, &caps_json) == ATELICO_OK) {
printf("model capabilities: %s\n", caps_json);
}
const char *tokens_json = NULL;
if (atelico_llm_tokenize(engine, model, prompt, &tokens_json) == ATELICO_OK) {
printf("tokens: %s\n", tokens_json);
}
const char *text_json = NULL;
const char *detok_request =
"{\"model_id\":\"in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M\","
" \"tokens\":[128000,9906,11,1917,0],"
" \"skip_special_tokens\":true}";
if (atelico_llm_detokenize(engine, detok_request, &text_json) == ATELICO_OK) {
printf("decoded: %s\n", text_json);
}
}
```

### Rust SDK

```rust
use atelico_sdk::{Engine, EngineConfig};
# tokio_test::block_on(async {
let engine = Engine::new(EngineConfig::default()).await?;
let model = "in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M";
let caps = engine.llm().model_capabilities(model).await?;
println!("ctx={} dtype={} device={}",
caps.max_position_embeddings, caps.dtype, caps.backend_kind);
let tokens = engine.llm().tokenize(model, "Hello, world!").await?;
let text = engine.llm().detokenize(model, tokens, true).await?;
# Ok::<_, atelico_sdk::SdkError>(())
# });
```
## When to reach for these
- Prompt budgeting — measure a prompt's token cost before sending it; surface a "context full" warning to the user instead of silent truncation.
- Prefix design — pair with the Prefix Cache so the cached prefix matches your tokenization exactly.
- Custom samplers / tooling — drive your own decoding loop or build editor-style token-by-token introspection.
- Telemetry — log `dtype` and `backend_kind` alongside latency so a slow request can be diagnosed (CPU fallback, wrong quantization, etc.); a sketch follows below.
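As a minimal sketch of the telemetry idea, reusing only the Python calls shown in the Usage section above (timing `tokenize` here stands in for whatever request path you actually measure):

```python
import json
import logging
import time

import atelico

log = logging.getLogger("llm.telemetry")

engine = atelico.Engine()
model = "in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M"

# Capabilities are static for a loaded model, so fetch them once and
# attach them to every latency record.
caps = json.loads(engine.model_capabilities(model))

start = time.perf_counter()
tokens = engine.tokenize(model, "You are a tavern keeper named Boris...")
elapsed_ms = (time.perf_counter() - start) * 1000.0

# A CPU fallback or an unexpected quantization now shows up right next
# to the latency number it explains.
log.info(
    "tokenize latency=%.1fms tokens=%d dtype=%s backend=%s",
    elapsed_ms, len(tokens), caps["dtype"], caps["backend_kind"],
)
```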