Version: 0.9

Getting Started

This guide assumes you have an Atelico server bundle containing:

  • atelico-server -- the inference server binary
  • atelico-asset-downloader -- the model download tool

1. Download a Model

List available models:

./atelico-asset-downloader list --namespace models

Download a model. For a quick start, the 1B quantized model is small and fast:

./atelico-asset-downloader download meta-llama/Llama-3.2-1B-Instruct-Q4_K_M

For better quality responses, use the 3B model:

./atelico-asset-downloader download meta-llama/Llama-3.2-3B-Instruct-Q4_K_M

Or use interactive mode to browse and select:

./atelico-asset-downloader interactive

Models are cached locally and only need to be downloaded once:

  • macOS: ~/Library/Caches/atelico/models/
  • Linux: ~/.cache/atelico/models/
  • Windows: %LOCALAPPDATA%\atelico\models\
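
If you want to see what is already cached, the paths above can be resolved with a few lines of Python. This is only a convenience sketch based on the directory locations listed here; the layout inside models/ is not documented and may differ.

import os
import platform
from pathlib import Path

def atelico_model_cache() -> Path:
    """Return the model cache directory for the current platform (paths listed above)."""
    system = platform.system()
    if system == "Darwin":  # macOS
        return Path.home() / "Library" / "Caches" / "atelico" / "models"
    if system == "Windows":
        return Path(os.environ["LOCALAPPDATA"]) / "atelico" / "models"
    # Linux and other Unix-likes
    return Path.home() / ".cache" / "atelico" / "models"

cache = atelico_model_cache()
if cache.exists():
    for entry in sorted(cache.iterdir()):
        print(entry.name)
else:
    print(f"Nothing cached yet ({cache})")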

2. Start the Server

./atelico-server

The server starts on port 11434 and auto-detects your GPU:

  • Mac: uses Metal (Apple Silicon GPU)
  • NVIDIA: uses CUDA
  • No GPU: falls back to CPU

To use a different port:

./atelico-server --port 8080

3. Verify It's Running

curl http://localhost:11434/v1/models

You should see a list of available models:

{
  "object": "list",
  "data": [
    {"id": "in-memory::meta-llama/Llama-3.2-1B-Instruct-Q4_K_M", "object": "model", "owned_by": "atelico"},
    ...
  ]
}
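
The same check works from Python using only the standard library; the endpoint and response shape are exactly what the curl example above returns:

import json
import urllib.request

# Query the OpenAI-compatible /v1/models endpoint and print each model id.
with urllib.request.urlopen("http://localhost:11434/v1/models") as resp:
    models = json.load(resp)

for model in models["data"]:
    print(model["id"])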

4. Send Your First Request

curl http://localhost:11434/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "in-memory::meta-llama/Llama-3.2-1B-Instruct-Q4_K_M",
    "messages": [{"role": "user", "content": "Hello! What can you do?"}]
  }'

That's it. The first request takes a few seconds while the model loads into GPU memory. Subsequent requests are fast.
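
If you would rather script this step than use curl, the same request can be sent with the Python standard library. The response is parsed using the standard OpenAI chat-completion shape (choices[0].message.content), which an OpenAI-compatible server is expected to return:

import json
import urllib.request

body = {
    "model": "in-memory::meta-llama/Llama-3.2-1B-Instruct-Q4_K_M",
    "messages": [{"role": "user", "content": "Hello! What can you do?"}],
}

req = urllib.request.Request(
    "http://localhost:11434/v1/chat/completions",
    data=json.dumps(body).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

# The first call may take a few seconds while the model is loaded.
with urllib.request.urlopen(req) as resp:
    reply = json.load(resp)

print(reply["choices"][0]["message"]["content"])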

5. Use from Your Engine or Language

The server speaks the OpenAI API protocol. Any client library that works with OpenAI works with Atelico -- just point it at http://localhost:11434/v1.

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="not-needed",  # required by the library but not used
)

response = client.chat.completions.create(
    model="in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M",
    messages=[{"role": "user", "content": "Tell me a short joke"}],
    temperature=0.7,
)

print(response.choices[0].message.content)
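
Streaming uses the same client. The snippet below continues the example above and assumes the server implements OpenAI's standard stream parameter, which is not confirmed here:

# Assumption: the server supports OpenAI-style streamed chat completions.
stream = client.chat.completions.create(
    model="in-memory::meta-llama/Llama-3.2-3B-Instruct-Q4_K_M",
    messages=[{"role": "user", "content": "Tell me a short joke"}],
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()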

Next Steps