llm

package
v0.0.0-...-e1ffba4
Published: Nov 3, 2025 License: MIT Imports: 37 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")

var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

Functions

func LoadModel

func LoadModel(model string, maxArraySize int) (*ggml.GGML, error)

LoadModel will load a model from disk. The model must be in the GGML format.

It collects array values for arrays with a size less than or equal to maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If maxArraySize is negative, all arrays are collected.
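
A minimal sketch of calling LoadModel; the file path is illustrative and the llm qualifier assumes the package is imported by the caller:

	f, err := llm.LoadModel("/models/example.gguf", 0) // 0 falls back to the default maxArraySize of 1024
	if err != nil {
		log.Fatal(err)
	}
	_ = f // *ggml.GGML with the parsed model metadata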

func StartRunner

func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error)
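
StartRunner has no doc comment; the sketch below is an assumption about typical use, launching a runner subprocess for a model and streaming its output to os.Stderr. All argument values are placeholders:

	cmd, port, err := llm.StartRunner(true, "/models/example.gguf", nil, os.Stderr, nil)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("runner pid %d listening on port %d", cmd.Process.Pid, port)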

Types

type CompletionRequest

type CompletionRequest struct {
	Prompt  string
	Format  json.RawMessage
	Images  []ImageData
	Options *api.Options

	Grammar  string // set before sending the request to the subprocess
	Shift    bool
	Truncate bool
}

type CompletionResponse

type CompletionResponse struct {
	Content            string        `json:"content"`
	DoneReason         DoneReason    `json:"done_reason"`
	Done               bool          `json:"done"`
	PromptEvalCount    int           `json:"prompt_eval_count"`
	PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
	EvalCount          int           `json:"eval_count"`
	EvalDuration       time.Duration `json:"eval_duration"`
}
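
An illustrative CompletionRequest paired with a callback that consumes streamed CompletionResponse values; the prompt and option values are placeholders, and both would be passed to LlamaServer.Completion (see below):

	req := llm.CompletionRequest{
		Prompt:   "Why is the sky blue?",
		Options:  &api.Options{},
		Shift:    true,
		Truncate: true,
	}
	fn := func(resp llm.CompletionResponse) {
		fmt.Print(resp.Content)
		if resp.Done {
			fmt.Printf("\n%s: %d tokens in %v\n", resp.DoneReason, resp.EvalCount, resp.EvalDuration)
		}
	}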

type DetokenizeRequest

type DetokenizeRequest struct {
	Tokens []int `json:"tokens"`
}

type DetokenizeResponse

type DetokenizeResponse struct {
	Content string `json:"content"`
}

type DoneReason

type DoneReason int

DoneReason represents the reason why a completion response is done

const (
	// DoneReasonStop indicates the completion stopped naturally
	DoneReasonStop DoneReason = iota
	// DoneReasonLength indicates the completion stopped due to length limits
	DoneReasonLength
	// DoneReasonConnectionClosed indicates the completion stopped due to the connection being closed
	DoneReasonConnectionClosed
)

func (DoneReason) String

func (d DoneReason) String() string
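
Handling the reason on a final response might look like the following sketch, where resp is a CompletionResponse received in a Completion callback:

	switch resp.DoneReason {
	case llm.DoneReasonStop:
		// the completion finished naturally
	case llm.DoneReasonLength:
		// a length limit was reached
	case llm.DoneReasonConnectionClosed:
		// the client connection went away before completion
	}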

type EmbeddingRequest

type EmbeddingRequest struct {
	Content string `json:"content"`
}

type EmbeddingResponse

type EmbeddingResponse struct {
	Embedding []float32 `json:"embedding"`
}

type ImageData

type ImageData struct {
	Data []byte `json:"data"`
	ID   int    `json:"id"`
}

type LlamaServer

type LlamaServer interface {
	ModelPath() string
	Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error)
	Ping(ctx context.Context) error
	WaitUntilRunning(ctx context.Context) error
	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
	Embedding(ctx context.Context, input string) ([]float32, error)
	Tokenize(ctx context.Context, content string) ([]int, error)
	Detokenize(ctx context.Context, tokens []int) (string, error)
	Close() error
	VRAMSize() uint64 // Total VRAM across all GPUs
	TotalSize() uint64
	VRAMByGPU(id ml.DeviceID) uint64
	Pid() int
	GetPort() int
	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
	HasExited() bool
}
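
Illustrative calls against an existing LlamaServer value named server (ctx is a context.Context; error handling elided). Tokenize and Detokenize round-trip text, and Embedding returns a vector for the input:

	tokens, _ := server.Tokenize(ctx, "hello world")
	text, _ := server.Detokenize(ctx, tokens)
	vec, _ := server.Embedding(ctx, "hello world")
	fmt.Println(len(tokens), text, len(vec))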

func NewLlamaServer

func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error)

NewLlamaServer will run a server for the given GPUs
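
A hypothetical end-to-end lifecycle, assuming systemInfo, gpus, opts, and modelPath have already been gathered by the caller. The ordering of Load, WaitUntilRunning, Completion, and Close follows the interface above but is a sketch, not a prescribed pattern:

	f, err := llm.LoadModel(modelPath, 0)
	if err != nil {
		log.Fatal(err)
	}
	server, err := llm.NewLlamaServer(systemInfo, gpus, modelPath, f, nil, nil, opts, 1)
	if err != nil {
		log.Fatal(err)
	}
	defer server.Close()

	if _, err := server.Load(ctx, systemInfo, gpus, false); err != nil {
		log.Fatal(err)
	}
	if err := server.WaitUntilRunning(ctx); err != nil {
		log.Fatal(err)
	}
	err = server.Completion(ctx, llm.CompletionRequest{Prompt: "Hello"}, func(r llm.CompletionResponse) {
		fmt.Print(r.Content)
	})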

type LoadOperation

type LoadOperation int
const (
	LoadOperationFit    LoadOperation = iota // Return memory requirements but do not allocate
	LoadOperationAlloc                       // Allocate memory but do not load the weights
	LoadOperationCommit                      // Load weights - further changes cannot be made after this
	LoadOperationClose                       // Close model and free memory
)

The order of these constants is significant because we iterate over the operations. They should be ordered by increasing degree of model loading.
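
Because the values are ordered this way, code can step through the operations from fitting to committing; the loop below is illustrative only:

	for op := llm.LoadOperationFit; op <= llm.LoadOperationCommit; op++ {
		fmt.Println(op) // uses LoadOperation.String
	}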

func (LoadOperation) String

func (o LoadOperation) String() string

type LoadRequest

type LoadRequest struct {
	Operation LoadOperation

	LoraPath       []string
	Parallel       int
	BatchSize      int
	FlashAttention bool
	KvSize         int
	KvCacheType    string
	NumThreads     int
	GPULayers      ml.GPULayersList
	MultiUserCache bool

	// Legacy fields - not used with the Ollama engine
	ProjectorPath string
	MainGPU       int
	UseMmap       bool
}

type LoadResponse

type LoadResponse struct {
	Success bool
	Memory  ml.BackendMemory
}
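
An illustrative LoadRequest for a fit check; every value below is a placeholder, not a recommended setting:

	req := llm.LoadRequest{
		Operation:      llm.LoadOperationFit,
		Parallel:       1,
		BatchSize:      512,
		FlashAttention: true,
		KvSize:         4096,
		KvCacheType:    "f16",
		NumThreads:     8,
	}
	_ = req // the resulting LoadResponse reports Success and the ml.BackendMemory used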

type MemoryEstimate

type MemoryEstimate struct {
	// How many layers we predict we can load
	Layers int

	// The size of the graph which occupies the main GPU
	Graph uint64

	// How much VRAM will be allocated given the number of layers we predict
	VRAMSize uint64

	// The total size of the model if loaded into VRAM.  If all layers are loaded, VRAMSize == TotalSize
	TotalSize uint64

	// For multi-GPU scenarios, this provides the tensor split parameter
	TensorSplit []int

	// For multi-GPU scenarios, this is the size in bytes per GPU
	GPUSizes []uint64
	// contains filtered or unexported fields
}

func (MemoryEstimate) LogValue

func (m MemoryEstimate) LogValue() slog.Value
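
Because MemoryEstimate implements slog.LogValuer, a value can be passed directly to a structured logger, for example (m is a MemoryEstimate):

	slog.Info("memory estimate", "estimate", m)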

type ServerStatus

type ServerStatus int
const (
	ServerStatusReady ServerStatus = iota
	ServerStatusNoSlotsAvailable
	ServerStatusLaunched
	ServerStatusLoadingModel
	ServerStatusNotResponding
	ServerStatusError
)

func (ServerStatus) String

func (s ServerStatus) String() string

type ServerStatusResponse

type ServerStatusResponse struct {
	Status   ServerStatus `json:"status"`
	Progress float32      `json:"progress"`
}

type StatusWriter

type StatusWriter struct {
	LastErrMsg string
	// contains filtered or unexported fields
}

StatusWriter is a writer that captures error messages from the llama runner process

func NewStatusWriter

func NewStatusWriter(out *os.File) *StatusWriter

func (*StatusWriter) Write

func (w *StatusWriter) Write(b []byte) (int, error)
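
A sketch of wiring a StatusWriter to a runner subprocess so that the last error line is retained; the command path is a placeholder:

	sw := llm.NewStatusWriter(os.Stderr)
	cmd := exec.Command("/path/to/runner") // placeholder
	cmd.Stderr = sw
	if err := cmd.Run(); err != nil && sw.LastErrMsg != "" {
		log.Printf("runner failed: %s", sw.LastErrMsg)
	}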

type TokenizeRequest

type TokenizeRequest struct {
	Content string `json:"content"`
}

type TokenizeResponse

type TokenizeResponse struct {
	Tokens []int `json:"tokens"`
}
