Documentation ¶
Index ¶
- Variables
- func LoadModel(model string, maxArraySize int) (*ggml.GGML, error)
- func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, ...) (cmd *exec.Cmd, port int, err error)
- type CompletionRequest
- type CompletionResponse
- type DetokenizeRequest
- type DetokenizeResponse
- type DoneReason
- type EmbeddingRequest
- type EmbeddingResponse
- type ImageData
- type LlamaServer
- type LoadOperation
- type LoadRequest
- type LoadResponse
- type MemoryEstimate
- type ServerStatus
- type ServerStatusResponse
- type StatusWriter
- type TokenizeRequest
- type TokenizeResponse
Constants ¶
This section is empty.
Variables ¶
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
Functions ¶
func LoadModel ¶
func LoadModel(model string, maxArraySize int) (*ggml.GGML, error)
func StartRunner ¶
func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, ...) (cmd *exec.Cmd, port int, err error)
Types ¶
type CompletionRequest ¶
type CompletionResponse ¶
type CompletionResponse struct {
    Content            string        `json:"content"`
    DoneReason         DoneReason    `json:"done_reason"`
    Done               bool          `json:"done"`
    PromptEvalCount    int           `json:"prompt_eval_count"`
    PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
    EvalCount          int           `json:"eval_count"`
    EvalDuration       time.Duration `json:"eval_duration"`
}
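The counters in the final streamed chunk can be used to derive simple throughput figures. The sketch below is illustrative only and is not part of the package; it assumes the package is imported as llm and that resp is a CompletionResponse delivered to a Completion callback.

func reportStats(resp llm.CompletionResponse) {
    // Read the counters from the chunk that reports Done.
    if !resp.Done || resp.EvalDuration <= 0 {
        return
    }
    tokensPerSecond := float64(resp.EvalCount) / resp.EvalDuration.Seconds()
    fmt.Printf("stopped (%s): %d tokens in %s (%.1f tok/s)\n",
        resp.DoneReason, resp.EvalCount, resp.EvalDuration, tokensPerSecond)
}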
type DetokenizeRequest ¶
type DetokenizeRequest struct {
    Tokens []int `json:"tokens"`
}
type DetokenizeResponse ¶
type DetokenizeResponse struct {
    Content string `json:"content"`
}
type DoneReason ¶
type DoneReason int
DoneReason represents the reason why a completion response is done
const (
    // DoneReasonStop indicates the completion stopped naturally
    DoneReasonStop DoneReason = iota
    // DoneReasonLength indicates the completion stopped due to length limits
    DoneReasonLength
    // DoneReasonConnectionClosed indicates the completion stopped due to the connection being closed
    DoneReasonConnectionClosed
)
func (DoneReason) String ¶
func (d DoneReason) String() string
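A hedged illustration of how a caller might act on the reported DoneReason; the helper below is hypothetical and assumes the package is imported as llm.

func describeDone(r llm.DoneReason) string {
    switch r {
    case llm.DoneReasonStop:
        return "the model stopped naturally"
    case llm.DoneReasonLength:
        return "the response hit a length limit"
    case llm.DoneReasonConnectionClosed:
        return "the client connection was closed"
    default:
        return "unknown reason: " + r.String()
    }
}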
type EmbeddingRequest ¶
type EmbeddingRequest struct {
    Content string `json:"content"`
}
type EmbeddingResponse ¶
type EmbeddingResponse struct {
    Embedding []float32 `json:"embedding"`
}
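A rough usage sketch, assuming s is a loaded and running LlamaServer (defined below): Embedding returns the raw vector, and the example computes its L2 norm as one possible caller-side post-processing step.

func embeddingNorm(ctx context.Context, s llm.LlamaServer, text string) (float64, error) {
    vec, err := s.Embedding(ctx, text)
    if err != nil {
        return 0, err
    }
    var sum float64
    for _, v := range vec {
        sum += float64(v) * float64(v)
    }
    return math.Sqrt(sum), nil
}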
type LlamaServer ¶
type LlamaServer interface {
    ModelPath() string
    Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error)
    Ping(ctx context.Context) error
    WaitUntilRunning(ctx context.Context) error
    Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
    Embedding(ctx context.Context, input string) ([]float32, error)
    Tokenize(ctx context.Context, content string) ([]int, error)
    Detokenize(ctx context.Context, tokens []int) (string, error)
    Close() error
    VRAMSize() uint64 // Total VRAM across all GPUs
    TotalSize() uint64
    VRAMByGPU(id ml.DeviceID) uint64
    Pid() int
    GetPort() int
    GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
    HasExited() bool
}
func NewLlamaServer ¶
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error)
NewLlamaServer will run a server for the given GPUs
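The following is a rough lifecycle sketch rather than a prescribed recipe: it assumes the caller has already gathered system and GPU information, parsed the GGUF file, and built api.Options (in Ollama these steps happen elsewhere, e.g. in the scheduler). Passing requireFull=false to Load allows a partial GPU offload; requireFull=true is presumably what can surface ErrLoadRequiredFull.

func startModel(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo,
    modelPath string, f *ggml.GGML, opts api.Options) (llm.LlamaServer, error) {

    s, err := llm.NewLlamaServer(systemInfo, gpus, modelPath, f, nil, nil, opts, 1)
    if err != nil {
        return nil, err
    }
    // Place the model across the available devices, allowing partial offload.
    if _, err := s.Load(ctx, systemInfo, gpus, false); err != nil {
        s.Close()
        return nil, err
    }
    // Block until the runner subprocess reports it is ready to serve.
    if err := s.WaitUntilRunning(ctx); err != nil {
        s.Close()
        return nil, err
    }
    return s, nil
}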
type LoadOperation ¶
type LoadOperation int
const (
    LoadOperationFit    LoadOperation = iota // Return memory requirements but do not allocate
    LoadOperationAlloc                       // Allocate memory but do not load the weights
    LoadOperationCommit                      // Load weights - further changes cannot be made after this
    LoadOperationClose                       // Close model and free memory
)
The order of these constants is significant because we iterate over the operations. They should be ordered from the least to the most complete stage of loading the model.
func (LoadOperation) String ¶
func (o LoadOperation) String() string
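Because the constants are declared in that order, a simple loop visits the operations from least to most committed. Illustrative fragment only, assuming the package is imported as llm:

func listLoadStages() {
    // Stops before LoadOperationClose, which frees the model.
    for op := llm.LoadOperationFit; op <= llm.LoadOperationCommit; op++ {
        fmt.Println(op) // formatted via the String method
    }
}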
type LoadRequest ¶
type LoadRequest struct {
    Operation      LoadOperation
    LoraPath       []string
    Parallel       int
    BatchSize      int
    FlashAttention bool
    KvSize         int
    KvCacheType    string
    NumThreads     int
    GPULayers      ml.GPULayersList
    MultiUserCache bool

    // Legacy fields - not used with the Ollama engine
    ProjectorPath string
    MainGPU       int
    UseMmap       bool
}
type LoadResponse ¶
type LoadResponse struct {
    Success bool
    Memory  ml.BackendMemory
}
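A hypothetical LoadRequest for the Ollama engine might look like the following; every value shown is made up for illustration, and the legacy fields are simply left at their zero values.

req := llm.LoadRequest{
    Operation:      llm.LoadOperationCommit,
    Parallel:       1,
    BatchSize:      512,
    FlashAttention: true,
    KvSize:         4096,
    KvCacheType:    "f16", // assumed cache type; check the supported values
    NumThreads:     8,
    // ProjectorPath, MainGPU, and UseMmap are legacy fields for the llama.cpp runner.
}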
type MemoryEstimate ¶
type MemoryEstimate struct {
    // How many layers we predict we can load
    Layers int

    // The size of the graph which occupies the main GPU
    Graph uint64

    // How much VRAM will be allocated given the number of layers we predict
    VRAMSize uint64

    // The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
    TotalSize uint64

    // For multi-GPU scenarios, this provides the tensor split parameter
    TensorSplit []int

    // For multi-GPU scenarios, this is the size in bytes per GPU
    GPUSizes []uint64
    // contains filtered or unexported fields
}
func (MemoryEstimate) LogValue ¶
func (m MemoryEstimate) LogValue() slog.Value
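Since LogValue matches the slog.LogValuer interface, a MemoryEstimate can be passed directly as a structured logging attribute. A minimal sketch, assuming estimate holds a MemoryEstimate:

slog.Info("memory estimate", "estimate", estimate) // slog calls estimate.LogValue()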
type ServerStatus ¶
type ServerStatus int
const (
    ServerStatusReady ServerStatus = iota
    ServerStatusNoSlotsAvailable
    ServerStatusLaunched
    ServerStatusLoadingModel
    ServerStatusNotResponding
    ServerStatusError
)
func (ServerStatus) String ¶
func (s ServerStatus) String() string
type ServerStatusResponse ¶
type ServerStatusResponse struct {
    Status   ServerStatus `json:"status"`
    Progress float32      `json:"progress"`
}
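The JSON tags make the struct straightforward to decode from a status payload. The sketch below assumes the caller already has the response body as an io.Reader; the endpoint that produces it is not part of this package's exported surface.

func decodeStatus(body io.Reader) (llm.ServerStatusResponse, error) {
    var status llm.ServerStatusResponse
    if err := json.NewDecoder(body).Decode(&status); err != nil {
        return llm.ServerStatusResponse{}, err
    }
    return status, nil
}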
type StatusWriter ¶
type StatusWriter struct {
    LastErrMsg string
    // contains filtered or unexported fields
}
StatusWriter is a writer that captures error messages from the llama runner process
func NewStatusWriter ¶
func NewStatusWriter(out *os.File) *StatusWriter
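One plausible wiring, sketched under the assumption that the StatusWriter is attached to the runner subprocess's stderr (the exact hookup is an internal detail); runnerPath and runnerArgs are hypothetical.

sw := llm.NewStatusWriter(os.Stderr)
cmd := exec.Command(runnerPath, runnerArgs...) // hypothetical binary and arguments
cmd.Stderr = sw
if err := cmd.Run(); err != nil {
    return fmt.Errorf("runner failed: %w: %s", err, sw.LastErrMsg)
}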
type TokenizeRequest ¶
type TokenizeRequest struct {
    Content string `json:"content"`
}
type TokenizeResponse ¶
type TokenizeResponse struct {
    Tokens []int `json:"tokens"`
}
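A small round-trip sketch tying the two types together, assuming s is a running LlamaServer and the package is imported as llm:

func roundTrip(ctx context.Context, s llm.LlamaServer, prompt string) error {
    tokens, err := s.Tokenize(ctx, prompt)
    if err != nil {
        return err
    }
    text, err := s.Detokenize(ctx, tokens)
    if err != nil {
        return err
    }
    fmt.Printf("%d tokens -> %q\n", len(tokens), text)
    return nil
}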