Documentation ¶
Index ¶
- Variables
- func LoadModel(model string, maxArraySize int) (*ggml.GGML, error)
- func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, ...) (cmd *exec.Cmd, port int, err error)
- type CompletionRequest
- type CompletionResponse
- type DetokenizeRequest
- type DetokenizeResponse
- type DoneReason
- type EmbeddingRequest
- type EmbeddingResponse
- type ImageData
- type LlamaServer
- type LoadOperation
- type LoadRequest
- type LoadResponse
- type MemoryEstimate
- type ServerStatus
- type ServerStatusResponse
- type StatusWriter
- type TokenizeRequest
- type TokenizeResponse
Constants ¶
This section is empty.
Variables ¶
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
Functions ¶
func LoadModel ¶
func LoadModel(model string, maxArraySize int) (*ggml.GGML, error)
func StartRunner ¶
func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, ...) (cmd *exec.Cmd, port int, err error)
Types ¶
type CompletionRequest ¶
type CompletionResponse ¶
type CompletionResponse struct {
    Content            string        `json:"content"`
    DoneReason         DoneReason    `json:"done_reason"`
    Done               bool          `json:"done"`
    PromptEvalCount    int           `json:"prompt_eval_count"`
    PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
    EvalCount          int           `json:"eval_count"`
    EvalDuration       time.Duration `json:"eval_duration"`
}
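The counters in the final streamed chunk can be used to derive simple throughput figures. The sketch below is illustrative only and is not part of the package; it assumes the package is imported as llm and that resp is a CompletionResponse delivered to a Completion callback.

func reportStats(resp llm.CompletionResponse) {
    // Read the counters from the chunk that reports Done.
    if !resp.Done || resp.EvalDuration <= 0 {
        return
    }
    tokensPerSecond := float64(resp.EvalCount) / resp.EvalDuration.Seconds()
    fmt.Printf("stopped (%s): %d tokens in %s (%.1f tok/s)\n",
        resp.DoneReason, resp.EvalCount, resp.EvalDuration, tokensPerSecond)
}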
type DetokenizeRequest ¶
type DetokenizeRequest struct {
    Tokens []int `json:"tokens"`
}
type DetokenizeResponse ¶
type DetokenizeResponse struct {
    Content string `json:"content"`
}
type DoneReason ¶
type DoneReason int
DoneReason represents the reason why a completion response is done
const (
    // DoneReasonStop indicates the completion stopped naturally
    DoneReasonStop DoneReason = iota
    // DoneReasonLength indicates the completion stopped due to length limits
    DoneReasonLength
    // DoneReasonConnectionClosed indicates the completion stopped due to the connection being closed
    DoneReasonConnectionClosed
)
func (DoneReason) String ¶
func (d DoneReason) String() string
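A hedged illustration of how a caller might act on the reported DoneReason; the helper below is hypothetical and assumes the package is imported as llm.

func describeDone(r llm.DoneReason) string {
    switch r {
    case llm.DoneReasonStop:
        return "the model stopped naturally"
    case llm.DoneReasonLength:
        return "the response hit a length limit"
    case llm.DoneReasonConnectionClosed:
        return "the client connection was closed"
    default:
        return "unknown reason: " + r.String()
    }
}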
type EmbeddingRequest ¶
type EmbeddingRequest struct {
    Content string `json:"content"`
}
type EmbeddingResponse ¶
type EmbeddingResponse struct {
    Embedding []float32 `json:"embedding"`
}
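A rough usage sketch, assuming s is a loaded and running LlamaServer (defined below): Embedding returns the raw vector, and the example computes its L2 norm as one possible caller-side post-processing step.

func embeddingNorm(ctx context.Context, s llm.LlamaServer, text string) (float64, error) {
    vec, err := s.Embedding(ctx, text)
    if err != nil {
        return 0, err
    }
    var sum float64
    for _, v := range vec {
        sum += float64(v) * float64(v)
    }
    return math.Sqrt(sum), nil
}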
type LlamaServer ¶
type LlamaServer interface {
    ModelPath() string
    Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error)
    Ping(ctx context.Context) error
    WaitUntilRunning(ctx context.Context) error
    Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
    Embedding(ctx context.Context, input string) ([]float32, error)
    Tokenize(ctx context.Context, content string) ([]int, error)
    Detokenize(ctx context.Context, tokens []int) (string, error)
    Close() error
    VRAMSize() uint64 // Total VRAM across all GPUs
    TotalSize() uint64
    VRAMByGPU(id ml.DeviceID) uint64
    Pid() int
    GetPort() int
    GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
    HasExited() bool
}
func NewLlamaServer ¶
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error)
NewLlamaServer will run a server for the given GPUs
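The following is a rough lifecycle sketch rather than a prescribed recipe: it assumes the caller has already gathered system and GPU information, parsed the GGUF file, and built api.Options (in Ollama these steps happen elsewhere, e.g. in the scheduler). Passing requireFull=false to Load allows a partial GPU offload; requireFull=true is presumably what can surface ErrLoadRequiredFull.

func startModel(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo,
    modelPath string, f *ggml.GGML, opts api.Options) (llm.LlamaServer, error) {

    s, err := llm.NewLlamaServer(systemInfo, gpus, modelPath, f, nil, nil, opts, 1)
    if err != nil {
        return nil, err
    }
    // Place the model across the available devices, allowing partial offload.
    if _, err := s.Load(ctx, systemInfo, gpus, false); err != nil {
        s.Close()
        return nil, err
    }
    // Block until the runner subprocess reports it is ready to serve.
    if err := s.WaitUntilRunning(ctx); err != nil {
        s.Close()
        return nil, err
    }
    return s, nil
}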
type LoadOperation ¶
type LoadOperation int
const (
    LoadOperationFit    LoadOperation = iota // Return memory requirements but do not allocate
    LoadOperationAlloc                       // Allocate memory but do not load the weights
    LoadOperationCommit                      // Load weights - further changes cannot be made after this
    LoadOperationClose                       // Close model and free memory
)
The order of these constants is significant because we iterate over the operations. They should be ordered from the least to the most complete stage of loading the model.
func (LoadOperation) String ¶
func (o LoadOperation) String() string
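Because the constants are declared in that order, a simple loop visits the operations from least to most committed. Illustrative fragment only, assuming the package is imported as llm:

func listLoadStages() {
    // Stops before LoadOperationClose, which frees the model.
    for op := llm.LoadOperationFit; op <= llm.LoadOperationCommit; op++ {
        fmt.Println(op) // formatted via the String method
    }
}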
type LoadRequest ¶
type LoadRequest struct {
    Operation      LoadOperation
    LoraPath       []string
    Parallel       int
    BatchSize      int
    FlashAttention bool
    KvSize         int
    KvCacheType    string
    NumThreads     int
    GPULayers      ml.GPULayersList
    MultiUserCache bool

    // Legacy fields - not used with the Ollama engine
    ProjectorPath string
    MainGPU       int
    UseMmap       bool
}
type LoadResponse ¶
type LoadResponse struct {
    Success bool
    Memory  ml.BackendMemory
}
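A hypothetical LoadRequest for the Ollama engine might look like the following; every value shown is made up for illustration, and the legacy fields are simply left at their zero values.

req := llm.LoadRequest{
    Operation:      llm.LoadOperationCommit,
    Parallel:       1,
    BatchSize:      512,
    FlashAttention: true,
    KvSize:         4096,
    KvCacheType:    "f16", // assumed cache type; check the supported values
    NumThreads:     8,
    // ProjectorPath, MainGPU, and UseMmap are legacy fields for the llama.cpp runner.
}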
type MemoryEstimate ¶
type MemoryEstimate struct {
    // How many layers we predict we can load
    Layers int

    // The size of the graph which occupies the main GPU
    Graph uint64

    // How much VRAM will be allocated given the number of layers we predict
    VRAMSize uint64

    // The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
    TotalSize uint64

    // For multi-GPU scenarios, this provides the tensor split parameter
    TensorSplit []int

    // For multi-GPU scenarios, this is the size in bytes per GPU
    GPUSizes []uint64
    // contains filtered or unexported fields
}
func (MemoryEstimate) LogValue ¶
func (m MemoryEstimate) LogValue() slog.Value
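Since LogValue matches the slog.LogValuer interface, a MemoryEstimate can be passed directly as a structured logging attribute. A minimal sketch, assuming estimate holds a MemoryEstimate:

slog.Info("memory estimate", "estimate", estimate) // slog calls estimate.LogValue()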
type ServerStatus ¶
type ServerStatus int
const (
    ServerStatusReady ServerStatus = iota
    ServerStatusNoSlotsAvailable
    ServerStatusLaunched
    ServerStatusLoadingModel
    ServerStatusNotResponding
    ServerStatusError
)
func (ServerStatus) String ¶
func (s ServerStatus) String() string
type ServerStatusResponse ¶
type ServerStatusResponse struct {
    Status   ServerStatus `json:"status"`
    Progress float32      `json:"progress"`
}
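The JSON tags make the struct straightforward to decode from a status payload. The sketch below assumes the caller already has the response body as an io.Reader; the endpoint that produces it is not part of this package's exported surface.

func decodeStatus(body io.Reader) (llm.ServerStatusResponse, error) {
    var status llm.ServerStatusResponse
    if err := json.NewDecoder(body).Decode(&status); err != nil {
        return llm.ServerStatusResponse{}, err
    }
    return status, nil
}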
type StatusWriter ¶
type StatusWriter struct {
    LastErrMsg string
    // contains filtered or unexported fields
}
StatusWriter is a writer that captures error messages from the llama runner process
func NewStatusWriter ¶
func NewStatusWriter(out *os.File) *StatusWriter
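One plausible wiring, sketched under the assumption that the StatusWriter is attached to the runner subprocess's stderr (the exact hookup is an internal detail); runnerPath and runnerArgs are hypothetical.

sw := llm.NewStatusWriter(os.Stderr)
cmd := exec.Command(runnerPath, runnerArgs...) // hypothetical binary and arguments
cmd.Stderr = sw
if err := cmd.Run(); err != nil {
    return fmt.Errorf("runner failed: %w: %s", err, sw.LastErrMsg)
}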
type TokenizeRequest ¶
type TokenizeRequest struct {
    Content string `json:"content"`
}
type TokenizeResponse ¶
type TokenizeResponse struct {
    Tokens []int `json:"tokens"`
}
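A small round-trip sketch tying the two types together, assuming s is a running LlamaServer and the package is imported as llm:

func roundTrip(ctx context.Context, s llm.LlamaServer, prompt string) error {
    tokens, err := s.Tokenize(ctx, prompt)
    if err != nil {
        return err
    }
    text, err := s.Detokenize(ctx, tokens)
    if err != nil {
        return err
    }
    fmt.Printf("%d tokens -> %q\n", len(tokens), text)
    return nil
}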