qwen25vl


Documentation

Package qwen25vl implements the Qwen 2.5 VL multimodal model: a byte-pair-encoding tokenizer and text decoder paired with a windowed vision transformer whose merged patch embeddings are fed to the language model.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func New

func New(c fs.Config) (model.Model, error)
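
New builds the combined text and vision model from a GGUF config. A minimal sketch of how such a constructor is typically wired up, assuming a model.Register helper that maps an architecture name to its constructor (an assumption; the registry is not documented on this page):

	// Hypothetical registration: lets the runtime construct this model
	// by name when it loads a matching GGUF file.
	func init() {
		model.Register("qwen25vl", New)
	}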

Types

type Grid

type Grid struct {
	Height   int
	Width    int
	Temporal int
}

type ImageProcessor

type ImageProcessor struct {
	// contains filtered or unexported fields
}

ImageProcessor contains configuration for Qwen 2.5 VL image preprocessing

func (*ImageProcessor) ProcessImage

func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
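
ProcessImage converts a decoded image into normalized pixel values plus the patch grid consumed by the vision encoder. A minimal usage sketch, assuming m is a loaded *Model (which embeds ImageProcessor); the helper name is hypothetical:

	// Assumes: import ( "fmt"; "image"; _ "image/jpeg"; "os" )
	func encodeImage(m *Model, path string) error {
		f, err := os.Open(path)
		if err != nil {
			return err
		}
		defer f.Close()

		// Decode requires a registered format, e.g. image/jpeg above.
		img, _, err := image.Decode(f)
		if err != nil {
			return err
		}

		pixels, grid, err := m.ProcessImage(img)
		if err != nil {
			return err
		}
		fmt.Println(len(pixels), grid.Height, grid.Width, grid.Temporal)
		return nil
	}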

func (*ImageProcessor) SmartResize

func (p *ImageProcessor) SmartResize(height, width int) (int, int)

SmartResize implements the smart-resize algorithm: it returns a height and width rounded to multiples of the patch factor, approximately preserving the aspect ratio while keeping the total pixel count within the processor's configured bounds
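
A standalone sketch of the rule as published for Qwen2-VL's preprocessor, with the factor and pixel bounds passed explicitly (in this package they are unexported fields of ImageProcessor):

	// Assumes: import "math"
	func smartResize(height, width, factor, minPixels, maxPixels int) (int, int) {
		// Round each side to the nearest multiple of the patch factor.
		h := int(math.Round(float64(height)/float64(factor))) * factor
		w := int(math.Round(float64(width)/float64(factor))) * factor

		if h*w > maxPixels {
			// Too large: shrink both sides by the same ratio, rounding down.
			beta := math.Sqrt(float64(height*width) / float64(maxPixels))
			h = int(math.Floor(float64(height)/beta/float64(factor))) * factor
			w = int(math.Floor(float64(width)/beta/float64(factor))) * factor
		} else if h*w < minPixels {
			// Too small: grow both sides by the same ratio, rounding up.
			beta := math.Sqrt(float64(minPixels) / float64(height*width))
			h = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
			w = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
		}
		return h, w
	}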

type Layer

type Layer struct {
	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
	SelfAttention *SelfAttention
	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
	MLP           *MLP
}

Layer represents a single transformer layer combining self-attention and feed-forward components

func (*Layer) Forward

func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor
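
The signature mirrors a standard pre-norm transformer block: normalize, attend, add the residual, then normalize, apply the MLP, and add the residual again. A toy sketch of that wiring on plain float64 slices (illustrative only; the real method operates on ml.Tensor and the kv cache):

	// Toy pre-norm residual block: x + attn(norm1(x)), then + mlp(norm2(x)).
	func blockForward(x []float64, norm1, norm2, attn, mlp func([]float64) []float64) []float64 {
		residual := x
		h := attn(norm1(x))
		for i := range h {
			h[i] += residual[i] // first residual connection
		}
		residual = h
		out := mlp(norm2(h))
		for i := range out {
			out[i] += residual[i] // second residual connection
		}
		return out
	}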

type MLP

type MLP struct {
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
	Gate *nn.Linear `gguf:"ffn_gate"`
}

MLP implements the feed-forward network component with SwiGLU activation

func (*MLP) Forward

func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor
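
A toy sketch of the SwiGLU computation, down(silu(gate(x)) * up(x)), shown on pre-projected vectors (the real method applies the Gate, Up, and Down linear layers to tensors):

	// Assumes: import "math"
	func silu(x float64) float64 { return x / (1 + math.Exp(-x)) }

	// swiGLU gates the up projection elementwise with silu(gate).
	func swiGLU(gate, up []float64) []float64 {
		h := make([]float64, len(up))
		for i := range up {
			h[i] = silu(gate[i]) * up[i]
		}
		return h // the Down projection then maps h back to the model width
	}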

type Model

type Model struct {
	model.Base
	model.BytePairEncoding

	*TextModel
	*VisionModel `gguf:"v"`

	ImageProcessor
}

func (*Model) EncodeMultimodal

func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error)

func (*Model) Forward

func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error)

func (*Model) PixelValues

func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error)

func (*Model) PostTokenize

func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error)

PostTokenize arranges Qwen 2.5 VL's text and image inputs into the layout expected by the forward pass

type PatchEmbedding

type PatchEmbedding struct {
	PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"`
	PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"`
}

func (*PatchEmbedding) Forward

func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor

type SelfAttention

type SelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_output"`
}

SelfAttention implements the multi-head self-attention mechanism, with separate projections for the query, key, value, and output transformations

func (*SelfAttention) Forward

func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor
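
A single-head toy of the underlying scaled dot-product attention, softmax(QK^T/sqrt(d))V, on plain slices (the real method projects with the four linear layers, applies rotary position embeddings, and reads and writes the kv cache):

	// Assumes: import "math"
	// attend scores one query against cached keys and mixes the values.
	// No max-subtraction in the softmax, for brevity.
	func attend(q []float64, keys, values [][]float64) []float64 {
		d := float64(len(q))
		scores := make([]float64, len(keys))
		var sum float64
		for i, k := range keys {
			var dot float64
			for j := range q {
				dot += q[j] * k[j]
			}
			scores[i] = math.Exp(dot / math.Sqrt(d))
			sum += scores[i]
		}
		out := make([]float64, len(values[0]))
		for i, v := range values {
			w := scores[i] / sum
			for j := range v {
				out[j] += w * v[j]
			}
		}
		return out
	}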

type TextModel

type TextModel struct {
	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
	Layers         []Layer       `gguf:"blk"`
	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
	Output         *nn.Linear    `gguf:"output,alt:token_embd"`

	*TextOptions
}

func NewTextModel

func NewTextModel(c fs.Config) *TextModel

func (*TextModel) Forward

func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error)

func (*TextModel) Shift

func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error)

Shift applies rotary position embeddings to the key tensor for causal attention caching
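
When the cache window slides, cached keys can be re-rotated by the position delta instead of being recomputed. A toy sketch rotating interleaved (even, odd) pairs by shift*theta_i (illustrative only; the pairing layout and rope parameters are model details, and the real method works on ml.Tensor):

	// Assumes: import "math"
	// ropeShift re-rotates a cached key vector by a position delta.
	func ropeShift(key []float64, shift int, base float64) []float64 {
		d := len(key)
		out := make([]float64, d)
		for i := 0; i < d; i += 2 {
			theta := float64(shift) * math.Pow(base, -float64(i)/float64(d))
			c, s := math.Cos(theta), math.Sin(theta)
			out[i] = key[i]*c - key[i+1]*s
			out[i+1] = key[i]*s + key[i+1]*c
		}
		return out
	}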

type TextOptions

type TextOptions struct {
	// contains filtered or unexported fields
}

type VisionEncoderLayer

type VisionEncoderLayer struct {
	Norm1         *nn.RMSNorm `gguf:"ln1"`
	SelfAttention *VisionSelfAttention
	Norm2         *nn.RMSNorm `gguf:"ln2"`
	MLP           *VisionMLP
}

func (*VisionEncoderLayer) Forward

func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor

type VisionMLP

type VisionMLP struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

VisionMLP implements the feed-forward network for the vision encoder layers, using gated (SwiGLU-style) Gate, Up, and Down projections

func (*VisionMLP) Forward

func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor

type VisionModel

type VisionModel struct {
	PatchEmbedding *PatchEmbedding
	Layers         []VisionEncoderLayer `gguf:"blk"`
	PatchMerger    *VisionPatchMerger   `gguf:"merger"`

	*VisionModelOptions
}

VisionModel implements the Qwen 2.5 VL vision encoder

func (*VisionModel) Forward

func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor

Forward runs the vision encoder over the pixel values, using grid to determine the patch layout, and returns the merged patch embeddings

func (*VisionModel) PositionalEmbedding

func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor

PositionalEmbedding generates rotary position embeddings for attention mechanisms
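
The vision encoder uses 2D rotary embeddings: each patch gets a (row, column) position, and part of each head's dimensions rotate by the row index while the rest rotate by the column index. A toy sketch producing the per-patch position pairs for a grid (illustrative; the real method builds cos/sin tensors):

	// gridPositions returns a (row, col) pair for every patch in a
	// height×width grid, in row-major order.
	func gridPositions(height, width int) [][2]int {
		pos := make([][2]int, 0, height*width)
		for r := 0; r < height; r++ {
			for c := 0; c < width; c++ {
				pos = append(pos, [2]int{r, c})
			}
		}
		return pos
	}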

func (*VisionModel) WindowIndex

func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int)

WindowIndex divides the grid into windows and returns:

  1. A tensor containing flattened indices of all grid points organized by windows
  2. A slice of boundaries that mark where each window's data begins and ends in the flattened representation, scaled by spatialMergeSize squared

The boundaries slice always starts with 0 and contains cumulative ending positions for each window, allowing downstream processing to identify window boundaries in the tensor data.
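
A toy sketch of the same idea on a small merged grid: walk the grid window by window, collect flattened indices, and record cumulative boundaries scaled by merge squared (a hypothetical helper, not this package's implementation):

	// windowIndex flattens an h×w grid (in merged-patch units) into
	// window-major order using ws×ws windows, returning the reordered
	// indices and cumulative window boundaries scaled by merge².
	func windowIndex(h, w, ws, merge int) ([]int, []int) {
		var index []int
		bounds := []int{0}
		for wy := 0; wy < h; wy += ws {
			for wx := 0; wx < w; wx += ws {
				count := 0
				for y := wy; y < wy+ws && y < h; y++ {
					for x := wx; x < wx+ws && x < w; x++ {
						index = append(index, y*w+x)
						count++
					}
				}
				bounds = append(bounds, bounds[len(bounds)-1]+count*merge*merge)
			}
		}
		return index, bounds
	}

For a 4×4 merged grid with ws=2 and merge=2, for example, the boundaries come out as [0, 16, 32, 48, 64]: four windows of four cells each, scaled by merge²=4.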

type VisionModelOptions

type VisionModelOptions struct {
	// contains filtered or unexported fields
}

VisionModelOptions contains configuration options for the vision encoder

type VisionPatchMerger

type VisionPatchMerger struct {
	LNQ  *nn.RMSNorm `gguf:"ln_q"`
	MLP0 *nn.Linear  `gguf:"mlp.0"`
	MLP2 *nn.Linear  `gguf:"mlp.2"`
}

VisionPatchMerger implements patch merging for the Qwen vision model

func (*VisionPatchMerger) Forward

func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor

Forward merges neighboring patch embeddings and projects them through the two-layer MLP, producing the vision outputs consumed by the language model
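
A toy sketch of the merge step: concatenate each group of merge² neighboring patch vectors into one wider vector, which the real model then normalizes with LNQ and passes through the MLP0, GELU, MLP2 projector. The helper below is hypothetical and works on plain slices:

	// mergePatches concatenates consecutive groups of n patch embeddings
	// (n = spatialMergeSize²) into single vectors of n×dim width.
	func mergePatches(patches [][]float64, n int) [][]float64 {
		merged := make([][]float64, 0, len(patches)/n)
		for i := 0; i+n <= len(patches); i += n {
			var v []float64
			for _, p := range patches[i : i+n] {
				v = append(v, p...)
			}
			merged = append(merged, v)
		}
		return merged
	}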

type VisionSelfAttention

type VisionSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_out"`
}

func (*VisionSelfAttention) Forward

func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor
