qwen25vl


Documentation

Package qwen25vl implements the Qwen 2.5 VL multimodal model: a byte-pair-encoding tokenizer and text decoder paired with a windowed vision transformer whose merged patch embeddings are fed to the language model.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func New

func New(c fs.Config) (model.Model, error)
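
New builds the combined text and vision model from a GGUF config. A minimal sketch of how such a constructor is typically wired up, assuming a model.Register helper that maps an architecture name to its constructor (an assumption; the registry is not documented on this page):

	// Hypothetical registration: lets the runtime construct this model
	// by name when it loads a matching GGUF file.
	func init() {
		model.Register("qwen25vl", New)
	}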

Types

type Grid

type Grid struct {
	Height   int
	Width    int
	Temporal int
}

type ImageProcessor

type ImageProcessor struct {
	// contains filtered or unexported fields
}

ImageProcessor contains configuration for Qwen 2.5 VL image preprocessing

func (*ImageProcessor) ProcessImage

func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
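
ProcessImage converts a decoded image into normalized pixel values plus the patch grid consumed by the vision encoder. A minimal usage sketch, assuming m is a loaded *Model (which embeds ImageProcessor); the helper name is hypothetical:

	// Assumes: import ( "fmt"; "image"; _ "image/jpeg"; "os" )
	func encodeImage(m *Model, path string) error {
		f, err := os.Open(path)
		if err != nil {
			return err
		}
		defer f.Close()

		// Decode requires a registered format, e.g. image/jpeg above.
		img, _, err := image.Decode(f)
		if err != nil {
			return err
		}

		pixels, grid, err := m.ProcessImage(img)
		if err != nil {
			return err
		}
		fmt.Println(len(pixels), grid.Height, grid.Width, grid.Temporal)
		return nil
	}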

func (*ImageProcessor) SmartResize

func (p *ImageProcessor) SmartResize(height, width int) (int, int)

SmartResize implements the smart-resize algorithm: it returns a height and width rounded to multiples of the patch factor, approximately preserving the aspect ratio while keeping the total pixel count within the processor's configured bounds
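
A standalone sketch of the rule as published for Qwen2-VL's preprocessor, with the factor and pixel bounds passed explicitly (in this package they are unexported fields of ImageProcessor):

	// Assumes: import "math"
	func smartResize(height, width, factor, minPixels, maxPixels int) (int, int) {
		// Round each side to the nearest multiple of the patch factor.
		h := int(math.Round(float64(height)/float64(factor))) * factor
		w := int(math.Round(float64(width)/float64(factor))) * factor

		if h*w > maxPixels {
			// Too large: shrink both sides by the same ratio, rounding down.
			beta := math.Sqrt(float64(height*width) / float64(maxPixels))
			h = int(math.Floor(float64(height)/beta/float64(factor))) * factor
			w = int(math.Floor(float64(width)/beta/float64(factor))) * factor
		} else if h*w < minPixels {
			// Too small: grow both sides by the same ratio, rounding up.
			beta := math.Sqrt(float64(minPixels) / float64(height*width))
			h = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
			w = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
		}
		return h, w
	}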

type Layer

type Layer struct {
	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
	SelfAttention *SelfAttention
	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
	MLP           *MLP
}

Layer represents a single transformer layer combining self-attention and feed-forward components

func (*Layer) Forward

func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor
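
The signature mirrors a standard pre-norm transformer block: normalize, attend, add the residual, then normalize, apply the MLP, and add the residual again. A toy sketch of that wiring on plain float64 slices (illustrative only; the real method operates on ml.Tensor and the kv cache):

	// Toy pre-norm residual block: x + attn(norm1(x)), then + mlp(norm2(x)).
	func blockForward(x []float64, norm1, norm2, attn, mlp func([]float64) []float64) []float64 {
		residual := x
		h := attn(norm1(x))
		for i := range h {
			h[i] += residual[i] // first residual connection
		}
		residual = h
		out := mlp(norm2(h))
		for i := range out {
			out[i] += residual[i] // second residual connection
		}
		return out
	}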

type MLP

type MLP struct {
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
	Gate *nn.Linear `gguf:"ffn_gate"`
}

MLP implements the feed-forward network component with SwiGLU activation

func (*MLP) Forward

func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor
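
A toy sketch of the SwiGLU computation, down(silu(gate(x)) * up(x)), shown on pre-projected vectors (the real method applies the Gate, Up, and Down linear layers to tensors):

	// Assumes: import "math"
	func silu(x float64) float64 { return x / (1 + math.Exp(-x)) }

	// swiGLU gates the up projection elementwise with silu(gate).
	func swiGLU(gate, up []float64) []float64 {
		h := make([]float64, len(up))
		for i := range up {
			h[i] = silu(gate[i]) * up[i]
		}
		return h // the Down projection then maps h back to the model width
	}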

type Model

type Model struct {
	model.Base
	model.BytePairEncoding

	*TextModel
	*VisionModel `gguf:"v"`

	ImageProcessor
}

func (*Model) EncodeMultimodal

func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error)

func (*Model) Forward

func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error)

func (*Model) PixelValues

func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error)

func (*Model) PostTokenize

func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error)

PostTokenize arranges Qwen 2.5 VL's text and image inputs into the layout expected by the forward pass

type PatchEmbedding

type PatchEmbedding struct {
	PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"`
	PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"`
}

func (*PatchEmbedding) Forward

func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor

type SelfAttention

type SelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_output"`
}

SelfAttention implements the multi-head self-attention mechanism, with separate projections for the query, key, value, and output transformations

func (*SelfAttention) Forward

func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor
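
A single-head toy of the underlying scaled dot-product attention, softmax(QK^T/sqrt(d))V, on plain slices (the real method projects with the four linear layers, applies rotary position embeddings, and reads and writes the kv cache):

	// Assumes: import "math"
	// attend scores one query against cached keys and mixes the values.
	// No max-subtraction in the softmax, for brevity.
	func attend(q []float64, keys, values [][]float64) []float64 {
		d := float64(len(q))
		scores := make([]float64, len(keys))
		var sum float64
		for i, k := range keys {
			var dot float64
			for j := range q {
				dot += q[j] * k[j]
			}
			scores[i] = math.Exp(dot / math.Sqrt(d))
			sum += scores[i]
		}
		out := make([]float64, len(values[0]))
		for i, v := range values {
			w := scores[i] / sum
			for j := range v {
				out[j] += w * v[j]
			}
		}
		return out
	}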

type TextModel

type TextModel struct {
	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
	Layers         []Layer       `gguf:"blk"`
	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
	Output         *nn.Linear    `gguf:"output,alt:token_embd"`

	*TextOptions
}

func NewTextModel

func NewTextModel(c fs.Config) *TextModel

func (*TextModel) Forward

func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error)

func (*TextModel) Shift

func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error)

Shift applies rotary position embeddings to the key tensor for causal attention caching
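
When the cache window slides, cached keys can be re-rotated by the position delta instead of being recomputed. A toy sketch rotating interleaved (even, odd) pairs by shift*theta_i (illustrative only; the pairing layout and rope parameters are model details, and the real method works on ml.Tensor):

	// Assumes: import "math"
	// ropeShift re-rotates a cached key vector by a position delta.
	func ropeShift(key []float64, shift int, base float64) []float64 {
		d := len(key)
		out := make([]float64, d)
		for i := 0; i < d; i += 2 {
			theta := float64(shift) * math.Pow(base, -float64(i)/float64(d))
			c, s := math.Cos(theta), math.Sin(theta)
			out[i] = key[i]*c - key[i+1]*s
			out[i+1] = key[i]*s + key[i+1]*c
		}
		return out
	}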

type TextOptions

type TextOptions struct {
	// contains filtered or unexported fields
}

type VisionEncoderLayer

type VisionEncoderLayer struct {
	Norm1         *nn.RMSNorm `gguf:"ln1"`
	SelfAttention *VisionSelfAttention
	Norm2         *nn.RMSNorm `gguf:"ln2"`
	MLP           *VisionMLP
}

func (*VisionEncoderLayer) Forward

func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor

type VisionMLP

type VisionMLP struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

VisionMLP implements the feed-forward network for the vision encoder layers, using gated (SwiGLU-style) Gate, Up, and Down projections

func (*VisionMLP) Forward

func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor

type VisionModel

type VisionModel struct {
	PatchEmbedding *PatchEmbedding
	Layers         []VisionEncoderLayer `gguf:"blk"`
	PatchMerger    *VisionPatchMerger   `gguf:"merger"`

	*VisionModelOptions
}

VisionModel implements the Qwen 2.5 VL vision encoder

func (*VisionModel) Forward

func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor

Forward runs the vision encoder over the pixel values, using grid to determine the patch layout, and returns the merged patch embeddings

func (*VisionModel) PositionalEmbedding

func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor

PositionalEmbedding generates rotary position embeddings for attention mechanisms
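
The vision encoder uses 2D rotary embeddings: each patch gets a (row, column) position, and part of each head's dimensions rotate by the row index while the rest rotate by the column index. A toy sketch producing the per-patch position pairs for a grid (illustrative; the real method builds cos/sin tensors):

	// gridPositions returns a (row, col) pair for every patch in a
	// height×width grid, in row-major order.
	func gridPositions(height, width int) [][2]int {
		pos := make([][2]int, 0, height*width)
		for r := 0; r < height; r++ {
			for c := 0; c < width; c++ {
				pos = append(pos, [2]int{r, c})
			}
		}
		return pos
	}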

func (*VisionModel) WindowIndex

func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int)

WindowIndex divides the grid into windows and returns:

  1. A tensor containing flattened indices of all grid points organized by windows
  2. A slice of boundaries that mark where each window's data begins and ends in the flattened representation, scaled by spatialMergeSize squared

The boundaries slice always starts with 0 and contains cumulative ending positions for each window, allowing downstream processing to identify window boundaries in the tensor data.
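
A toy sketch of the same idea on a small merged grid: walk the grid window by window, collect flattened indices, and record cumulative boundaries scaled by merge squared (a hypothetical helper, not this package's implementation):

	// windowIndex flattens an h×w grid (in merged-patch units) into
	// window-major order using ws×ws windows, returning the reordered
	// indices and cumulative window boundaries scaled by merge².
	func windowIndex(h, w, ws, merge int) ([]int, []int) {
		var index []int
		bounds := []int{0}
		for wy := 0; wy < h; wy += ws {
			for wx := 0; wx < w; wx += ws {
				count := 0
				for y := wy; y < wy+ws && y < h; y++ {
					for x := wx; x < wx+ws && x < w; x++ {
						index = append(index, y*w+x)
						count++
					}
				}
				bounds = append(bounds, bounds[len(bounds)-1]+count*merge*merge)
			}
		}
		return index, bounds
	}

For a 4×4 merged grid with ws=2 and merge=2, for example, the boundaries come out as [0, 16, 32, 48, 64]: four windows of four cells each, scaled by merge²=4.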

type VisionModelOptions

type VisionModelOptions struct {
	// contains filtered or unexported fields
}

VisionModelOptions contains configuration options for the vision encoder

type VisionPatchMerger

type VisionPatchMerger struct {
	LNQ  *nn.RMSNorm `gguf:"ln_q"`
	MLP0 *nn.Linear  `gguf:"mlp.0"`
	MLP2 *nn.Linear  `gguf:"mlp.2"`
}

VisionPatchMerger implements patch merging for the Qwen vision model

func (*VisionPatchMerger) Forward

func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor

Forward merges neighboring patch embeddings and projects them through the two-layer MLP, producing the vision outputs consumed by the language model
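
A toy sketch of the merge step: concatenate each group of merge² neighboring patch vectors into one wider vector, which the real model then normalizes with LNQ and passes through the MLP0, GELU, MLP2 projector. The helper below is hypothetical and works on plain slices:

	// mergePatches concatenates consecutive groups of n patch embeddings
	// (n = spatialMergeSize²) into single vectors of n×dim width.
	func mergePatches(patches [][]float64, n int) [][]float64 {
		merged := make([][]float64, 0, len(patches)/n)
		for i := 0; i+n <= len(patches); i += n {
			var v []float64
			for _, p := range patches[i : i+n] {
				v = append(v, p...)
			}
			merged = append(merged, v)
		}
		return merged
	}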

type VisionSelfAttention

type VisionSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_out"`
}

func (*VisionSelfAttention) Forward

func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor
