package convert

import (
	"cmp"
	"slices"
	"strings"

	"github.com/ollama/ollama/fs/ggml"
)

// qwen25VLModel converts Qwen2.5-VL checkpoints. It embeds qwen2Model for the
// text stack and adds the vision tower configuration from vision_config.
type qwen25VLModel struct {
	qwen2Model

	VisionModel struct {
		Depth               uint32  `json:"depth"`
		HiddenSize          uint32  `json:"hidden_size"`
		NumHeads            uint32  `json:"num_heads"`
		InChannels          uint32  `json:"in_chans"`
		PatchSize           uint32  `json:"patch_size"`
		SpatialMergeSize    uint32  `json:"spatial_merge_size"`
		SpatialPatchSize    uint32  `json:"spatial_patch_size"`
		WindowSize          uint32  `json:"window_size"`
		RMSNormEps          float32 `json:"layer_norm_epsilon"`
		RopeTheta           float32 `json:"rope_theta"`
		FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
		TemporalPatchSize   uint32  `json:"temporal_patch_size"`
	} `json:"vision_config"`
}

var _ ModelConverter = (*qwen25VLModel)(nil)

// KV builds the GGUF key-value metadata, remapping the embedded qwen2 text
// keys into the qwen25vl namespace and appending the vision tower parameters.
func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
	kv := q.ModelParameters.KV(t)
	kv["general.architecture"] = "qwen25vl"

	for k, v := range q.qwen2Model.KV(t) {
		if strings.HasPrefix(k, "qwen2.") {
			kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
		}
	}

	// Default to the full-attention block indexes used by the released
	// Qwen2.5-VL checkpoints when the config omits them. Setting the field
	// (rather than the kv entry) keeps the unconditional assignment below
	// from overwriting the default with nil.
	if q.VisionModel.FullAttentionBlocks == nil {
		q.VisionModel.FullAttentionBlocks = []int32{7, 15, 23, 31}
	}

	kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
	kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
	kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
	kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
	kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
	kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
	kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
	kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
	kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
	kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)

	return kv
}

// Tensors passes most tensors through unchanged but splits two fused vision
// tensors: the temporal patch embedding (one slice per temporal patch) and
// the combined attention qkv projection (into separate q, k, and v tensors).
func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
	var out []*ggml.Tensor

	for _, t := range ts {
		if strings.Contains(t.Name(), "patch_embed.proj") {
			// Split the patch embedding along the temporal dimension
			// (dim 2), then drop the resulting singleton dimensions.
			for t := range splitDim(t, 2,
				strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
				strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
			) {
				t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
				out = append(out, t)
			}
		} else if strings.Contains(t.Name(), "attn.qkv") {
			// Split the fused qkv projection into equal thirds along dim 0.
			out = append(out, slices.Collect(splitDim(t, 0,
				strings.NewReplacer("attn.qkv", "attn_q"),
				strings.NewReplacer("attn.qkv", "attn_k"),
				strings.NewReplacer("attn.qkv", "attn_v"),
			))...)
		} else {
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    t.Shape(),
				WriterTo: t,
			})
		}
	}

	return out
}

// Replacements returns old/new tensor-name pairs, extending the embedded
// qwen2Model pairs with the vision tower renames.
func (q *qwen25VLModel) Replacements() []string {
	return append(
		q.qwen2Model.Replacements(),
		"visual", "v",
		"blocks", "blk",
		"attn.proj", "attn_out",
		"norm1", "ln1",
		"norm2", "ln2",
	)
}
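// splitQKVSketch is a hedged, self-contained illustration of the shape
// arithmetic behind the attn.qkv split in Tensors above. It is a hypothetical
// helper, not the package's splitDim, which operates on Tensor values and
// renames each part with its replacer: a fused weight with leading dimension
// 3*n slices along dim 0 into three equal parts of leading dimension n.
func splitQKVSketch(fused [][]float32) (q, k, v [][]float32) {
	n := len(fused) / 3
	return fused[:n], fused[n : 2*n], fused[2*n : 3*n]
}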
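// replacerSketch is a hedged illustration (a hypothetical helper, not part of
// the converter API) of how the framework is assumed to consume Replacements:
// the old/new pairs feed strings.NewReplacer, which rewrites upstream tensor
// names in a single pass. Only the vision pairs added above are included; the
// embedded qwen2Model pairs are omitted for brevity. For example,
// replacerSketch("visual.blocks.3.attn.proj.weight") returns
// "v.blk.3.attn_out.weight".
func replacerSketch(name string) string {
	r := strings.NewReplacer(
		"visual", "v",
		"blocks", "blk",
		"attn.proj", "attn_out",
		"norm1", "ln1",
		"norm2", "ln2",
	)
	return r.Replace(name)
}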