You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
73 lines
2.4 KiB
73 lines
2.4 KiB
3 weeks ago
|
package input
|
||
|
|
||
|
import "github.com/ollama/ollama/ml"
|
||
|
|
||
|
// Multimodal is a multimodal embedding or a component of one.
|
||
|
// For example, it could be a row of an image that can be processed
|
||
|
// independently.
|
||
|
type Multimodal struct {
|
||
|
// Tensor is the embedding data. Implementations may chose what to
|
||
|
// store here or it may be nil if not needed. However, any ml.Tensor
|
||
|
// objects must be stored here and not in Data.
|
||
|
Tensor ml.Tensor
|
||
|
|
||
|
// Data is implementation-specific opaque data, such as metadata on how
|
||
|
// to layout Tensor. It may be nil if not needed. It may also store larger
|
||
|
// objects such as complete images if they are to be processed later.
|
||
|
Data any
|
||
|
}
|
||
|
|
||
|
// Input represents one token in the input stream
|
||
|
type Input struct {
|
||
|
// Token is a single element of text.
|
||
|
Token int32
|
||
|
|
||
|
// Multimodal is represents a non-text element such as an
|
||
|
// image (or part of one if the image can be processed in pieces).
|
||
|
// It may be used either together with Token or on its own.
|
||
|
Multimodal []Multimodal
|
||
|
|
||
|
// MultimodalHash is a unique representation of the data
|
||
|
// stored in Multimodal, used for caching and comparing
|
||
|
// equality.
|
||
|
MultimodalHash uint64
|
||
|
|
||
|
// SameBatch forces the following number of tokens to be processed
|
||
|
// in a single batch, breaking and extending batches as needed.
|
||
|
// Useful for things like images that must be processed in one
|
||
|
// shot.
|
||
|
SameBatch int
|
||
|
}
|
||
|
|
||
|
// MultimodalIndex is a multimodal element (such as an image)
|
||
|
// together with an index into the slice of Inputs with the
|
||
|
// corresponding token. Note that the index is not the same
|
||
|
// as the position - to find that use the index with the
|
||
|
// Positions slice.
|
||
|
type MultimodalIndex struct {
|
||
|
Index int
|
||
|
Multimodal []Multimodal
|
||
|
}
|
||
|
|
||
|
// Batch contains the inputs for a model forward pass
|
||
|
type Batch struct {
|
||
|
// Inputs is the input tokens, including placeholders for multimodal inputs.
|
||
|
Inputs ml.Tensor
|
||
|
|
||
|
// Multimodal is a set of multimodal embeddings previously created by
|
||
|
// EncodeMultimodal, along with an index into Inputs. Unused for text-only
|
||
|
// models or for batches without multimodal elements.
|
||
|
Multimodal []MultimodalIndex
|
||
|
|
||
|
// Positions is the position for each Input, relative to its sequence. Equal
|
||
|
// in length to Inputs.
|
||
|
Positions []int32
|
||
|
|
||
|
// Sequences is the sequence for each Input. Equal in length to Inputs.
|
||
|
Sequences []int
|
||
|
|
||
|
// Outputs are the set of indicies into Inputs for which output data should
|
||
|
// be returned.
|
||
|
Outputs []int32
|
||
|
}
|