package mllama import ( "image" "math" "slices" "golang.org/x/image/draw" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/model/imageproc" ) type supportedAspectRatio struct { rank, width, height int } func (a supportedAspectRatio) Point() image.Point { return image.Point{a.width, a.height} } func (a supportedAspectRatio) numTiles() int { return a.width * a.height } type ImageProcessor struct { imageSize, numChannels, maxNumTiles int mean, std [3]float32 } func newImageProcessor(c fs.Config) ImageProcessor { return ImageProcessor{ imageSize: int(c.Uint("vision.image_size")), numChannels: int(c.Uint("vision.num_channels")), maxNumTiles: int(c.Uint("vision.max_num_tiles")), mean: imageproc.ClipDefaultMean, std: imageproc.ClipDefaultSTD, } } func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) { for w := 1; w <= p.maxNumTiles; w++ { for h := 1; h <= p.maxNumTiles/w; h++ { ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h}) } } return ratios } func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point { tw := min(max(imageSize.X, p.imageSize), canvasSize.X) th := min(max(imageSize.Y, p.imageSize), canvasSize.Y) r := math.Min( float64(tw)/float64(imageSize.X), float64(th)/float64(imageSize.Y), ) w := min(int(math.Floor(float64(imageSize.X)*r)), tw) h := min(int(math.Floor(float64(imageSize.Y)*r)), th) return image.Point{w, h} } func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point { possibleTileArrangements := p.supportedAspectRatios() possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements)) for i, pta := range possibleTileArrangements { possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize} } scales := make([]float64, len(possibleCanvasSizes)) for i, pcs := range possibleCanvasSizes { scales[i] = min( float64(pcs.Y)/float64(imageSize.Y), float64(pcs.X)/float64(imageSize.X), ) } var minUpscale float64 var maxDownscale float64 var upscale bool for _, s := range scales { if s > 1.0 { upscale = true if minUpscale == 0 { minUpscale = s } else { minUpscale = math.Min(minUpscale, s) } } else { maxDownscale = math.Max(maxDownscale, s) } } selectedScale := maxDownscale if upscale { selectedScale = minUpscale } var selectedCanvas image.Point for n, pcs := range possibleCanvasSizes { if scales[n] == selectedScale { // choose the smallest possible canvas if selectedCanvas.X == 0 && selectedCanvas.Y == 0 { selectedCanvas = pcs } else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y { selectedCanvas = pcs } } } return selectedCanvas } func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { b := img.Bounds() width := b.Max.X - b.Min.X height := b.Max.Y - b.Min.Y tileHeight := height / numTilesSize.Y tileWidth := width / numTilesSize.X images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X) for h := range numTilesSize.Y { for w := range numTilesSize.X { rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) if subImg, ok := img.(interface { SubImage(image.Rectangle) image.Image }); ok { images = append(images, subImg.SubImage(rect)) } else { // Handle the case where img does not implement SubImage // This is a fallback and may not be efficient newImg := image.NewRGBA(rect) draw.Draw(newImg, rect, img, rect.Min, draw.Src) images = append(images, newImg) } } } return images } func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) { b := img.Bounds() canvasSize := p.optimalTiledCanvas(b.Max) aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize} newSize := p.fitToCanvas(b.Max, canvasSize) dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) // scaling choices: // NearestNeighbor fast, blocky output // ApproxBiLinear fast, medium quality // BiLinear slow, high quality // CatmullRom very slow, very high quality draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil) return dst, aspectRatio } func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image { paddedSize := image.Point{ X: p.imageSize * aspectRatio.X, Y: p.imageSize * aspectRatio.Y, } dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over) return dst } func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 { subImages := p.splitToTiles(img, aspectRatio) var pixelVals []float32 for _, subImg := range subImages { bounds := subImg.Bounds() var rVals, gVals, bVals []float32 for y := bounds.Min.Y; y < bounds.Max.Y; y++ { for x := bounds.Min.X; x < bounds.Max.X; x++ { c := subImg.At(x, y) r, g, b, _ := c.RGBA() rVal := float32(r>>8) / 255.0 gVal := float32(g>>8) / 255.0 bVal := float32(b>>8) / 255.0 rVal = (rVal - p.mean[0]) / p.std[0] gVal = (gVal - p.mean[1]) / p.std[1] bVal = (bVal - p.mean[2]) / p.std[2] rVals = append(rVals, rVal) gVals = append(gVals, gVal) bVals = append(bVals, bVal) } } pixelVals = append(pixelVals, rVals...) pixelVals = append(pixelVals, gVals...) pixelVals = append(pixelVals, bVals...) } return pixelVals } func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) { newImage, newImageRatio := p.resize(img) newImage = p.pad(newImage, newImageRatio) pixelValues := p.pack(newImage, newImageRatio) supportedAspectRatios := p.supportedAspectRatios() aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool { return i.width == newImageRatio.X && i.height == newImageRatio.Y }) return pixelValues, supportedAspectRatios[aspectRatioID], nil }