// Mirror of https://github.com/ollama/ollama.git (synced 2025-05-17 15:04:26 +02:00).
// 222 lines, 5.9 KiB, Go.

package mllama

import (
	"fmt"
	"image"
	"math"
	"slices"

	"golang.org/x/image/draw"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/model/imageproc"
)

// supportedAspectRatio describes one tile arrangement: width tiles across and
// height tiles down. rank is the 1-based position of the arrangement in the
// list produced by ImageProcessor.supportedAspectRatios.
type supportedAspectRatio struct {
	rank, width, height int
}

// Point returns the arrangement as an image.Point with X = width and Y = height.
func (a supportedAspectRatio) Point() image.Point {
	return image.Point{X: a.width, Y: a.height}
}

// numTiles returns the total number of tiles in the arrangement.
func (a supportedAspectRatio) numTiles() int {
	return a.width * a.height
}

// ImageProcessor holds the parameters used to turn an image into normalized,
// tiled pixel data.
type ImageProcessor struct {
	// imageSize is the edge length in pixels of one square tile.
	// numChannels is read from the model configuration.
	// maxNumTiles bounds width*height of every supported tile arrangement.
	imageSize, numChannels, maxNumTiles int

	// mean and std are the per-channel (R, G, B) normalization constants
	// applied as (value - mean) / std.
	mean, std [3]float32
}

func newImageProcessor(c fs.Config) ImageProcessor {
|
|
return ImageProcessor{
|
|
imageSize: int(c.Uint("vision.image_size")),
|
|
numChannels: int(c.Uint("vision.num_channels")),
|
|
maxNumTiles: int(c.Uint("vision.max_num_tiles")),
|
|
|
|
mean: imageproc.ClipDefaultMean,
|
|
std: imageproc.ClipDefaultSTD,
|
|
}
|
|
}
|
|
|
|
func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) {
|
|
for w := 1; w <= p.maxNumTiles; w++ {
|
|
for h := 1; h <= p.maxNumTiles/w; h++ {
|
|
ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
|
|
}
|
|
}
|
|
return ratios
|
|
}
|
|
|
|
func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
|
|
tw := min(max(imageSize.X, p.imageSize), canvasSize.X)
|
|
th := min(max(imageSize.Y, p.imageSize), canvasSize.Y)
|
|
|
|
r := math.Min(
|
|
float64(tw)/float64(imageSize.X),
|
|
float64(th)/float64(imageSize.Y),
|
|
)
|
|
|
|
w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
|
|
h := min(int(math.Floor(float64(imageSize.Y)*r)), th)
|
|
|
|
return image.Point{w, h}
|
|
}
|
|
|
|
func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
|
|
possibleTileArrangements := p.supportedAspectRatios()
|
|
possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements))
|
|
for i, pta := range possibleTileArrangements {
|
|
possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize}
|
|
}
|
|
|
|
scales := make([]float64, len(possibleCanvasSizes))
|
|
for i, pcs := range possibleCanvasSizes {
|
|
scales[i] = min(
|
|
float64(pcs.Y)/float64(imageSize.Y),
|
|
float64(pcs.X)/float64(imageSize.X),
|
|
)
|
|
}
|
|
|
|
var minUpscale float64
|
|
var maxDownscale float64
|
|
var upscale bool
|
|
|
|
for _, s := range scales {
|
|
if s > 1.0 {
|
|
upscale = true
|
|
if minUpscale == 0 {
|
|
minUpscale = s
|
|
} else {
|
|
minUpscale = math.Min(minUpscale, s)
|
|
}
|
|
} else {
|
|
maxDownscale = math.Max(maxDownscale, s)
|
|
}
|
|
}
|
|
|
|
selectedScale := maxDownscale
|
|
if upscale {
|
|
selectedScale = minUpscale
|
|
}
|
|
|
|
var selectedCanvas image.Point
|
|
for n, pcs := range possibleCanvasSizes {
|
|
if scales[n] == selectedScale {
|
|
// choose the smallest possible canvas
|
|
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
|
|
selectedCanvas = pcs
|
|
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
|
|
selectedCanvas = pcs
|
|
}
|
|
}
|
|
}
|
|
return selectedCanvas
|
|
}
|
|
|
|
func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
|
|
b := img.Bounds()
|
|
width := b.Max.X - b.Min.X
|
|
height := b.Max.Y - b.Min.Y
|
|
tileHeight := height / numTilesSize.Y
|
|
tileWidth := width / numTilesSize.X
|
|
|
|
images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X)
|
|
|
|
for h := range numTilesSize.Y {
|
|
for w := range numTilesSize.X {
|
|
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
|
|
if subImg, ok := img.(interface {
|
|
SubImage(image.Rectangle) image.Image
|
|
}); ok {
|
|
images = append(images, subImg.SubImage(rect))
|
|
} else {
|
|
// Handle the case where img does not implement SubImage
|
|
// This is a fallback and may not be efficient
|
|
newImg := image.NewRGBA(rect)
|
|
draw.Draw(newImg, rect, img, rect.Min, draw.Src)
|
|
images = append(images, newImg)
|
|
}
|
|
}
|
|
}
|
|
|
|
return images
|
|
}
|
|
|
|
func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
|
|
b := img.Bounds()
|
|
|
|
canvasSize := p.optimalTiledCanvas(b.Max)
|
|
aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}
|
|
newSize := p.fitToCanvas(b.Max, canvasSize)
|
|
|
|
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
|
|
|
|
// scaling choices:
|
|
// NearestNeighbor fast, blocky output
|
|
// ApproxBiLinear fast, medium quality
|
|
// BiLinear slow, high quality
|
|
// CatmullRom very slow, very high quality
|
|
draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
|
|
|
|
return dst, aspectRatio
|
|
}
|
|
|
|
func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
|
|
paddedSize := image.Point{
|
|
X: p.imageSize * aspectRatio.X,
|
|
Y: p.imageSize * aspectRatio.Y,
|
|
}
|
|
|
|
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
|
|
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
|
|
|
|
return dst
|
|
}
|
|
|
|
func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
|
|
subImages := p.splitToTiles(img, aspectRatio)
|
|
|
|
var pixelVals []float32
|
|
|
|
for _, subImg := range subImages {
|
|
bounds := subImg.Bounds()
|
|
var rVals, gVals, bVals []float32
|
|
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
|
|
for x := bounds.Min.X; x < bounds.Max.X; x++ {
|
|
c := subImg.At(x, y)
|
|
r, g, b, _ := c.RGBA()
|
|
rVal := float32(r>>8) / 255.0
|
|
gVal := float32(g>>8) / 255.0
|
|
bVal := float32(b>>8) / 255.0
|
|
|
|
rVal = (rVal - p.mean[0]) / p.std[0]
|
|
gVal = (gVal - p.mean[1]) / p.std[1]
|
|
bVal = (bVal - p.mean[2]) / p.std[2]
|
|
|
|
rVals = append(rVals, rVal)
|
|
gVals = append(gVals, gVal)
|
|
bVals = append(bVals, bVal)
|
|
}
|
|
}
|
|
pixelVals = append(pixelVals, rVals...)
|
|
pixelVals = append(pixelVals, gVals...)
|
|
pixelVals = append(pixelVals, bVals...)
|
|
}
|
|
|
|
return pixelVals
|
|
}
|
|
|
|
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
|
|
newImage, newImageRatio := p.resize(img)
|
|
newImage = p.pad(newImage, newImageRatio)
|
|
pixelValues := p.pack(newImage, newImageRatio)
|
|
|
|
supportedAspectRatios := p.supportedAspectRatios()
|
|
aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool {
|
|
return i.width == newImageRatio.X && i.height == newImageRatio.Y
|
|
})
|
|
|
|
return pixelValues, supportedAspectRatios[aspectRatioID], nil
|
|
}
|