From 8c9fb8eb73afc220e8bf99772572096b6498b748 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Sat, 14 Dec 2024 19:50:15 -0800 Subject: [PATCH] imageproc mllama refactor (#7537) Refactor mllama image processing code, and add pixtral and qwen2vl --- model/imageproc/images.go | 111 +++++++++ model/imageproc/images_test.go | 177 ++++++++++++++ .../images.go => model/mllama/imageproc.go | 181 ++++++--------- .../mllama/imageproc_test.go | 22 +- model/pixtral/imageproc.go | 68 ++++++ model/pixtral/imageproc_test.go | 219 ++++++++++++++++++ model/qwen2vl/imageproc.go | 74 ++++++ model/qwen2vl/imageproc_test.go | 78 +++++++ server/prompt.go | 11 +- server/routes.go | 12 +- 10 files changed, 828 insertions(+), 125 deletions(-) create mode 100644 model/imageproc/images.go create mode 100644 model/imageproc/images_test.go rename server/imageproc/images.go => model/mllama/imageproc.go (60%) rename server/imageproc/images_test.go => model/mllama/imageproc_test.go (95%) create mode 100644 model/pixtral/imageproc.go create mode 100644 model/pixtral/imageproc_test.go create mode 100644 model/qwen2vl/imageproc.go create mode 100644 model/qwen2vl/imageproc_test.go diff --git a/model/imageproc/images.go b/model/imageproc/images.go new file mode 100644 index 000000000..7afe36701 --- /dev/null +++ b/model/imageproc/images.go @@ -0,0 +1,111 @@ +package imageproc + +import ( + "image" + "image/color" + + "golang.org/x/image/draw" +) + +var ( + ImageNetDefaultMean = [3]float32{0.485, 0.456, 0.406} + ImageNetDefaultSTD = [3]float32{0.229, 0.224, 0.225} + ImageNetStandardMean = [3]float32{0.5, 0.5, 0.5} + ImageNetStandardSTD = [3]float32{0.5, 0.5, 0.5} + ClipDefaultMean = [3]float32{0.48145466, 0.4578275, 0.40821073} + ClipDefaultSTD = [3]float32{0.26862954, 0.26130258, 0.27577711} +) + +const ( + ResizeBilinear = iota + ResizeNearestNeighbor + ResizeApproxBilinear + ResizeCatmullrom +) + +// Composite returns an image with the alpha channel removed by drawing over a white background. +func Composite(img image.Image) image.Image { + dst := image.NewRGBA(img.Bounds()) + + white := color.RGBA{255, 255, 255, 255} + draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src) + draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over) + + return dst +} + +// Resize returns an image which has been scaled to a new size. +func Resize(img image.Image, newSize image.Point, method int) image.Image { + dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) + + kernels := map[int]draw.Interpolator{ + ResizeBilinear: draw.BiLinear, + ResizeNearestNeighbor: draw.NearestNeighbor, + ResizeApproxBilinear: draw.ApproxBiLinear, + ResizeCatmullrom: draw.CatmullRom, + } + + kernel, ok := kernels[method] + if !ok { + panic("no resizing method found") + } + + kernel.Scale(dst, dst.Rect, img, img.Bounds(), draw.Over, nil) + + return dst +} + +// Normalize returns a slice of float32 containing each of the r, g, b values for an image normalized around a value. +func Normalize(img image.Image, mean, std [3]float32, rescale bool, channelFirst bool) []float32 { + var pixelVals []float32 + + bounds := img.Bounds() + if channelFirst { + var rVals, gVals, bVals []float32 + for y := bounds.Min.Y; y < bounds.Max.Y; y++ { + for x := bounds.Min.X; x < bounds.Max.X; x++ { + c := img.At(x, y) + r, g, b, _ := c.RGBA() + var rVal, gVal, bVal float32 + if rescale { + rVal = float32(r>>8) / 255.0 + gVal = float32(g>>8) / 255.0 + bVal = float32(b>>8) / 255.0 + } + + rVal = (rVal - mean[0]) / std[0] + gVal = (gVal - mean[1]) / std[1] + bVal = (bVal - mean[2]) / std[2] + + rVals = append(rVals, rVal) + gVals = append(gVals, gVal) + bVals = append(bVals, bVal) + } + } + + pixelVals = append(pixelVals, rVals...) + pixelVals = append(pixelVals, gVals...) + pixelVals = append(pixelVals, bVals...) + } else { + for y := bounds.Min.Y; y < bounds.Max.Y; y++ { + for x := bounds.Min.X; x < bounds.Max.X; x++ { + c := img.At(x, y) + r, g, b, _ := c.RGBA() + var rVal, gVal, bVal float32 + if rescale { + rVal = float32(r>>8) / 255.0 + gVal = float32(g>>8) / 255.0 + bVal = float32(b>>8) / 255.0 + } + + rVal = (rVal - mean[0]) / std[0] + gVal = (gVal - mean[1]) / std[1] + bVal = (bVal - mean[2]) / std[2] + + pixelVals = append(pixelVals, rVal, gVal, bVal) + } + } + } + + return pixelVals +} diff --git a/model/imageproc/images_test.go b/model/imageproc/images_test.go new file mode 100644 index 000000000..a2e9ed94d --- /dev/null +++ b/model/imageproc/images_test.go @@ -0,0 +1,177 @@ +package imageproc + +import ( + "image" + "image/color" + "image/draw" + "reflect" + "testing" +) + +func createImage(width, height int, fillCol color.RGBA) image.Image { + img := image.NewRGBA(image.Rect(0, 0, width, height)) + draw.Draw(img, img.Bounds(), &image.Uniform{fillCol}, image.Point{}, draw.Src) + return img +} + +func TestComposite(t *testing.T) { + tests := []struct { + name string + img image.Image + expectedRGBA color.RGBA + }{ + { + name: "Transparent image", + img: createImage(5, 5, color.RGBA{0, 0, 0, 0}), + expectedRGBA: color.RGBA{255, 255, 255, 255}, + }, + { + name: "Solid red image", + img: createImage(5, 5, color.RGBA{255, 0, 0, 255}), + expectedRGBA: color.RGBA{255, 0, 0, 255}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + resultImg := Composite(tt.img) + + // Check the pixel values in the resulting image + for x := range resultImg.Bounds().Dx() { + for y := range resultImg.Bounds().Dy() { + r, g, b, a := resultImg.At(x, y).RGBA() + expectedR, expectedG, expectedB, expectedA := tt.expectedRGBA.RGBA() + + if r != expectedR || g != expectedG || b != expectedB || a != expectedA { + t.Errorf("Pixel mismatch at (%d, %d): got (%d, %d, %d, %d), want (%d, %d, %d, %d)", + x, y, r, g, b, a, expectedR, expectedG, expectedB, expectedA) + } + } + } + }) + } +} + +func TestResize(t *testing.T) { + tests := []struct { + name string + img image.Image + newSize image.Point + method int + expected image.Point + }{ + { + name: "Resize with bilinear interpolation", + img: createImage(5, 5, color.RGBA{255, 0, 0, 255}), + newSize: image.Point{10, 10}, + method: ResizeBilinear, + expected: image.Point{10, 10}, + }, + { + name: "Resize with nearest neighbor", + img: createImage(10, 10, color.RGBA{0, 255, 0, 255}), + newSize: image.Point{5, 5}, + method: ResizeNearestNeighbor, + expected: image.Point{5, 5}, + }, + { + name: "Resize with catmullrom", + img: createImage(1024, 1024, color.RGBA{0, 0, 255, 255}), + newSize: image.Point{10, 10}, + method: ResizeCatmullrom, + expected: image.Point{10, 10}, + }, + { + name: "Resize with approx bilinear", + img: createImage(1024, 768, color.RGBA{100, 100, 100, 255}), + newSize: image.Point{4, 3}, + method: ResizeApproxBilinear, + expected: image.Point{4, 3}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + resizedImg := Resize(tt.img, tt.newSize, tt.method) + + if resizedImg.Bounds().Dx() != tt.expected.X || resizedImg.Bounds().Dy() != tt.expected.Y { + t.Errorf("Unexpected size for resized image: got (%d, %d), want (%d, %d)", + resizedImg.Bounds().Dx(), resizedImg.Bounds().Dy(), tt.expected.X, tt.expected.Y) + } + }) + } +} + +func TestResizeInvalidMethod(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("Expected panic for invalid resizing method, but did not panic") + } + }() + + img := createImage(10, 10, color.RGBA{0, 0, 0, 255}) + Resize(img, image.Point{5, 5}, -1) +} + +func TestNormalize(t *testing.T) { + tests := []struct { + name string + img image.Image + mean [3]float32 + std [3]float32 + rescale bool + channelFirst bool + expected []float32 + }{ + { + name: "Rescale with channel first", + img: createImage(2, 2, color.RGBA{128, 128, 128, 255}), + mean: ImageNetStandardMean, + std: ImageNetStandardSTD, + rescale: true, + channelFirst: true, + expected: []float32{ + 0.003921628, 0.003921628, 0.003921628, 0.003921628, // R values + 0.003921628, 0.003921628, 0.003921628, 0.003921628, // G values + 0.003921628, 0.003921628, 0.003921628, 0.003921628, // B values + }, + }, + { + name: "Rescale without channel first", + img: createImage(2, 2, color.RGBA{255, 0, 0, 255}), + mean: [3]float32{0.0, 0.0, 0.0}, + std: [3]float32{1.0, 1.0, 1.0}, + rescale: true, + channelFirst: false, + expected: []float32{ + 1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + }, + }, + { + name: "No rescale with mean/std adjustment", + img: createImage(2, 2, color.RGBA{100, 150, 200, 255}), + mean: ClipDefaultMean, + std: ClipDefaultSTD, + rescale: false, + channelFirst: false, + expected: []float32{ + -1.7922626, -1.7520971, -1.4802198, + -1.7922626, -1.7520971, -1.4802198, + -1.7922626, -1.7520971, -1.4802198, + -1.7922626, -1.7520971, -1.4802198, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := Normalize(tt.img, tt.mean, tt.std, tt.rescale, tt.channelFirst) + + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("Test %s failed: got %v, want %v", tt.name, result, tt.expected) + } + }) + } +} diff --git a/server/imageproc/images.go b/model/mllama/imageproc.go similarity index 60% rename from server/imageproc/images.go rename to model/mllama/imageproc.go index 688cbf8ad..13f2fb8b3 100644 --- a/server/imageproc/images.go +++ b/model/mllama/imageproc.go @@ -1,19 +1,20 @@ -package imageproc +package mllama import ( - "bytes" "fmt" "image" - "image/color" _ "image/jpeg" _ "image/png" + "io" "math" "slices" "golang.org/x/image/draw" + + "github.com/ollama/ollama/model/imageproc" ) -func GetSupportedAspectRatios(maxTiles int) []image.Point { +func getSupportedAspectRatios(maxTiles int) []image.Point { ratios := []image.Point{} for w := range maxTiles { @@ -37,28 +38,8 @@ func clip(a, a_min, a_max int) int { return a } -func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { - targetWidth := clip(imageSize.X, tileSize, canvasSize.X) - targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y) - - scaleWidth := float64(targetWidth) / float64(imageSize.X) - scaleHeight := float64(targetHeight) / float64(imageSize.Y) - - var w, h int - - if scaleWidth < scaleHeight { - w = targetWidth - h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) - } else { - w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) - h = targetHeight - } - - return image.Point{w, h} -} - func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { - possibleTileArrangements := GetSupportedAspectRatios(maxImageTiles) + possibleTileArrangements := getSupportedAspectRatios(maxImageTiles) possibleCanvasSizes := []image.Point{} for _, pta := range possibleTileArrangements { possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) @@ -113,6 +94,53 @@ func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i return selectedCanvas } +func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { + targetWidth := clip(imageSize.X, tileSize, canvasSize.X) + targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y) + + scaleWidth := float64(targetWidth) / float64(imageSize.X) + scaleHeight := float64(targetHeight) / float64(imageSize.Y) + + var w, h int + + if scaleWidth < scaleHeight { + w = targetWidth + h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) + } else { + w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) + h = targetHeight + } + + return image.Point{w, h} +} + +func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { + if format == "png" { + img = imageproc.Composite(img) + } + + b := img.Bounds() + tileSize := outputSize.Y + + canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) + aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} + newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize) + + return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio +} + +func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image { + paddedSize := image.Point{ + X: outputSize.X * aspectRatio.X, + Y: outputSize.Y * aspectRatio.Y, + } + + dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) + draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over) + + return dst +} + func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { b := img.Bounds() width := b.Max.X - b.Min.X @@ -134,107 +162,40 @@ func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { return images } -// remove the "alpha" channel by drawing over a prefilled image -func compositeImage(img image.Image) image.Image { - dst := image.NewRGBA(img.Bounds()) - - white := color.RGBA{255, 255, 255, 255} - draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src) - draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over) - - return dst -} - -func ResizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { - if format == "png" { - img = compositeImage(img) - } - - b := img.Bounds() - tileSize := outputSize.Y - - canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) - aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize) - - dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) - - // scaling choices: - // NearestNeighbor fast, blocky output - // ApproxBiLinear fast, medium quality - // BiLinear slow, high quality - // CatmullRom very slow, very high quality - draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil) - - return dst, aspectRatio -} - -func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image { - paddedSize := image.Point{ - X: outputSize.X * aspectRatio.X, - Y: outputSize.Y * aspectRatio.Y, - } - - dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) - draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over) - - return dst -} - -func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 { +func packImages(img image.Image, aspectRatio image.Point) []float32 { subImages := splitToTiles(img, aspectRatio) var pixelVals []float32 + rescale := true + channelFirst := true + for _, subImg := range subImages { - bounds := subImg.Bounds() - var rVals, gVals, bVals []float32 - for y := bounds.Min.Y; y < bounds.Max.Y; y++ { - for x := bounds.Min.X; x < bounds.Max.X; x++ { - c := subImg.At(x, y) - r, g, b, _ := c.RGBA() - rVal := float32(r>>8) / 255.0 - gVal := float32(g>>8) / 255.0 - bVal := float32(b>>8) / 255.0 - - rVal = (rVal - mean[0]) / std[0] - gVal = (gVal - mean[1]) / std[1] - bVal = (bVal - mean[2]) / std[2] - - rVals = append(rVals, rVal) - gVals = append(gVals, gVal) - bVals = append(bVals, bVal) - } - } - pixelVals = append(pixelVals, rVals...) - pixelVals = append(pixelVals, gVals...) - pixelVals = append(pixelVals, bVals...) + vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst) + pixelVals = append(pixelVals, vals...) } return pixelVals } -func Preprocess(imageData []byte) ([]float32, int, error) { - // todo: need guard in here for bad image data - - // mllama values +func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { outputSize := image.Point{560, 560} maxTiles := 4 - // clip values - mean := [3]float32{0.48145466, 0.4578275, 0.40821073} - std := [3]float32{0.26862954, 0.26130258, 0.27577711} - - img, format, err := image.Decode(bytes.NewReader(imageData)) + img, format, err := image.Decode(imageData) if err != nil { - return nil, 0, fmt.Errorf("failed to decode image: %w", err) + return nil, nil, fmt.Errorf("failed to decode image: %w", err) } - newImage, aspectRatio := ResizeImage(img, format, outputSize, maxTiles) - newImage = PadImage(newImage, outputSize, aspectRatio) + newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles) + newImage = padImage(newImage, outputSize, aspectRatio) - data := PackImages(newImage, aspectRatio, mean, std) - aspectRatioIndex := slices.Index(GetSupportedAspectRatios(maxTiles), aspectRatio) + 1 + data := packImages(newImage, aspectRatio) + aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1 - return data, aspectRatioIndex, nil + opts := map[string]any{ + "aspectRatioIndex": aspectRatioIndex, + } + + return data, opts, nil } diff --git a/server/imageproc/images_test.go b/model/mllama/imageproc_test.go similarity index 95% rename from server/imageproc/images_test.go rename to model/mllama/imageproc_test.go index 7ad5329b6..a14b91bd1 100644 --- a/server/imageproc/images_test.go +++ b/model/mllama/imageproc_test.go @@ -1,4 +1,4 @@ -package imageproc +package mllama import ( "bytes" @@ -35,7 +35,7 @@ func TestAspectRatios(t *testing.T) { } for _, c := range cases { - actual := GetSupportedAspectRatios(c.MaxTiles) + actual := getSupportedAspectRatios(c.MaxTiles) if diff := cmp.Diff(actual, c.Expected); diff != "" { t.Errorf("mismatch (-got +want):\n%s", diff) @@ -299,7 +299,7 @@ func TestResize(t *testing.T) { } for _, c := range cases { - actualImage, actualAspectRatio := ResizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles) + actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles) if actualImage.Bounds() != c.ExpectedImage.Bounds() { t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds()) @@ -329,7 +329,7 @@ func TestPad(t *testing.T) { } for _, c := range cases { - actual := PadImage(c.TestImage, c.OutputSize, c.AspectRatio) + actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio) if actual.Bounds() != c.Expected.Bounds() { t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds()) @@ -344,9 +344,6 @@ func TestPackImages(t *testing.T) { ExpectedVals int } - mean := [3]float32{0.48145466, 0.4578275, 0.40821073} - std := [3]float32{0.26862954, 0.26130258, 0.27577711} - cases := []packCase{ { TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), @@ -366,7 +363,7 @@ func TestPackImages(t *testing.T) { } for _, c := range cases { - actualVals := PackImages(c.TestImage, c.AspectRatio, mean, std) + actualVals := packImages(c.TestImage, c.AspectRatio) if len(actualVals) != c.ExpectedVals { t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals) } @@ -400,7 +397,7 @@ func TestPreprocess(t *testing.T) { t.Fatal(err) } - imgData, aspectRatioID, err := Preprocess(buf.Bytes()) + imgData, opts, err := Preprocess(&buf) if err != nil { t.Fatalf("error processing: %q", err) } @@ -409,6 +406,13 @@ func TestPreprocess(t *testing.T) { t.Errorf("no image data returned") } + ar, ok := opts["aspectRatioIndex"] + if !ok { + t.Fatalf("no aspect ratio found") + } + + aspectRatioID := ar.(int) + if aspectRatioID != c.ExpectedAspectRatioID { t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID) } diff --git a/model/pixtral/imageproc.go b/model/pixtral/imageproc.go new file mode 100644 index 000000000..16ec0c410 --- /dev/null +++ b/model/pixtral/imageproc.go @@ -0,0 +1,68 @@ +package pixtral + +import ( + "fmt" + "image" + _ "image/jpeg" + _ "image/png" + "io" + "math" + + "github.com/ollama/ollama/model/imageproc" +) + +func getNumImageTokens(imageSize, patchSize image.Point) image.Point { + return image.Point{ + (imageSize.X-1)/patchSize.X + 1, + (imageSize.Y-1)/patchSize.Y + 1, + } +} + +func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point { + b := img.Bounds() + le := float64(longestEdge) + ratio := math.Max(float64(b.Max.Y)/le, float64(b.Max.X)/le) + + newSize := img.Bounds().Max + + if ratio > 1.0 { + newSize = image.Point{ + int(math.Ceil(float64(b.Max.X) / ratio)), + int(math.Ceil(float64(b.Max.Y) / ratio)), + } + } + + tokens := getNumImageTokens(newSize, patchSize) + return image.Point{ + tokens.X * patchSize.X, + tokens.Y * patchSize.Y, + } +} + +func resizeImage(img image.Image, format string, longestEdge int, patchSize image.Point) image.Image { + if format == "png" { + img = imageproc.Composite(img) + } + + newSize := getResizeOutputImageSize(img, longestEdge, patchSize) + + // todo should be ResizeBicubic, but it doesn't exist + return imageproc.Resize(img, newSize, imageproc.ResizeBilinear) +} + +func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { + img, format, err := image.Decode(imageData) + if err != nil { + return nil, nil, fmt.Errorf("failed to decode image: %w", err) + } + + longestEdge := 1024 + patchSize := image.Point{16, 16} + + img = resizeImage(img, format, longestEdge, patchSize) + + data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true) + + opts := map[string]any{} + return data, opts, nil +} diff --git a/model/pixtral/imageproc_test.go b/model/pixtral/imageproc_test.go new file mode 100644 index 000000000..1d9e4ffe5 --- /dev/null +++ b/model/pixtral/imageproc_test.go @@ -0,0 +1,219 @@ +package pixtral + +import ( + "bytes" + "encoding/binary" + "image" + "image/png" + "math" + "os" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestGetNumImageTokens(t *testing.T) { + type numImageTokensCase struct { + ImageSize image.Point + PatchSize image.Point + Expected image.Point + } + + cases := []numImageTokensCase{ + { + ImageSize: image.Point{1024, 764}, + PatchSize: image.Point{16, 16}, + Expected: image.Point{64, 48}, + }, + { + ImageSize: image.Point{800, 600}, + PatchSize: image.Point{16, 16}, + Expected: image.Point{50, 38}, + }, + { + ImageSize: image.Point{640, 480}, + PatchSize: image.Point{16, 16}, + Expected: image.Point{40, 30}, + }, + { + ImageSize: image.Point{320, 200}, + PatchSize: image.Point{16, 16}, + Expected: image.Point{20, 13}, + }, + { + ImageSize: image.Point{1320, 200}, + PatchSize: image.Point{16, 16}, + Expected: image.Point{83, 13}, + }, + { + ImageSize: image.Point{2000, 200}, + PatchSize: image.Point{16, 16}, + Expected: image.Point{125, 13}, + }, + { + ImageSize: image.Point{10000, 200}, + PatchSize: image.Point{16, 16}, + Expected: image.Point{625, 13}, + }, + { + ImageSize: image.Point{1131, 577}, + PatchSize: image.Point{16, 16}, + Expected: image.Point{71, 37}, + }, + { + ImageSize: image.Point{16, 16}, + PatchSize: image.Point{16, 16}, + Expected: image.Point{1, 1}, + }, + } + + for _, c := range cases { + actual := getNumImageTokens(c.ImageSize, c.PatchSize) + + if diff := cmp.Diff(actual, c.Expected); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestGetResizeOutputImageSize(t *testing.T) { + type resizeCase struct { + Image image.Image + LongestEdge int + PatchSize image.Point + Expected image.Point + } + + cases := []resizeCase{ + { + Image: image.NewRGBA(image.Rect(0, 0, 1024, 768)), + LongestEdge: 1024, + PatchSize: image.Point{16, 16}, + Expected: image.Point{1024, 768}, + }, + { + Image: image.NewRGBA(image.Rect(0, 0, 1162, 690)), + LongestEdge: 1024, + PatchSize: image.Point{16, 16}, + Expected: image.Point{1024, 624}, + }, + { + Image: image.NewRGBA(image.Rect(0, 0, 300, 200)), + LongestEdge: 1024, + PatchSize: image.Point{16, 16}, + Expected: image.Point{304, 208}, + }, + { + Image: image.NewRGBA(image.Rect(0, 0, 1862, 522)), + LongestEdge: 1024, + PatchSize: image.Point{16, 16}, + Expected: image.Point{1024, 288}, + }, + } + + for _, c := range cases { + actual := getResizeOutputImageSize(c.Image, c.LongestEdge, c.PatchSize) + + if diff := cmp.Diff(actual, c.Expected); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestResize(t *testing.T) { + type resizeCase struct { + Image image.Image + LongestEdge int + PatchSize image.Point + Expected image.Image + } + + cases := []resizeCase{ + { + Image: image.NewRGBA(image.Rect(0, 0, 1862, 522)), + LongestEdge: 1024, + PatchSize: image.Point{16, 16}, + Expected: image.NewRGBA(image.Rect(0, 0, 1024, 288)), + }, + { + Image: image.NewRGBA(image.Rect(0, 0, 10, 10)), + LongestEdge: 1024, + PatchSize: image.Point{16, 16}, + Expected: image.NewRGBA(image.Rect(0, 0, 16, 16)), + }, + } + + for _, c := range cases { + actual := resizeImage(c.Image, "png", c.LongestEdge, c.PatchSize) + + if actual.Bounds() != c.Expected.Bounds() { + t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds()) + } + } +} + +func TestPreprocess(t *testing.T) { + type preprocessCase struct { + TestImage image.Image + ExpectedLen int + } + + cases := []preprocessCase{ + { + TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), + ExpectedLen: 16 * 16 * 3 * 1, + }, + { + TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)), + ExpectedLen: 1024 * 1024 * 3 * 1, + }, + } + + for _, c := range cases { + var buf bytes.Buffer + err := png.Encode(&buf, c.TestImage) + if err != nil { + t.Fatal(err) + } + + imgData, _, err := Preprocess(&buf) + if err != nil { + t.Fatalf("error processing: %q", err) + } + + switch len(imgData) { + case 0: + t.Errorf("no image data returned") + case c.ExpectedLen: + // ok + default: + t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen) + } + } +} + +func TestPreprocessImages(t *testing.T) { + for _, testFile := range []string{"flight.png", "sportsball.png"} { + f, err := os.Open(testFile) + if err != nil { + t.Skipf("skipping test, no test image found at %s", testFile) + } + defer f.Close() + + imgData, _, err := Preprocess(f) + if err != nil { + t.Fatalf("error processing: %q", err) + } + + byteData := make([]byte, len(imgData)*4) // float32 is 4 bytes + for i, f := range imgData { + binary.LittleEndian.PutUint32(byteData[i*4:], math.Float32bits(f)) + } + + outputPath := "processed_" + testFile + ".bin" + err = os.WriteFile(outputPath, byteData, 0o644) + if err != nil { + t.Fatalf("error writing processed image: %q", err) + } + } +} diff --git a/model/qwen2vl/imageproc.go b/model/qwen2vl/imageproc.go new file mode 100644 index 000000000..964b39072 --- /dev/null +++ b/model/qwen2vl/imageproc.go @@ -0,0 +1,74 @@ +package qwen2vl + +import ( + "fmt" + "image" + _ "image/jpeg" + _ "image/png" + "io" + "math" + + "github.com/ollama/ollama/model/imageproc" +) + +const ( + DefaultFactor = 28 + DefaultMinPixels = 56 * 56 + DefaultMaxPixels = 14 * 14 * 4 * 1280 +) + +// smartResize calculates the size of the image to resize to based on the +// factor, minPixels, and maxPixels. +func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point { + // 1. Both dimensions of size are divisible by factor + // 2. The area of the image is between minPixels and maxPixels + // 3. The aspect ratio of the image is as close to 1:1 as possible + + if size.Y < factor || size.X < factor { + panic("image is too small to resize") + } else if max(size.X, size.Y)/min(size.X, size.Y) > 200 { + panic("aspect ratio must be less than 200:1") + } + + f := float64(factor) + width := float64(size.X) + height := float64(size.Y) + + xBar := math.Round(width/f) * f + yBar := math.Round(height/f) * f + + if xBar*yBar > float64(maxPixels) { + beta := math.Sqrt(height * width / float64(maxPixels)) + xBar = math.Floor(width/beta/f) * f + yBar = math.Floor(height/beta/f) * f + } else if xBar*yBar < float64(minPixels) { + beta := math.Sqrt(float64(minPixels) / (height * width)) + xBar = math.Ceil(width*beta/f) * f + yBar = math.Ceil(height*beta/f) * f + } + + return image.Point{int(xBar), int(yBar)} +} + +func resizeImage(img image.Image, format string, size image.Point) image.Image { + if format == "png" { + img = imageproc.Composite(img) + } + + return imageproc.Resize(img, size, imageproc.ResizeBilinear) +} + +func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { + img, format, err := image.Decode(imageData) + if err != nil { + return nil, nil, fmt.Errorf("failed to decode image: %w", err) + } + + size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels) + img = resizeImage(img, format, size) + + data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true) + + opts := map[string]any{} + return data, opts, nil +} diff --git a/model/qwen2vl/imageproc_test.go b/model/qwen2vl/imageproc_test.go new file mode 100644 index 000000000..817b61a5c --- /dev/null +++ b/model/qwen2vl/imageproc_test.go @@ -0,0 +1,78 @@ +package qwen2vl + +import ( + "bytes" + "image" + "image/png" + "testing" +) + +func TestSmartResize(t *testing.T) { + type smartResizeCase struct { + TestImage image.Image + Expected image.Point + } + + cases := []smartResizeCase{ + { + TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)), + Expected: image.Point{980, 980}, + }, + { + TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), + Expected: image.Point{1036, 756}, + }, + { + TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)), + Expected: image.Point{980, 980}, + }, + } + + for _, c := range cases { + b := c.TestImage.Bounds().Max + actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels) + if actual != c.Expected { + t.Errorf("expected: %v, actual: %v", c.Expected, actual) + } + } +} + +func TestPreprocess(t *testing.T) { + type preprocessCase struct { + TestImage image.Image + ExpectedLen int + } + + cases := []preprocessCase{ + { + TestImage: image.NewRGBA(image.Rect(0, 0, 256, 256)), + ExpectedLen: 252 * 252 * 3 * 1, + }, + { + TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)), + ExpectedLen: 980 * 980 * 3 * 1, + }, + } + + for _, c := range cases { + var buf bytes.Buffer + err := png.Encode(&buf, c.TestImage) + if err != nil { + t.Fatal(err) + } + + imgData, _, err := Preprocess(&buf) + if err != nil { + t.Fatalf("error processing: %q", err) + } + + switch len(imgData) { + case 0: + t.Errorf("no image data returned") + case c.ExpectedLen: + // ok + default: + t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen) + } + } +} diff --git a/server/prompt.go b/server/prompt.go index 2ea8e4da5..cc69fe8cf 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -11,7 +11,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/llm" - "github.com/ollama/ollama/server/imageproc" + "github.com/ollama/ollama/model/mllama" "github.com/ollama/ollama/template" ) @@ -92,7 +92,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. var imgData llm.ImageData if isMllama { - data, aspectRatioID, err := imageproc.Preprocess(i) + data, opts, err := mllama.Preprocess(bytes.NewReader(i)) if err != nil { return "", nil, err } @@ -103,10 +103,15 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. return "", nil, err } + ar, ok := opts["aspectRatioIndex"].(int) + if !ok { + return "", nil, fmt.Errorf("missing aspect ratio for image") + } + imgData = llm.ImageData{ ID: len(images), Data: buf.Bytes(), - AspectRatioID: aspectRatioID, + AspectRatioID: ar, } imgPrompt = "<|image|>" } else { diff --git a/server/routes.go b/server/routes.go index d7a1b88db..0154dde70 100644 --- a/server/routes.go +++ b/server/routes.go @@ -31,10 +31,10 @@ import ( "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/model/mllama" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/runners" - "github.com/ollama/ollama/server/imageproc" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" @@ -205,12 +205,18 @@ func (s *Server) GenerateHandler(c *gin.Context) { images := make([]llm.ImageData, len(req.Images)) for i := range req.Images { if isMllama { - data, aspectRatioID, err := imageproc.Preprocess(req.Images[i]) + data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i])) if err != nil { c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) return } + ar, ok := opts["aspectRatioIndex"].(int) + if !ok { + c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) + return + } + buf := new(bytes.Buffer) err = binary.Write(buf, binary.LittleEndian, data) if err != nil { @@ -218,7 +224,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } - images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID} + images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar} } else { images[i] = llm.ImageData{ID: i, Data: req.Images[i]} }