From 8c9fb8eb73afc220e8bf99772572096b6498b748 Mon Sep 17 00:00:00 2001
From: Patrick Devine <patrick@infrahq.com>
Date: Sat, 14 Dec 2024 19:50:15 -0800
Subject: [PATCH] imageproc mllama refactor (#7537)

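Refactor the mllama image processing code out of server/imageproc into
model/mllama, move the shared compositing, resizing, and normalization
helpers into model/imageproc, and add image preprocessing for pixtral and
qwen2vl.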
---
 model/imageproc/images.go                     | 111 +++++++++
 model/imageproc/images_test.go                | 177 ++++++++++++++
 .../images.go => model/mllama/imageproc.go    | 181 ++++++---------
 .../mllama/imageproc_test.go                  |  22 +-
 model/pixtral/imageproc.go                    |  68 ++++++
 model/pixtral/imageproc_test.go               | 219 ++++++++++++++++++
 model/qwen2vl/imageproc.go                    |  74 ++++++
 model/qwen2vl/imageproc_test.go               |  78 +++++++
 server/prompt.go                              |  11 +-
 server/routes.go                              |  12 +-
 10 files changed, 828 insertions(+), 125 deletions(-)
 create mode 100644 model/imageproc/images.go
 create mode 100644 model/imageproc/images_test.go
 rename server/imageproc/images.go => model/mllama/imageproc.go (60%)
 rename server/imageproc/images_test.go => model/mllama/imageproc_test.go (95%)
 create mode 100644 model/pixtral/imageproc.go
 create mode 100644 model/pixtral/imageproc_test.go
 create mode 100644 model/qwen2vl/imageproc.go
 create mode 100644 model/qwen2vl/imageproc_test.go

diff --git a/model/imageproc/images.go b/model/imageproc/images.go
new file mode 100644
index 000000000..7afe36701
--- /dev/null
+++ b/model/imageproc/images.go
@@ -0,0 +1,111 @@
+package imageproc
+
+import (
+	"image"
+	"image/color"
+
+	"golang.org/x/image/draw"
+)
+
+var (
+	ImageNetDefaultMean  = [3]float32{0.485, 0.456, 0.406}
+	ImageNetDefaultSTD   = [3]float32{0.229, 0.224, 0.225}
+	ImageNetStandardMean = [3]float32{0.5, 0.5, 0.5}
+	ImageNetStandardSTD  = [3]float32{0.5, 0.5, 0.5}
+	ClipDefaultMean      = [3]float32{0.48145466, 0.4578275, 0.40821073}
+	ClipDefaultSTD       = [3]float32{0.26862954, 0.26130258, 0.27577711}
+)
+
+const (
+	ResizeBilinear = iota
+	ResizeNearestNeighbor
+	ResizeApproxBilinear
+	ResizeCatmullrom
+)
+
+// Composite returns an image with the alpha channel removed by drawing over a white background.
+func Composite(img image.Image) image.Image {
+	dst := image.NewRGBA(img.Bounds())
+
+	white := color.RGBA{255, 255, 255, 255}
+	draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
+	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
+
+	return dst
+}
+
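+// Resize returns a new image holding img scaled to newSize with the interpolation selected by method (one of the Resize* constants); it panics on any other method.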
+func Resize(img image.Image, newSize image.Point, method int) image.Image {
+	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
+
+	kernels := map[int]draw.Interpolator{
+		ResizeBilinear:        draw.BiLinear,
+		ResizeNearestNeighbor: draw.NearestNeighbor,
+		ResizeApproxBilinear:  draw.ApproxBiLinear,
+		ResizeCatmullrom:      draw.CatmullRom,
+	}
+
+	kernel, ok := kernels[method]
+	if !ok {
+		panic("no resizing method found")
+	}
+
+	kernel.Scale(dst, dst.Rect, img, img.Bounds(), draw.Over, nil)
+
+	return dst
+}
+
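+// Normalize returns the r, g, b values of every pixel in img as float32s, each computed as (value - mean) / std for its channel; with rescale the 8-bit channel value is first divided by 255, and with channelFirst the output is all r values, then all g, then all b, instead of interleaved per pixel. NOTE(review): when rescale is false the pixel values are never read, so every output is -mean/std for its channel, which looks unintended.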
+func Normalize(img image.Image, mean, std [3]float32, rescale bool, channelFirst bool) []float32 {
+	var pixelVals []float32
+
+	bounds := img.Bounds()
+	if channelFirst {
+		var rVals, gVals, bVals []float32
+		for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
+			for x := bounds.Min.X; x < bounds.Max.X; x++ {
+				c := img.At(x, y)
+				r, g, b, _ := c.RGBA()
+				var rVal, gVal, bVal float32
+				if rescale {
+					rVal = float32(r>>8) / 255.0
+					gVal = float32(g>>8) / 255.0
+					bVal = float32(b>>8) / 255.0
+				}
+
+				rVal = (rVal - mean[0]) / std[0]
+				gVal = (gVal - mean[1]) / std[1]
+				bVal = (bVal - mean[2]) / std[2]
+
+				rVals = append(rVals, rVal)
+				gVals = append(gVals, gVal)
+				bVals = append(bVals, bVal)
+			}
+		}
+
+		pixelVals = append(pixelVals, rVals...)
+		pixelVals = append(pixelVals, gVals...)
+		pixelVals = append(pixelVals, bVals...)
+	} else {
+		for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
+			for x := bounds.Min.X; x < bounds.Max.X; x++ {
+				c := img.At(x, y)
+				r, g, b, _ := c.RGBA()
+				var rVal, gVal, bVal float32
+				if rescale {
+					rVal = float32(r>>8) / 255.0
+					gVal = float32(g>>8) / 255.0
+					bVal = float32(b>>8) / 255.0
+				}
+
+				rVal = (rVal - mean[0]) / std[0]
+				gVal = (gVal - mean[1]) / std[1]
+				bVal = (bVal - mean[2]) / std[2]
+
+				pixelVals = append(pixelVals, rVal, gVal, bVal)
+			}
+		}
+	}
+
+	return pixelVals
+}
diff --git a/model/imageproc/images_test.go b/model/imageproc/images_test.go
new file mode 100644
index 000000000..a2e9ed94d
--- /dev/null
+++ b/model/imageproc/images_test.go
@@ -0,0 +1,177 @@
+package imageproc
+
+import (
+	"image"
+	"image/color"
+	"image/draw"
+	"reflect"
+	"testing"
+)
+
+func createImage(width, height int, fillCol color.RGBA) image.Image {
+	img := image.NewRGBA(image.Rect(0, 0, width, height))
+	draw.Draw(img, img.Bounds(), &image.Uniform{fillCol}, image.Point{}, draw.Src)
+	return img
+}
+
+func TestComposite(t *testing.T) {
+	tests := []struct {
+		name         string
+		img          image.Image
+		expectedRGBA color.RGBA
+	}{
+		{
+			name:         "Transparent image",
+			img:          createImage(5, 5, color.RGBA{0, 0, 0, 0}),
+			expectedRGBA: color.RGBA{255, 255, 255, 255},
+		},
+		{
+			name:         "Solid red image",
+			img:          createImage(5, 5, color.RGBA{255, 0, 0, 255}),
+			expectedRGBA: color.RGBA{255, 0, 0, 255},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			resultImg := Composite(tt.img)
+
+			// Check the pixel values in the resulting image
+			for x := range resultImg.Bounds().Dx() {
+				for y := range resultImg.Bounds().Dy() {
+					r, g, b, a := resultImg.At(x, y).RGBA()
+					expectedR, expectedG, expectedB, expectedA := tt.expectedRGBA.RGBA()
+
+					if r != expectedR || g != expectedG || b != expectedB || a != expectedA {
+						t.Errorf("Pixel mismatch at (%d, %d): got (%d, %d, %d, %d), want (%d, %d, %d, %d)",
+							x, y, r, g, b, a, expectedR, expectedG, expectedB, expectedA)
+					}
+				}
+			}
+		})
+	}
+}
+
+func TestResize(t *testing.T) {
+	tests := []struct {
+		name     string
+		img      image.Image
+		newSize  image.Point
+		method   int
+		expected image.Point
+	}{
+		{
+			name:     "Resize with bilinear interpolation",
+			img:      createImage(5, 5, color.RGBA{255, 0, 0, 255}),
+			newSize:  image.Point{10, 10},
+			method:   ResizeBilinear,
+			expected: image.Point{10, 10},
+		},
+		{
+			name:     "Resize with nearest neighbor",
+			img:      createImage(10, 10, color.RGBA{0, 255, 0, 255}),
+			newSize:  image.Point{5, 5},
+			method:   ResizeNearestNeighbor,
+			expected: image.Point{5, 5},
+		},
+		{
+			name:     "Resize with catmullrom",
+			img:      createImage(1024, 1024, color.RGBA{0, 0, 255, 255}),
+			newSize:  image.Point{10, 10},
+			method:   ResizeCatmullrom,
+			expected: image.Point{10, 10},
+		},
+		{
+			name:     "Resize with approx bilinear",
+			img:      createImage(1024, 768, color.RGBA{100, 100, 100, 255}),
+			newSize:  image.Point{4, 3},
+			method:   ResizeApproxBilinear,
+			expected: image.Point{4, 3},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			resizedImg := Resize(tt.img, tt.newSize, tt.method)
+
+			if resizedImg.Bounds().Dx() != tt.expected.X || resizedImg.Bounds().Dy() != tt.expected.Y {
+				t.Errorf("Unexpected size for resized image: got (%d, %d), want (%d, %d)",
+					resizedImg.Bounds().Dx(), resizedImg.Bounds().Dy(), tt.expected.X, tt.expected.Y)
+			}
+		})
+	}
+}
+
+func TestResizeInvalidMethod(t *testing.T) {
+	defer func() {
+		if r := recover(); r == nil {
+			t.Errorf("Expected panic for invalid resizing method, but did not panic")
+		}
+	}()
+
+	img := createImage(10, 10, color.RGBA{0, 0, 0, 255})
+	Resize(img, image.Point{5, 5}, -1)
+}
+
+func TestNormalize(t *testing.T) {
+	tests := []struct {
+		name         string
+		img          image.Image
+		mean         [3]float32
+		std          [3]float32
+		rescale      bool
+		channelFirst bool
+		expected     []float32
+	}{
+		{
+			name:         "Rescale with channel first",
+			img:          createImage(2, 2, color.RGBA{128, 128, 128, 255}),
+			mean:         ImageNetStandardMean,
+			std:          ImageNetStandardSTD,
+			rescale:      true,
+			channelFirst: true,
+			expected: []float32{
+				0.003921628, 0.003921628, 0.003921628, 0.003921628, // R values
+				0.003921628, 0.003921628, 0.003921628, 0.003921628, // G values
+				0.003921628, 0.003921628, 0.003921628, 0.003921628, // B values
+			},
+		},
+		{
+			name:         "Rescale without channel first",
+			img:          createImage(2, 2, color.RGBA{255, 0, 0, 255}),
+			mean:         [3]float32{0.0, 0.0, 0.0},
+			std:          [3]float32{1.0, 1.0, 1.0},
+			rescale:      true,
+			channelFirst: false,
+			expected: []float32{
+				1.0, 0.0, 0.0,
+				1.0, 0.0, 0.0,
+				1.0, 0.0, 0.0,
+				1.0, 0.0, 0.0,
+			},
+		},
+		{
+			name:         "No rescale with mean/std adjustment",
+			img:          createImage(2, 2, color.RGBA{100, 150, 200, 255}),
+			mean:         ClipDefaultMean,
+			std:          ClipDefaultSTD,
+			rescale:      false,
+			channelFirst: false,
+			expected: []float32{
+				-1.7922626, -1.7520971, -1.4802198,
+				-1.7922626, -1.7520971, -1.4802198,
+				-1.7922626, -1.7520971, -1.4802198,
+				-1.7922626, -1.7520971, -1.4802198,
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := Normalize(tt.img, tt.mean, tt.std, tt.rescale, tt.channelFirst)
+
+			if !reflect.DeepEqual(result, tt.expected) {
+				t.Errorf("Test %s failed: got %v, want %v", tt.name, result, tt.expected)
+			}
+		})
+	}
+}
diff --git a/server/imageproc/images.go b/model/mllama/imageproc.go
similarity index 60%
rename from server/imageproc/images.go
rename to model/mllama/imageproc.go
index 688cbf8ad..13f2fb8b3 100644
--- a/server/imageproc/images.go
+++ b/model/mllama/imageproc.go
@@ -1,19 +1,20 @@
-package imageproc
+package mllama
 
 import (
-	"bytes"
 	"fmt"
 	"image"
-	"image/color"
 	_ "image/jpeg"
 	_ "image/png"
+	"io"
 	"math"
 	"slices"
 
 	"golang.org/x/image/draw"
+
+	"github.com/ollama/ollama/model/imageproc"
 )
 
-func GetSupportedAspectRatios(maxTiles int) []image.Point {
+func getSupportedAspectRatios(maxTiles int) []image.Point {
 	ratios := []image.Point{}
 
 	for w := range maxTiles {
@@ -37,28 +38,8 @@ func clip(a, a_min, a_max int) int {
 	return a
 }
 
-func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
-	targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
-	targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
-
-	scaleWidth := float64(targetWidth) / float64(imageSize.X)
-	scaleHeight := float64(targetHeight) / float64(imageSize.Y)
-
-	var w, h int
-
-	if scaleWidth < scaleHeight {
-		w = targetWidth
-		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
-	} else {
-		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
-		h = targetHeight
-	}
-
-	return image.Point{w, h}
-}
-
 func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
-	possibleTileArrangements := GetSupportedAspectRatios(maxImageTiles)
+	possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
 	possibleCanvasSizes := []image.Point{}
 	for _, pta := range possibleTileArrangements {
 		possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
@@ -113,6 +94,53 @@ func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i
 	return selectedCanvas
 }
 
+func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
+	targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
+	targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
+
+	scaleWidth := float64(targetWidth) / float64(imageSize.X)
+	scaleHeight := float64(targetHeight) / float64(imageSize.Y)
+
+	var w, h int
+
+	if scaleWidth < scaleHeight {
+		w = targetWidth
+		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
+	} else {
+		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
+		h = targetHeight
+	}
+
+	return image.Point{w, h}
+}
+
+func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
+	if format == "png" {
+		img = imageproc.Composite(img)
+	}
+
+	b := img.Bounds()
+	tileSize := outputSize.Y
+
+	canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
+	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
+	newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
+
+	return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
+}
+
+func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
+	paddedSize := image.Point{
+		X: outputSize.X * aspectRatio.X,
+		Y: outputSize.Y * aspectRatio.Y,
+	}
+
+	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
+	draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
+
+	return dst
+}
+
 func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
 	b := img.Bounds()
 	width := b.Max.X - b.Min.X
@@ -134,107 +162,40 @@ func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
 	return images
 }
 
-// remove the "alpha" channel by drawing over a prefilled image
-func compositeImage(img image.Image) image.Image {
-	dst := image.NewRGBA(img.Bounds())
-
-	white := color.RGBA{255, 255, 255, 255}
-	draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
-	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
-
-	return dst
-}
-
-func ResizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
-	if format == "png" {
-		img = compositeImage(img)
-	}
-
-	b := img.Bounds()
-	tileSize := outputSize.Y
-
-	canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
-	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
-	newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
-
-	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
-
-	// scaling choices:
-	//   NearestNeighbor	fast, blocky output
-	//   ApproxBiLinear	fast, medium quality
-	//   BiLinear		slow, high quality
-	//   CatmullRom		very slow, very high quality
-	draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
-
-	return dst, aspectRatio
-}
-
-func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
-	paddedSize := image.Point{
-		X: outputSize.X * aspectRatio.X,
-		Y: outputSize.Y * aspectRatio.Y,
-	}
-
-	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
-	draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
-
-	return dst
-}
-
-func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
+func packImages(img image.Image, aspectRatio image.Point) []float32 {
 	subImages := splitToTiles(img, aspectRatio)
 
 	var pixelVals []float32
 
+	rescale := true
+	channelFirst := true
+
 	for _, subImg := range subImages {
-		bounds := subImg.Bounds()
-		var rVals, gVals, bVals []float32
-		for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
-			for x := bounds.Min.X; x < bounds.Max.X; x++ {
-				c := subImg.At(x, y)
-				r, g, b, _ := c.RGBA()
-				rVal := float32(r>>8) / 255.0
-				gVal := float32(g>>8) / 255.0
-				bVal := float32(b>>8) / 255.0
-
-				rVal = (rVal - mean[0]) / std[0]
-				gVal = (gVal - mean[1]) / std[1]
-				bVal = (bVal - mean[2]) / std[2]
-
-				rVals = append(rVals, rVal)
-				gVals = append(gVals, gVal)
-				bVals = append(bVals, bVal)
-			}
-		}
-		pixelVals = append(pixelVals, rVals...)
-		pixelVals = append(pixelVals, gVals...)
-		pixelVals = append(pixelVals, bVals...)
+		vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
+		pixelVals = append(pixelVals, vals...)
 	}
 
 	return pixelVals
 }
 
-func Preprocess(imageData []byte) ([]float32, int, error) {
-	// todo: need guard in here for bad image data
-
-	// mllama values
+func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
 	outputSize := image.Point{560, 560}
 	maxTiles := 4
 
-	// clip values
-	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
-	std := [3]float32{0.26862954, 0.26130258, 0.27577711}
-
-	img, format, err := image.Decode(bytes.NewReader(imageData))
+	img, format, err := image.Decode(imageData)
 	if err != nil {
-		return nil, 0, fmt.Errorf("failed to decode image: %w", err)
+		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
 	}
 
-	newImage, aspectRatio := ResizeImage(img, format, outputSize, maxTiles)
-	newImage = PadImage(newImage, outputSize, aspectRatio)
+	newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
+	newImage = padImage(newImage, outputSize, aspectRatio)
 
-	data := PackImages(newImage, aspectRatio, mean, std)
-	aspectRatioIndex := slices.Index(GetSupportedAspectRatios(maxTiles), aspectRatio) + 1
+	data := packImages(newImage, aspectRatio)
+	aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1
 
-	return data, aspectRatioIndex, nil
+	opts := map[string]any{
+		"aspectRatioIndex": aspectRatioIndex,
+	}
+
+	return data, opts, nil
 }
diff --git a/server/imageproc/images_test.go b/model/mllama/imageproc_test.go
similarity index 95%
rename from server/imageproc/images_test.go
rename to model/mllama/imageproc_test.go
index 7ad5329b6..a14b91bd1 100644
--- a/server/imageproc/images_test.go
+++ b/model/mllama/imageproc_test.go
@@ -1,4 +1,4 @@
-package imageproc
+package mllama
 
 import (
 	"bytes"
@@ -35,7 +35,7 @@ func TestAspectRatios(t *testing.T) {
 	}
 
 	for _, c := range cases {
-		actual := GetSupportedAspectRatios(c.MaxTiles)
+		actual := getSupportedAspectRatios(c.MaxTiles)
 
 		if diff := cmp.Diff(actual, c.Expected); diff != "" {
 			t.Errorf("mismatch (-got +want):\n%s", diff)
@@ -299,7 +299,7 @@ func TestResize(t *testing.T) {
 	}
 
 	for _, c := range cases {
-		actualImage, actualAspectRatio := ResizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
+		actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
 
 		if actualImage.Bounds() != c.ExpectedImage.Bounds() {
 			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
@@ -329,7 +329,7 @@ func TestPad(t *testing.T) {
 	}
 
 	for _, c := range cases {
-		actual := PadImage(c.TestImage, c.OutputSize, c.AspectRatio)
+		actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)
 
 		if actual.Bounds() != c.Expected.Bounds() {
 			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
@@ -344,9 +344,6 @@ func TestPackImages(t *testing.T) {
 		ExpectedVals int
 	}
 
-	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
-	std := [3]float32{0.26862954, 0.26130258, 0.27577711}
-
 	cases := []packCase{
 		{
 			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
@@ -366,7 +363,7 @@ func TestPackImages(t *testing.T) {
 	}
 
 	for _, c := range cases {
-		actualVals := PackImages(c.TestImage, c.AspectRatio, mean, std)
+		actualVals := packImages(c.TestImage, c.AspectRatio)
 		if len(actualVals) != c.ExpectedVals {
 			t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
 		}
@@ -400,7 +397,7 @@ func TestPreprocess(t *testing.T) {
 			t.Fatal(err)
 		}
 
-		imgData, aspectRatioID, err := Preprocess(buf.Bytes())
+		imgData, opts, err := Preprocess(&buf)
 		if err != nil {
 			t.Fatalf("error processing: %q", err)
 		}
@@ -409,6 +406,13 @@ func TestPreprocess(t *testing.T) {
 			t.Errorf("no image data returned")
 		}
 
+		ar, ok := opts["aspectRatioIndex"]
+		if !ok {
+			t.Fatalf("no aspect ratio found")
+		}
+
+		aspectRatioID := ar.(int)
+
 		if aspectRatioID != c.ExpectedAspectRatioID {
 			t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
 		}
diff --git a/model/pixtral/imageproc.go b/model/pixtral/imageproc.go
new file mode 100644
index 000000000..16ec0c410
--- /dev/null
+++ b/model/pixtral/imageproc.go
@@ -0,0 +1,68 @@
+package pixtral
+
+import (
+	"fmt"
+	"image"
+	_ "image/jpeg"
+	_ "image/png"
+	"io"
+	"math"
+
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
+	return image.Point{
+		(imageSize.X-1)/patchSize.X + 1,
+		(imageSize.Y-1)/patchSize.Y + 1,
+	}
+}
+
+func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
+	b := img.Bounds()
+	le := float64(longestEdge)
+	ratio := math.Max(float64(b.Max.Y)/le, float64(b.Max.X)/le)
+
+	newSize := img.Bounds().Max
+
+	if ratio > 1.0 {
+		newSize = image.Point{
+			int(math.Ceil(float64(b.Max.X) / ratio)),
+			int(math.Ceil(float64(b.Max.Y) / ratio)),
+		}
+	}
+
+	tokens := getNumImageTokens(newSize, patchSize)
+	return image.Point{
+		tokens.X * patchSize.X,
+		tokens.Y * patchSize.Y,
+	}
+}
+
+func resizeImage(img image.Image, format string, longestEdge int, patchSize image.Point) image.Image {
+	if format == "png" {
+		img = imageproc.Composite(img)
+	}
+
+	newSize := getResizeOutputImageSize(img, longestEdge, patchSize)
+
+	// TODO: this should use bicubic resampling, but imageproc does not define a ResizeBicubic method yet
+	return imageproc.Resize(img, newSize, imageproc.ResizeBilinear)
+}
+
+func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
+	img, format, err := image.Decode(imageData)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
+	}
+
+	longestEdge := 1024
+	patchSize := image.Point{16, 16}
+
+	img = resizeImage(img, format, longestEdge, patchSize)
+
+	data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
+
+	opts := map[string]any{}
+	return data, opts, nil
+}
diff --git a/model/pixtral/imageproc_test.go b/model/pixtral/imageproc_test.go
new file mode 100644
index 000000000..1d9e4ffe5
--- /dev/null
+++ b/model/pixtral/imageproc_test.go
@@ -0,0 +1,219 @@
+package pixtral
+
+import (
+	"bytes"
+	"encoding/binary"
+	"image"
+	"image/png"
+	"math"
+	"os"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestGetNumImageTokens(t *testing.T) {
+	type numImageTokensCase struct {
+		ImageSize image.Point
+		PatchSize image.Point
+		Expected  image.Point
+	}
+
+	cases := []numImageTokensCase{
+		{
+			ImageSize: image.Point{1024, 764},
+			PatchSize: image.Point{16, 16},
+			Expected:  image.Point{64, 48},
+		},
+		{
+			ImageSize: image.Point{800, 600},
+			PatchSize: image.Point{16, 16},
+			Expected:  image.Point{50, 38},
+		},
+		{
+			ImageSize: image.Point{640, 480},
+			PatchSize: image.Point{16, 16},
+			Expected:  image.Point{40, 30},
+		},
+		{
+			ImageSize: image.Point{320, 200},
+			PatchSize: image.Point{16, 16},
+			Expected:  image.Point{20, 13},
+		},
+		{
+			ImageSize: image.Point{1320, 200},
+			PatchSize: image.Point{16, 16},
+			Expected:  image.Point{83, 13},
+		},
+		{
+			ImageSize: image.Point{2000, 200},
+			PatchSize: image.Point{16, 16},
+			Expected:  image.Point{125, 13},
+		},
+		{
+			ImageSize: image.Point{10000, 200},
+			PatchSize: image.Point{16, 16},
+			Expected:  image.Point{625, 13},
+		},
+		{
+			ImageSize: image.Point{1131, 577},
+			PatchSize: image.Point{16, 16},
+			Expected:  image.Point{71, 37},
+		},
+		{
+			ImageSize: image.Point{16, 16},
+			PatchSize: image.Point{16, 16},
+			Expected:  image.Point{1, 1},
+		},
+	}
+
+	for _, c := range cases {
+		actual := getNumImageTokens(c.ImageSize, c.PatchSize)
+
+		if diff := cmp.Diff(actual, c.Expected); diff != "" {
+			t.Errorf("mismatch (-got +want):\n%s", diff)
+		}
+	}
+}
+
+func TestGetResizeOutputImageSize(t *testing.T) {
+	type resizeCase struct {
+		Image       image.Image
+		LongestEdge int
+		PatchSize   image.Point
+		Expected    image.Point
+	}
+
+	cases := []resizeCase{
+		{
+			Image:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
+			LongestEdge: 1024,
+			PatchSize:   image.Point{16, 16},
+			Expected:    image.Point{1024, 768},
+		},
+		{
+			Image:       image.NewRGBA(image.Rect(0, 0, 1162, 690)),
+			LongestEdge: 1024,
+			PatchSize:   image.Point{16, 16},
+			Expected:    image.Point{1024, 624},
+		},
+		{
+			Image:       image.NewRGBA(image.Rect(0, 0, 300, 200)),
+			LongestEdge: 1024,
+			PatchSize:   image.Point{16, 16},
+			Expected:    image.Point{304, 208},
+		},
+		{
+			Image:       image.NewRGBA(image.Rect(0, 0, 1862, 522)),
+			LongestEdge: 1024,
+			PatchSize:   image.Point{16, 16},
+			Expected:    image.Point{1024, 288},
+		},
+	}
+
+	for _, c := range cases {
+		actual := getResizeOutputImageSize(c.Image, c.LongestEdge, c.PatchSize)
+
+		if diff := cmp.Diff(actual, c.Expected); diff != "" {
+			t.Errorf("mismatch (-got +want):\n%s", diff)
+		}
+	}
+}
+
+func TestResize(t *testing.T) {
+	type resizeCase struct {
+		Image       image.Image
+		LongestEdge int
+		PatchSize   image.Point
+		Expected    image.Image
+	}
+
+	cases := []resizeCase{
+		{
+			Image:       image.NewRGBA(image.Rect(0, 0, 1862, 522)),
+			LongestEdge: 1024,
+			PatchSize:   image.Point{16, 16},
+			Expected:    image.NewRGBA(image.Rect(0, 0, 1024, 288)),
+		},
+		{
+			Image:       image.NewRGBA(image.Rect(0, 0, 10, 10)),
+			LongestEdge: 1024,
+			PatchSize:   image.Point{16, 16},
+			Expected:    image.NewRGBA(image.Rect(0, 0, 16, 16)),
+		},
+	}
+
+	for _, c := range cases {
+		actual := resizeImage(c.Image, "png", c.LongestEdge, c.PatchSize)
+
+		if actual.Bounds() != c.Expected.Bounds() {
+			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
+		}
+	}
+}
+
+func TestPreprocess(t *testing.T) {
+	type preprocessCase struct {
+		TestImage   image.Image
+		ExpectedLen int
+	}
+
+	cases := []preprocessCase{
+		{
+			TestImage:   image.NewRGBA(image.Rect(0, 0, 10, 10)),
+			ExpectedLen: 16 * 16 * 3 * 1,
+		},
+		{
+			TestImage:   image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
+			ExpectedLen: 1024 * 1024 * 3 * 1,
+		},
+	}
+
+	for _, c := range cases {
+		var buf bytes.Buffer
+		err := png.Encode(&buf, c.TestImage)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		imgData, _, err := Preprocess(&buf)
+		if err != nil {
+			t.Fatalf("error processing: %q", err)
+		}
+
+		switch len(imgData) {
+		case 0:
+			t.Errorf("no image data returned")
+		case c.ExpectedLen:
+			// ok
+		default:
+			t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen)
+		}
+	}
+}
+
+func TestPreprocessImages(t *testing.T) {
+	for _, testFile := range []string{"flight.png", "sportsball.png"} {
+		f, err := os.Open(testFile)
+		if err != nil {
+			t.Skipf("skipping test, no test image found at %s", testFile)
+		}
+		defer f.Close()
+
+		imgData, _, err := Preprocess(f)
+		if err != nil {
+			t.Fatalf("error processing: %q", err)
+		}
+
+		byteData := make([]byte, len(imgData)*4) // float32 is 4 bytes
+		for i, f := range imgData {
+			binary.LittleEndian.PutUint32(byteData[i*4:], math.Float32bits(f))
+		}
+
+		outputPath := "processed_" + testFile + ".bin"
+		err = os.WriteFile(outputPath, byteData, 0o644)
+		if err != nil {
+			t.Fatalf("error writing processed image: %q", err)
+		}
+	}
+}
diff --git a/model/qwen2vl/imageproc.go b/model/qwen2vl/imageproc.go
new file mode 100644
index 000000000..964b39072
--- /dev/null
+++ b/model/qwen2vl/imageproc.go
@@ -0,0 +1,74 @@
+package qwen2vl
+
+import (
+	"fmt"
+	"image"
+	_ "image/jpeg"
+	_ "image/png"
+	"io"
+	"math"
+
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+const (
+	DefaultFactor    = 28
+	DefaultMinPixels = 56 * 56
+	DefaultMaxPixels = 14 * 14 * 4 * 1280
+)
+
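+// smartResize returns the size an image of the given size should be resized to, based on factor, minPixels, and
+// maxPixels. It panics if either dimension is smaller than factor or if max/min of the sides (integer division) exceeds 200.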
+func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
+	// 1. Both dimensions of the returned size are multiples of factor
+	// 2. The area of the returned size is between minPixels and maxPixels
+	// 3. The aspect ratio of the image is maintained as closely as possible
+
+	if size.Y < factor || size.X < factor {
+		panic("image is too small to resize")
+	} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
+		panic("aspect ratio must be less than 200:1")
+	}
+
+	f := float64(factor)
+	width := float64(size.X)
+	height := float64(size.Y)
+
+	xBar := math.Round(width/f) * f
+	yBar := math.Round(height/f) * f
+
+	if xBar*yBar > float64(maxPixels) {
+		beta := math.Sqrt(height * width / float64(maxPixels))
+		xBar = math.Floor(width/beta/f) * f
+		yBar = math.Floor(height/beta/f) * f
+	} else if xBar*yBar < float64(minPixels) {
+		beta := math.Sqrt(float64(minPixels) / (height * width))
+		xBar = math.Ceil(width*beta/f) * f
+		yBar = math.Ceil(height*beta/f) * f
+	}
+
+	return image.Point{int(xBar), int(yBar)}
+}
+
+func resizeImage(img image.Image, format string, size image.Point) image.Image {
+	if format == "png" {
+		img = imageproc.Composite(img)
+	}
+
+	return imageproc.Resize(img, size, imageproc.ResizeBilinear)
+}
+
+func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
+	img, format, err := image.Decode(imageData)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
+	}
+
+	size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
+	img = resizeImage(img, format, size)
+
+	data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
+
+	opts := map[string]any{}
+	return data, opts, nil
+}
diff --git a/model/qwen2vl/imageproc_test.go b/model/qwen2vl/imageproc_test.go
new file mode 100644
index 000000000..817b61a5c
--- /dev/null
+++ b/model/qwen2vl/imageproc_test.go
@@ -0,0 +1,78 @@
+package qwen2vl
+
+import (
+	"bytes"
+	"image"
+	"image/png"
+	"testing"
+)
+
+func TestSmartResize(t *testing.T) {
+	type smartResizeCase struct {
+		TestImage image.Image
+		Expected  image.Point
+	}
+
+	cases := []smartResizeCase{
+		{
+			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
+			Expected:  image.Point{980, 980},
+		},
+		{
+			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
+			Expected:  image.Point{1036, 756},
+		},
+		{
+			TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
+			Expected:  image.Point{980, 980},
+		},
+	}
+
+	for _, c := range cases {
+		b := c.TestImage.Bounds().Max
+		actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
+		if actual != c.Expected {
+			t.Errorf("expected: %v, actual: %v", c.Expected, actual)
+		}
+	}
+}
+
+func TestPreprocess(t *testing.T) {
+	type preprocessCase struct {
+		TestImage   image.Image
+		ExpectedLen int
+	}
+
+	cases := []preprocessCase{
+		{
+			TestImage:   image.NewRGBA(image.Rect(0, 0, 256, 256)),
+			ExpectedLen: 252 * 252 * 3 * 1,
+		},
+		{
+			TestImage:   image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
+			ExpectedLen: 980 * 980 * 3 * 1,
+		},
+	}
+
+	for _, c := range cases {
+		var buf bytes.Buffer
+		err := png.Encode(&buf, c.TestImage)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		imgData, _, err := Preprocess(&buf)
+		if err != nil {
+			t.Fatalf("error processing: %q", err)
+		}
+
+		switch len(imgData) {
+		case 0:
+			t.Errorf("no image data returned")
+		case c.ExpectedLen:
+			// ok
+		default:
+			t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen)
+		}
+	}
+}
diff --git a/server/prompt.go b/server/prompt.go
index 2ea8e4da5..cc69fe8cf 100644
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -11,7 +11,7 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/server/imageproc"
+	"github.com/ollama/ollama/model/mllama"
 	"github.com/ollama/ollama/template"
 )
 
@@ -92,7 +92,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 			var imgData llm.ImageData
 
 			if isMllama {
-				data, aspectRatioID, err := imageproc.Preprocess(i)
+				data, opts, err := mllama.Preprocess(bytes.NewReader(i))
 				if err != nil {
 					return "", nil, err
 				}
@@ -103,10 +103,15 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 					return "", nil, err
 				}
 
+				ar, ok := opts["aspectRatioIndex"].(int)
+				if !ok {
+					return "", nil, fmt.Errorf("missing aspect ratio for image")
+				}
+
 				imgData = llm.ImageData{
 					ID:            len(images),
 					Data:          buf.Bytes(),
-					AspectRatioID: aspectRatioID,
+					AspectRatioID: ar,
 				}
 				imgPrompt = "<|image|>"
 			} else {
diff --git a/server/routes.go b/server/routes.go
index d7a1b88db..0154dde70 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -31,10 +31,10 @@ import (
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/model/mllama"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/runners"
-	"github.com/ollama/ollama/server/imageproc"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
@@ -205,12 +205,18 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	images := make([]llm.ImageData, len(req.Images))
 	for i := range req.Images {
 		if isMllama {
-			data, aspectRatioID, err := imageproc.Preprocess(req.Images[i])
+			data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
 			if err != nil {
 				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
 				return
 			}
 
+			ar, ok := opts["aspectRatioIndex"].(int)
+			if !ok {
+				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
+				return
+			}
+
 			buf := new(bytes.Buffer)
 			err = binary.Write(buf, binary.LittleEndian, data)
 			if err != nil {
@@ -218,7 +224,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 				return
 			}
 
-			images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID}
+			images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
 		} else {
 			images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
 		}