runner.go: Better abstract vision model integration

-Update mllama to take the cross attention state as embeddings in a batch, more similar to how Llava handles it. This improves integration with the input cache. -Pass locations in a prompt for embeddings using tags similar to Llava. -Abstract interface to vision models so the main runner accesses Clip and Mllama similarly Co-authored-by: Michael Yang <mxyng@pm.me>
2025-05-11 10:26:53 +02:00 · 2024-10-11 15:34:01 -07:00 · 2024-10-11 15:34:01 -07:00 · c826e57475
commit c826e57475
parent 712e99d477
13 changed files with 534 additions and 454 deletions
--- a/server/routes.go
+++ b/server/routes.go
@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 				return
 			}

-			images[i] = llm.ImageData{Data: buf.Bytes(), AspectRatioID: aspectRatioID}
+			images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID}
 		} else {
 			images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
 		}
@ -239,11 +239,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}

 			for _, i := range images {
+				imgPrompt := ""
 				if isMllama {
-					msgs = append(msgs, api.Message{Role: "user", Content: "<|image|>"})
-				} else {
-					msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
+					imgPrompt = "<|image|>"
 				}
+				msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
 			}

 			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})