runner.go: Better abstract vision model integration

- Update mllama to take the cross-attention state as embeddings in
  a batch, similar to how Llava handles it. This improves
  integration with the input cache.
- Pass the locations of embeddings in a prompt using tags, similar to Llava.
- Abstract the interface to vision models so that the main runner accesses
  Clip and Mllama in the same way.

Co-authored-by: Michael Yang <mxyng@pm.me>
This commit is contained in:
Jesse Gross 2024-10-11 15:34:01 -07:00 committed by Jesse Gross
parent 712e99d477
commit c826e57475
13 changed files with 534 additions and 454 deletions

View file

@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
images[i] = llm.ImageData{Data: buf.Bytes(), AspectRatioID: aspectRatioID}
images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID}
} else {
images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
}
@ -239,11 +239,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}
for _, i := range images {
imgPrompt := ""
if isMllama {
msgs = append(msgs, api.Message{Role: "user", Content: "<|image|>"})
} else {
msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
imgPrompt = "<|image|>"
}
msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
}
values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})