Mirror of https://github.com/ollama/ollama.git (synced 2025-05-11 10:26:53 +02:00)
runner.go: Better abstract vision model integration
- Update mllama to take the cross-attention state as embeddings in a batch, more similar to how Llava handles it. This improves integration with the input cache.
- Pass locations in a prompt for embeddings using tags similar to Llava.
- Abstract the interface to vision models so the main runner accesses Clip and Mllama similarly.

Co-authored-by: Michael Yang <mxyng@pm.me>
Parent: 712e99d477
Commit: c826e57475
13 changed files with 534 additions and 454 deletions
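The commit message describes abstracting the interface to vision models so the main runner can drive Clip (Llava) and Mllama the same way. As a rough orientation for the diff below, here is a minimal sketch of what such an abstraction could look like; the package, type, and method names are assumptions made for illustration and are not the actual code in this commit:

	// Hypothetical sketch of a common vision-model abstraction; names and
	// signatures are illustrative assumptions, not the repository's real API.
	package vision

	// ImageEmbedding holds the per-image output that gets spliced into the
	// input batch at the position marked by its prompt tag (e.g. "[img-0]").
	type ImageEmbedding struct {
		ID     int         // matches the [img-%d] tag in the prompt
		Tokens [][]float32 // one embedding vector per image "token"
	}

	// Model is the shared interface the runner could use so CLIP-style and
	// Mllama encoders are handled uniformly.
	type Model interface {
		// EmbedImage turns raw image bytes into embeddings that the text
		// model consumes alongside regular token embeddings.
		EmbedImage(id int, data []byte) (ImageEmbedding, error)
	}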
@@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 				return
 			}

-			images[i] = llm.ImageData{Data: buf.Bytes(), AspectRatioID: aspectRatioID}
+			images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID}
 		} else {
 			images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
 		}
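The hunk above gives preprocessed (Mllama) images an ID as well, so every image can be referenced from the prompt by its index, matching the raw-image path. A sketch of the llm.ImageData shape this implies, with the field set inferred from the diff rather than copied from the source:

	// Shape of llm.ImageData as implied by the diff above; the actual
	// definition lives elsewhere in the repository and may differ.
	type ImageData struct {
		ID            int    // index used by the [img-%d] prompt tag
		Data          []byte // raw or preprocessed image bytes
		AspectRatioID int    // Mllama-specific aspect-ratio bucket
	}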
@@ -239,11 +239,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		}

 		for _, i := range images {
+			imgPrompt := ""
 			if isMllama {
-				msgs = append(msgs, api.Message{Role: "user", Content: "<|image|>"})
-			} else {
-				msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
+				imgPrompt = "<|image|>"
 			}
+			msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
 		}

 		values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
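With this change every image is referenced by an index tag in the prompt, and Mllama images additionally carry the <|image|> marker after that tag. A small standalone example of the resulting tag strings; the image IDs and the isMllama flag here are made-up inputs for illustration:

	package main

	import "fmt"

	func main() {
		// Mirror the tag construction from the hunk above for two
		// hypothetical images: one Llava-style, one Mllama.
		for _, img := range []struct {
			id       int
			isMllama bool
		}{{0, false}, {1, true}} {
			imgPrompt := ""
			if img.isMllama {
				imgPrompt = "<|image|>"
			}
			content := fmt.Sprintf("[img-%d]"+imgPrompt, img.id)
			fmt.Println(content) // prints "[img-0]" then "[img-1]<|image|>"
		}
	}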