mirror of https://github.com/ollama/ollama.git (synced 2025-05-10 18:06:33 +02:00)

remove mllama integration, use ollama engine

This commit is contained in:
parent f8586c6b2b
commit 54d47159f7

7 changed files with 14 additions and 200 deletions
@@ -509,63 +509,6 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
 	return embed, nil
 }
 
-type MllamaContext struct {
-	c *C.struct_mllama_ctx
-}
-
-func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
-	mp := C.CString(modelPath)
-	defer C.free(unsafe.Pointer(mp))
-	c := C.mllama_model_load(mp, 1)
-	if c == nil {
-		return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
-	}
-
-	projEmbedSize := int(C.mllama_n_embd(c))
-	modelEmbedSize := llamaContext.Model().NEmbd()
-	if projEmbedSize != modelEmbedSize {
-		return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
-	}
-
-	return &MllamaContext{c: c}, nil
-}
-
-func (m *MllamaContext) Free() {
-	C.mllama_free(m.c)
-}
-
-func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
-	img := C.mllama_image_init()
-	defer C.mllama_image_free(img)
-
-	ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
-	if !ok {
-		return nil, errors.New("unable to load mllama image data")
-	}
-
-	rows := make([]float32, m.EmbedSize(llamaContext))
-	ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
-	if !ok {
-		return nil, errors.New("unable to make mllama embedding from image")
-	}
-
-	embed := make([][]float32, 1)
-	embed[0] = rows
-
-	return embed, nil
-}
-
-func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
-	numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
-	numEmbed := llamaContext.Model().NEmbd()
-
-	return numTokens * numEmbed
-}
-
-func (c *Context) SetCrossAttention(state bool) {
-	C.llama_set_cross_attention(c.c, C.bool(state))
-}
-
 func (c *Context) Synchronize() {
 	C.llama_synchronize(c.c)
 }
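
Note on the removed binding above: EmbedSize multiplies mllama_n_positions by mllama_n_tiles and by the model's embedding width, and NewEmbed returns that flat buffer as a single row, so an entire image is handed back as one large embedding. A standalone Go sketch of that arithmetic follows; the concrete values are hypothetical stand-ins, not read from any model.

package main

import "fmt"

// Illustrative sketch only: size one flat float32 buffer the way the removed
// MllamaContext.NewEmbed did, then wrap it as a single [][]float32 row.
func main() {
	nPositions := 1601 // hypothetical mllama_n_positions
	nTiles := 4        // hypothetical mllama_n_tiles
	nEmbd := 4096      // hypothetical model embedding width

	numTokens := nPositions * nTiles
	rows := make([]float32, numTokens*nEmbd)

	embed := [][]float32{rows}
	fmt.Printf("rows: %d, floats per row: %d\n", len(embed), len(embed[0]))
}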
@@ -3,7 +3,6 @@ package mllama
 import (
 	"bytes"
 	"encoding/binary"
 	"fmt"
 	"hash/fnv"
 	"image"
 	"slices"
@@ -34,10 +33,6 @@ const (
 )
 
 func New(c fs.Config) (model.Model, error) {
-	// Verify unified config
-	if c.Uint("vision.block_count") == 0 {
-		return nil, fmt.Errorf("non-unified vision model not supported")
-	}
	m := Model{
		BytePairEncoding: model.NewBytePairEncoding(
			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
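
The removed guard in New rejected checkpoints whose metadata lacks a vision block count (the "non-unified" case). A rough, self-contained sketch of that check, using a hypothetical map-backed stand-in for fs.Config:

package main

import (
	"errors"
	"fmt"
)

// Sketch only: config is an invented stand-in for fs.Config.
type config map[string]uint32

func (c config) Uint(key string) uint32 { return c[key] }

// validateVision mirrors the spirit of the deleted guard: require a non-zero
// vision block count before treating the checkpoint as a vision model.
func validateVision(c config) error {
	if c.Uint("vision.block_count") == 0 {
		return errors.New("non-unified vision model not supported")
	}
	return nil
}

func main() {
	fmt.Println(validateVision(config{"vision.block_count": 32})) // <nil>
	fmt.Println(validateVision(config{}))                         // error
}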
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
 	"slices"
 	"sync"
 	"time"
 
@@ -18,8 +17,7 @@ type ImageContext struct {
 	// mu is required to be held when generating embeddings or accessing the cache
 	mu sync.Mutex
 
-	clip   *llama.ClipContext
-	mllama *llama.MllamaContext
+	clip *llama.ClipContext
 
 	// cache of images to embeddings
 	images []imageCache
@@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
 	var c ImageContext
 	if arch == "clip" {
 		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
-	} else if arch == "mllama" {
-		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
 	} else {
 		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
 	}
@@ -58,9 +54,6 @@ func (c *ImageContext) Free(modelPath string) {
 	if c.clip != nil {
 		c.clip.Free()
 	}
-	if c.mllama != nil {
-		c.mllama.Free()
-	}
 }
 
 func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
@@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 
 	embed, err := c.findImage(hash)
 	if err != nil {
-		if c.mllama != nil {
-			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
-			if err != nil {
-				return nil, err
-			}
-		} else if c.clip != nil {
+		if c.clip != nil {
 			embed, err = c.clip.NewEmbed(llamaContext, data)
 			if err != nil {
 				return nil, err
@@ -109,29 +97,11 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int {
 	// and doesn't support more than a single image per request.
 	// The embeddings are large (100 MB), so allocating a big batch can fail
 	// on some systems
-	if c.mllama != nil {
-		return 1
-	}
-
 	return configuredBatchSize
 }
 
 func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
-	if c != nil && c.mllama != nil {
-		return c.mllama.EmbedSize(llamaContext)
-	} else {
-		return llamaContext.Model().NEmbd()
-	}
-}
-
-func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
-	if c == nil || c.mllama == nil {
-		return false
-	}
-
-	return slices.ContainsFunc(inputs, func(input input) bool {
-		return input.embed != nil
-	})
+	return llamaContext.Model().NEmbd()
 }
 
 type imageCache struct {
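
NewEmbed only computes an embedding when findImage misses in the per-context cache; the cache itself sits outside these hunks. A simplified, self-contained sketch of that lookup pattern, keyed with hash/fnv here rather than the hash/maphash the file imports, and without the mutex the real ImageContext documents above:

package main

import (
	"fmt"
	"hash/fnv"
)

// Sketch only: a much-simplified image -> embedding cache.
type embedCache struct {
	entries map[uint64][][]float32
}

func hashImage(data []byte) uint64 {
	h := fnv.New64a()
	h.Write(data)
	return h.Sum64()
}

// lookup returns a cached embedding for the image bytes, computing and storing
// it on a miss.
func (c *embedCache) lookup(data []byte, compute func() [][]float32) [][]float32 {
	key := hashImage(data)
	if e, ok := c.entries[key]; ok {
		return e // cache hit: reuse the previously computed embedding
	}
	e := compute()
	c.entries[key] = e
	return e
}

func main() {
	c := &embedCache{entries: map[uint64][][]float32{}}
	img := []byte{0x89, 0x50, 0x4e, 0x47}
	embed := c.lookup(img, func() [][]float32 { return [][]float32{{0.1, 0.2}} })
	fmt.Println(len(embed), len(embed[0])) // 1 2
}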
@@ -413,9 +413,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	if batch == nil {
 		if !embedding {
 			batch = tokenBatch
 		} else {
 			batch = embedBatch
-			seq.crossAttention = s.image.NeedCrossAttention(input)
 		}
 	} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
 		s.nextSeq = seqIdx
@@ -439,8 +436,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return nil
 	}
 
-	s.lc.SetCrossAttention(crossAttention)
-
 	err := s.lc.Decode(batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
@@ -621,8 +616,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
-
 	s.seqs[i] = seq
 	s.cond.Signal()
 	found = true
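
The removed runner plumbing toggled cross-attention before decoding whenever a sequence's inputs carried an image embedding (see the deleted NeedCrossAttention earlier in the diff). A minimal sketch of that predicate with stand-in types, not the runner's actual API:

package main

import "fmt"

// Sketch only: input is an invented stand-in for the runner's input type,
// which can hold either a token or an image embedding.
type input struct {
	token int
	embed []float32
}

// needCrossAttention reports whether any queued input carries an embedding.
func needCrossAttention(inputs ...input) bool {
	for _, in := range inputs {
		if in.embed != nil {
			return true
		}
	}
	return false
}

func main() {
	text := input{token: 42}
	img := input{embed: []float32{0.1, 0.2}}
	fmt.Println(needCrossAttention(text))      // false
	fmt.Println(needCrossAttention(text, img)) // true
}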
@@ -3,7 +3,6 @@ package server
 import (
 	"bytes"
 	"context"
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"log/slog"
@@ -11,7 +10,6 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/template"
 )
 
@@ -25,25 +23,14 @@ var errTooManyImages = errors.New("vision model only supports a single image per
 func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
 	var system []api.Message
 
-	isMllama := checkMllamaModelFamily(m)
-
 	var imageNumTokens int
 	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
-	if isMllama {
-		// Our mllama implementation packs all of the embeddings into a single token
-		imageNumTokens = 1
-	} else {
-		// Clip images are represented as 768 tokens, each an embedding
-		imageNumTokens = 768
-	}
+	// Clip images are represented as 768 tokens, each an embedding
+	imageNumTokens = 768
 
 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n; i >= 0; i-- {
-		if isMllama && len(msgs[i].Images) > 1 {
-			return "", nil, errTooManyImages
-		}
-
 		// always include the last message
 		if i == n {
 			continue
@@ -91,41 +78,9 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 		for _, i := range msg.Images {
 			var imgData llm.ImageData
 
-			if isMllama {
-				if len(m.ProjectorPaths) == 0 {
-					imgData = llm.ImageData{
-						ID:   len(images),
-						Data: i,
-					}
-				} else {
-					data, opts, err := mllama.Preprocess(bytes.NewReader(i))
-					if err != nil {
-						return "", nil, err
-					}
-
-					ar, ok := opts["aspectRatioIndex"].(int)
-					if !ok {
-						return "", nil, fmt.Errorf("missing aspect ratio for image")
-					}
-
-					buf := new(bytes.Buffer)
-					err = binary.Write(buf, binary.LittleEndian, data)
-					if err != nil {
-						return "", nil, err
-					}
-
-					imgData = llm.ImageData{
-						ID:            len(images),
-						Data:          buf.Bytes(),
-						AspectRatioID: ar,
-					}
-				}
-				imgPrompt = "<|image|>"
-			} else {
-				imgData = llm.ImageData{
-					ID:   len(images),
-					Data: i,
-				}
-			}
+			imgData = llm.ImageData{
+				ID:   len(images),
+				Data: i,
+			}
 
 			imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
@@ -148,12 +103,3 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 
 	return b.String(), images, nil
 }
-
-func checkMllamaModelFamily(m *Model) bool {
-	for _, arch := range m.Config.ModelFamilies {
-		if arch == "mllama" {
-			return true
-		}
-	}
-	return false
-}
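
chatPrompt budgets a fixed per-image token cost when trimming messages to the context window; with the mllama branch gone, the clip figure of 768 tokens per image is the only case left. A hedged sketch of how such a cost feeds a context check, with invented helper names:

package main

import "fmt"

// Sketch only: fold a fixed per-image token cost into a context-window check
// when deciding whether a set of trailing messages still fits.
func fitsContext(numCtx, textTokens, numImages, imageNumTokens int) bool {
	return textTokens+numImages*imageNumTokens <= numCtx
}

func main() {
	const imageNumTokens = 768 // each clip image is budgeted as 768 embedding tokens
	fmt.Println(fitsContext(4096, 1200, 2, imageNumTokens)) // true  (1200 + 1536 <= 4096)
	fmt.Println(fitsContext(2048, 1200, 2, imageNumTokens)) // false (1200 + 1536 >  2048)
}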
@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"cmp"
 	"context"
 	"encoding/binary"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -33,7 +32,6 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/server/internal/client/ollama"
 	"github.com/ollama/ollama/server/internal/registry"
@@ -204,38 +202,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}
 
-	isMllama := checkMllamaModelFamily(m)
-	if isMllama && len(req.Images) > 1 {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"})
-		return
-	}
-
 	images := make([]llm.ImageData, len(req.Images))
 	for i := range req.Images {
-		if isMllama && len(m.ProjectorPaths) > 0 {
-			data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
-			if err != nil {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			ar, ok := opts["aspectRatioIndex"].(int)
-			if !ok {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			buf := new(bytes.Buffer)
-			err = binary.Write(buf, binary.LittleEndian, data)
-			if err != nil {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
-		} else {
-			images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
-		}
+		images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
 	}
 
 	prompt := req.Prompt
@@ -267,9 +236,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 
 	for _, i := range images {
 		imgPrompt := ""
-		if isMllama {
-			imgPrompt = "<|image|>"
-		}
 		msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
 	}
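
With the preprocessing branch gone, GenerateHandler simply attaches the raw image bytes and references each one in the prompt as an [img-N] tag. A self-contained sketch of that pairing, using a local stand-in for llm.ImageData (the real type also carries the AspectRatioID field that is no longer set after this change):

package main

import "fmt"

// Sketch only: imageData is an invented stand-in for llm.ImageData.
type imageData struct {
	ID   int
	Data []byte
}

func main() {
	raw := [][]byte{{0x01}, {0x02}} // pretend these are two uploaded images
	images := make([]imageData, len(raw))

	var prompt string
	for i := range raw {
		images[i] = imageData{ID: i, Data: raw[i]}
		prompt += fmt.Sprintf("[img-%d]", images[i].ID)
	}

	fmt.Println(prompt, len(images)) // [img-0][img-1] 2
}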
@@ -8,6 +8,7 @@ import (
 	"os"
 	"reflect"
 	"runtime"
 	"slices"
 	"sort"
 	"strconv"
 	"strings"
@@ -132,11 +133,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				continue
 			}
 			numParallel := int(envconfig.NumParallel())
-			// TODO (jmorganca): mllama doesn't support parallel yet
-			// see https://github.com/ollama/ollama/issues/4165
-			if checkMllamaModelFamily(pending.model) && numParallel != 1 {
+			// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
+			// ref: https://github.com/ollama/ollama/issues/4165
+			if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
 				numParallel = 1
-				slog.Warn("mllama doesn't support parallel requests yet")
+				slog.Warn("mllama does not currently support parallel requests")
 			}
 
 			for {
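
The scheduler still clamps parallelism to 1 for mllama because its encoder cache cannot be shared across slots; only the detection changes, from the deleted checkMllamaModelFamily helper to slices.Contains on the model families. A small sketch of that clamp in isolation, with an invented helper name:

package main

import (
	"fmt"
	"slices"
)

// effectiveParallel clamps the requested parallelism to 1 for the mllama
// family, mirroring the scheduler change above in a standalone form.
func effectiveParallel(families []string, requested int) int {
	if slices.Contains(families, "mllama") && requested != 1 {
		// mllama's encoder cache cannot be shared across parallel slots
		return 1
	}
	return requested
}

func main() {
	fmt.Println(effectiveParallel([]string{"mllama"}, 4)) // 1
	fmt.Println(effectiveParallel([]string{"llama"}, 4))  // 4
}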