Mirror of https://github.com/ollama/ollama.git (synced 2025-05-11 10:26:53 +02:00)
remove mllama integration, use ollama engine

parent f8586c6b2b
commit 54d47159f7

7 changed files with 14 additions and 200 deletions
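In short, the runner and server stop special-casing the mllama model family: the cgo MllamaContext bindings, the mllama.Preprocess call path, and the cross-attention plumbing are removed, and image payloads are handed to the engine unmodified. A minimal sketch of the resulting server-side handling, assuming only the llm.ImageData type and its ID/Data fields as they appear in the diff below (collectImages is a hypothetical helper for illustration, not code from this commit):

package example

import "github.com/ollama/ollama/llm"

// collectImages passes request images through to the runner unchanged;
// no mllama-specific preprocessing or AspectRatioID is attached anymore.
func collectImages(reqImages [][]byte) []llm.ImageData {
    images := make([]llm.ImageData, len(reqImages))
    for i := range reqImages {
        images[i] = llm.ImageData{ID: i, Data: reqImages[i]}
    }
    return images
}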
@@ -509,63 +509,6 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
 	return embed, nil
 }
 
-type MllamaContext struct {
-	c *C.struct_mllama_ctx
-}
-
-func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
-	mp := C.CString(modelPath)
-	defer C.free(unsafe.Pointer(mp))
-	c := C.mllama_model_load(mp, 1)
-	if c == nil {
-		return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
-	}
-
-	projEmbedSize := int(C.mllama_n_embd(c))
-	modelEmbedSize := llamaContext.Model().NEmbd()
-	if projEmbedSize != modelEmbedSize {
-		return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
-	}
-
-	return &MllamaContext{c: c}, nil
-}
-
-func (m *MllamaContext) Free() {
-	C.mllama_free(m.c)
-}
-
-func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
-	img := C.mllama_image_init()
-	defer C.mllama_image_free(img)
-
-	ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
-	if !ok {
-		return nil, errors.New("unable to load mllama image data")
-	}
-
-	rows := make([]float32, m.EmbedSize(llamaContext))
-	ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
-	if !ok {
-		return nil, errors.New("unable to make mllama embedding from image")
-	}
-
-	embed := make([][]float32, 1)
-	embed[0] = rows
-
-	return embed, nil
-}
-
-func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
-	numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
-	numEmbed := llamaContext.Model().NEmbd()
-
-	return numTokens * numEmbed
-}
-
-func (c *Context) SetCrossAttention(state bool) {
-	C.llama_set_cross_attention(c.c, C.bool(state))
-}
-
 func (c *Context) Synchronize() {
 	C.llama_synchronize(c.c)
 }

@@ -3,7 +3,6 @@ package mllama
 import (
 	"bytes"
 	"encoding/binary"
-	"fmt"
 	"hash/fnv"
 	"image"
 	"slices"
@@ -34,10 +33,6 @@ const (
 )
 
 func New(c fs.Config) (model.Model, error) {
-	// Verify unified config
-	if c.Uint("vision.block_count") == 0 {
-		return nil, fmt.Errorf("non-unified vision model not supported")
-	}
 	m := Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),

@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
-	"slices"
 	"sync"
 	"time"
 
@@ -18,8 +17,7 @@ type ImageContext struct {
 	// mu is required to be held when generating embeddings or accessing the cache
 	mu sync.Mutex
 
 	clip *llama.ClipContext
-	mllama *llama.MllamaContext
 
 	// cache of images to embeddings
 	images []imageCache
@@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
 	var c ImageContext
 	if arch == "clip" {
 		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
-	} else if arch == "mllama" {
-		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
 	} else {
 		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
 	}
@@ -58,9 +54,6 @@ func (c *ImageContext) Free(modelPath string) {
 	if c.clip != nil {
 		c.clip.Free()
 	}
-	if c.mllama != nil {
-		c.mllama.Free()
-	}
 }
 
 func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
@@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 
 	embed, err := c.findImage(hash)
 	if err != nil {
-		if c.mllama != nil {
-			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
-			if err != nil {
-				return nil, err
-			}
-		} else if c.clip != nil {
+		if c.clip != nil {
 			embed, err = c.clip.NewEmbed(llamaContext, data)
 			if err != nil {
 				return nil, err
@@ -109,29 +97,11 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int {
 	// and doesn't support more than a single image per request.
 	// The embeddings are large (100 MB), so allocating a big batch can fail
 	// on some systems
-	if c.mllama != nil {
-		return 1
-	}
-
 	return configuredBatchSize
 }
 
 func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
-	if c != nil && c.mllama != nil {
-		return c.mllama.EmbedSize(llamaContext)
-	} else {
-		return llamaContext.Model().NEmbd()
-	}
-}
-
-func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
-	if c == nil || c.mllama == nil {
-		return false
-	}
-
-	return slices.ContainsFunc(inputs, func(input input) bool {
-		return input.embed != nil
-	})
+	return llamaContext.Model().NEmbd()
 }
 
 type imageCache struct {

@@ -413,9 +413,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		if batch == nil {
 			if !embedding {
 				batch = tokenBatch
-			} else {
-				batch = embedBatch
-				seq.crossAttention = s.image.NeedCrossAttention(input)
 			}
 		} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
 			s.nextSeq = seqIdx
@@ -439,8 +436,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return nil
 	}
 
-	s.lc.SetCrossAttention(crossAttention)
-
 	err := s.lc.Decode(batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
@@ -621,8 +616,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		}
 
-		seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
-
 		s.seqs[i] = seq
 		s.cond.Signal()
 		found = true

@@ -3,7 +3,6 @@ package server
 import (
 	"bytes"
 	"context"
-	"encoding/binary"
 	"errors"
 	"fmt"
 	"log/slog"
@@ -11,7 +10,6 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/template"
 )
 
@@ -25,25 +23,14 @@ var errTooManyImages = errors.New("vision model only supports a single image per
 func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
 	var system []api.Message
 
-	isMllama := checkMllamaModelFamily(m)
-
 	var imageNumTokens int
 	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
-	if isMllama {
-		// Our mllama implementation packs all of the embeddings into a single token
-		imageNumTokens = 1
-	} else {
-		// Clip images are represented as 768 tokens, each an embedding
-		imageNumTokens = 768
-	}
+	// Clip images are represented as 768 tokens, each an embedding
+	imageNumTokens = 768
 
 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n; i >= 0; i-- {
-		if isMllama && len(msgs[i].Images) > 1 {
-			return "", nil, errTooManyImages
-		}
-
 		// always include the last message
 		if i == n {
 			continue
@@ -91,41 +78,9 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 		for _, i := range msg.Images {
 			var imgData llm.ImageData
 
-			if isMllama {
-				if len(m.ProjectorPaths) == 0 {
-					imgData = llm.ImageData{
-						ID:   len(images),
-						Data: i,
-					}
-				} else {
-					data, opts, err := mllama.Preprocess(bytes.NewReader(i))
-					if err != nil {
-						return "", nil, err
-					}
-
-					buf := new(bytes.Buffer)
-					err = binary.Write(buf, binary.LittleEndian, data)
-					if err != nil {
-						return "", nil, err
-					}
-
-					ar, ok := opts["aspectRatioIndex"].(int)
-					if !ok {
-						return "", nil, fmt.Errorf("missing aspect ratio for image")
-					}
-
-					imgData = llm.ImageData{
-						ID:            len(images),
-						Data:          buf.Bytes(),
-						AspectRatioID: ar,
-					}
-				}
-				imgPrompt = "<|image|>"
-			} else {
-				imgData = llm.ImageData{
-					ID:   len(images),
-					Data: i,
-				}
-			}
+			imgData = llm.ImageData{
+				ID:   len(images),
+				Data: i,
+			}
 
 			imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
@@ -148,12 +103,3 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 
 	return b.String(), images, nil
 }
-
-func checkMllamaModelFamily(m *Model) bool {
-	for _, arch := range m.Config.ModelFamilies {
-		if arch == "mllama" {
-			return true
-		}
-	}
-	return false
-}

@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"cmp"
 	"context"
-	"encoding/binary"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -33,7 +32,6 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/server/internal/client/ollama"
 	"github.com/ollama/ollama/server/internal/registry"
@@ -204,38 +202,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}
 
-	isMllama := checkMllamaModelFamily(m)
-	if isMllama && len(req.Images) > 1 {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"})
-		return
-	}
-
 	images := make([]llm.ImageData, len(req.Images))
 	for i := range req.Images {
-		if isMllama && len(m.ProjectorPaths) > 0 {
-			data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
-			if err != nil {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			ar, ok := opts["aspectRatioIndex"].(int)
-			if !ok {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			buf := new(bytes.Buffer)
-			err = binary.Write(buf, binary.LittleEndian, data)
-			if err != nil {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
-		} else {
-			images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
-		}
+		images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
 	}
 
 	prompt := req.Prompt
@@ -267,9 +236,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 
 	for _, i := range images {
 		imgPrompt := ""
-		if isMllama {
-			imgPrompt = "<|image|>"
-		}
 		msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
 	}
 

@@ -8,6 +8,7 @@ import (
 	"os"
 	"reflect"
 	"runtime"
+	"slices"
 	"sort"
 	"strconv"
 	"strings"
@@ -132,11 +133,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				continue
 			}
 			numParallel := int(envconfig.NumParallel())
-			// TODO (jmorganca): mllama doesn't support parallel yet
-			// see https://github.com/ollama/ollama/issues/4165
-			if checkMllamaModelFamily(pending.model) && numParallel != 1 {
+			// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
+			// ref: https://github.com/ollama/ollama/issues/4165
+			if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
 				numParallel = 1
-				slog.Warn("mllama doesn't support parallel requests yet")
+				slog.Warn("mllama does not currently support parallel requests")
 			}
 
 			for {
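The scheduler hunk above keeps the single-request clamp for mllama but drops the checkMllamaModelFamily helper in favor of a direct slices.Contains check on the model family list. A minimal self-contained sketch of that guard, assuming only the family slice and warning message shown in the diff (clampParallel is a hypothetical helper for illustration, not code from this commit):

package example

import (
    "log/slog"
    "slices"
)

// clampParallel forces the effective parallelism to 1 for mllama models,
// which use an encoder cache that cannot be shared across parallel sequences.
func clampParallel(families []string, numParallel int) int {
    if slices.Contains(families, "mllama") && numParallel != 1 {
        slog.Warn("mllama does not currently support parallel requests")
        return 1
    }
    return numParallel
}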