Switch back to subprocessing for llama.cpp

This should resolve a number of memory leak and stability defects by allowing
us to isolate llama.cpp in a separate process, shut it down when idle, and
gracefully restart it if it runs into problems. This also serves as a first
step toward running multiple copies to support multiple models concurrently.
Daniel Hiltgen, 2024-03-14 10:24:13 -07:00
parent 3b6a9154dd
commit 58d95cc9bd
35 changed files with 1416 additions and 1910 deletions
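The hunks below are from the routes test. The broader lifecycle the commit message describes (spawn llama.cpp out of process, shut it down when idle, restart it on failure) could look roughly like the Go sketch that follows. Every name in it (llamaSubprocess, ensureRunning, the binary path, the five-minute idle window) is a hypothetical illustration, not the actual ollama API.

// Minimal sketch of the lifecycle described above, under assumed names:
// run a llama.cpp server binary as a child process, kill it after an idle
// period, and let the next request restart it transparently.
package main

import (
	"log"
	"os/exec"
	"sync"
	"time"
)

type llamaSubprocess struct {
	mu      sync.Mutex
	binPath string
	cmd     *exec.Cmd
	idle    *time.Timer
}

// ensureRunning starts the child process if needed and pushes back the
// idle-shutdown timer; callers would invoke it before every request.
func (s *llamaSubprocess) ensureRunning() error {
	s.mu.Lock()
	defer s.mu.Unlock()

	if s.cmd == nil {
		cmd := exec.Command(s.binPath) // hypothetical llama.cpp server binary
		if err := cmd.Start(); err != nil {
			return err
		}
		s.cmd = cmd
		// If the child dies on its own, forget it so the next call to
		// ensureRunning restarts it instead of failing.
		go func() {
			_ = cmd.Wait()
			s.mu.Lock()
			if s.cmd == cmd {
				s.cmd = nil
			}
			s.mu.Unlock()
		}()
	}

	// Reclaim memory by stopping the child after five idle minutes.
	if s.idle != nil {
		s.idle.Stop()
	}
	s.idle = time.AfterFunc(5*time.Minute, s.shutdown)
	return nil
}

func (s *llamaSubprocess) shutdown() {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.cmd != nil && s.cmd.Process != nil {
		_ = s.cmd.Process.Kill()
		s.cmd = nil
	}
}

func main() {
	s := &llamaSubprocess{binPath: "/usr/local/bin/llama-server"} // assumed path
	if err := s.ensureRunning(); err != nil {
		log.Fatal(err)
	}
	log.Println("llama.cpp child started; it would be stopped after 5 idle minutes")
}

In a real server an HTTP or RPC client layer would sit in front of ensureRunning; the point is only that crash recovery and idle shutdown fall out of ordinary process management once llama.cpp lives outside the Go process.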


@@ -17,7 +17,6 @@ import (
 	"github.com/stretchr/testify/assert"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
 )
@@ -211,7 +210,7 @@ func Test_Routes(t *testing.T) {
 		},
 	}
 
-	s := Server{}
+	s := &Server{}
 	router := s.GenerateRoutes()
 
 	httpSrv := httptest.NewServer(router)
@@ -242,27 +241,3 @@ func Test_Routes(t *testing.T) {
 		}
 	}
 }
-
-type MockLLM struct {
-	encoding []int
-}
-
-func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
-	return nil
-}
-
-func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
-	return llm.encoding, nil
-}
-
-func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
-	return "", nil
-}
-
-func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
-	return []float64{}, nil
-}
-
-func (llm *MockLLM) Close() {
-	// do nothing
-}