Mirror of https://github.com/ollama/ollama.git, synced 2025-05-11 02:16:36 +02:00
Revert "increase default context length to 4096 (#10364)"
This reverts commit 424f648632
.
This commit is contained in:
parent
5cfc1c39f3
commit
dd93e1af85
7 changed files with 12 additions and 49 deletions
@@ -1407,7 +1407,6 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
-				envVars["OLLAMA_CONTEXT_LENGTH"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
 
 ## How can I specify the context window size?
 
-By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
+By default, Ollama uses a context window size of 2048 tokens.
 
 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
 
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 To change this when using `ollama run`, use `/set parameter`:
 
 ```shell
-/set parameter num_ctx 8192
+/set parameter num_ctx 4096
 ```
 
 When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
   "model": "llama3.2",
   "prompt": "Why is the sky blue?",
   "options": {
-    "num_ctx": 8192
+    "num_ctx": 4096
   }
 }'
 ```
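
The FAQ examples above use the CLI and curl; the same per-request override works from any HTTP client. A minimal Go sketch, assuming a local server on the default port 11434 and reusing the model, prompt, and `num_ctx` option from the FAQ example (`"stream": false` requests a single JSON response):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Mirror of the curl example: the per-request context window is set
	// through options.num_ctx; stream=false returns one JSON object.
	body, _ := json.Marshal(map[string]any{
		"model":  "llama3.2",
		"prompt": "Why is the sky blue?",
		"stream": false,
		"options": map[string]any{"num_ctx": 4096},
	})

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```
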
@@ -169,7 +169,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
+	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
 )
 
 func String(s string) func() string {
@@ -227,20 +227,6 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }
 
-func Int64(key string, defaultValue int64) func() int64 {
-	return func() int64 {
-		if s := Var(key); s != "" {
-			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
-				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
-			} else {
-				return n
-			}
-		}
-
-		return defaultValue
-	}
-}
-
 // Set aside VRAM per GPU
 var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 
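
The removed Int64 helper was only needed while ContextLength used -1 as an "unset" sentinel; with the default back to a plain 2048, the existing Uint helper suffices. For reference, that helper presumably follows the same shape as the removed code. A minimal sketch by analogy, assuming the package's `Var`, `log/slog`, and `strconv` are in scope (the real implementation may differ):

```go
// Sketch of the Uint-style parser that ContextLength now uses, written by
// analogy with the removed Int64 helper above (illustrative, not verbatim).
func Uint(key string, defaultValue uint) func() uint {
	return func() uint {
		if s := Var(key); s != "" {
			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
			} else {
				return uint(n)
			}
		}

		return defaultValue
	}
}
```
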
@@ -269,7 +255,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
+		"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
 		"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
 
 		// Informational
@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
 }
 
 func TestContextLength(t *testing.T) {
-	cases := map[string]int64{
-		"": -1,
+	cases := map[string]uint{
+		"": 2048,
 		"4096": 4096,
 	}
 
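
Only the case table is visible in this hunk; the loop that drives it sits outside the diff context. A plausible sketch of such a table-driven test, assuming the standard `t.Setenv` pattern rather than the file's exact code:

```go
func TestContextLength(t *testing.T) {
	cases := map[string]uint{
		"":     2048,
		"4096": 4096,
	}

	for k, v := range cases {
		t.Run(k, func(t *testing.T) {
			// t.Setenv restores the previous value when the subtest ends.
			t.Setenv("OLLAMA_CONTEXT_LENGTH", k)
			if got := ContextLength(); got != v {
				t.Errorf("OLLAMA_CONTEXT_LENGTH=%q: expected %d, got %d", k, v, got)
			}
		})
	}
}
```
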
@@ -299,9 +299,6 @@ func TestGenerateChat(t *testing.T) {
 			{Role: "user", Content: "Hello!"},
 		},
 		Stream: &stream,
-		Options: map[string]any{
-			"num_ctx": 1024,
-		},
 	})
 
 	if w.Code != http.StatusOK {
@@ -324,9 +321,6 @@ func TestGenerateChat(t *testing.T) {
 			{Role: "user", Content: "Hello!"},
 		},
 		Stream: &stream,
-		Options: map[string]any{
-			"num_ctx": 1024,
-		},
 	})
 
 	if w.Code != http.StatusOK {
@@ -350,9 +344,6 @@ func TestGenerateChat(t *testing.T) {
 			{Role: "user", Content: "Help me write tests."},
 		},
 		Stream: &stream,
-		Options: map[string]any{
-			"num_ctx": 1024,
-		},
 	})
 
 	if w.Code != http.StatusOK {
@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
 // Default automatic value for parallel setting
 // Model will still need to fit in VRAM. If this setting won't fit
 // we'll back off down to 1 to try to get it to fit
-var defaultParallel = 2
+var defaultParallel = 4
 
 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
 
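
The comment block describes the intent behind defaultParallel: start at the default and back off toward 1 until the model fits in VRAM. Purely as an illustration of that rule, with a hypothetical fit predicate standing in for the scheduler's real check:

```go
// pickParallel illustrates the back-off described in the comment above:
// start at defaultParallel and step down toward 1 until the model fits.
// fits is a hypothetical predicate, not the scheduler's actual VRAM check.
func pickParallel(fits func(n int) bool) int {
	n := defaultParallel // 4 after this revert
	for n > 1 && !fits(n) {
		n--
	}
	return n
}
```
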
@@ -81,6 +81,10 @@ func InitScheduler(ctx context.Context) *Scheduler {
 
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
+
 	req := &LlmRequest{
 		ctx: c,
 		model: model,
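
The comment on GetRunner notes that the caller's context controls the runner's lifetime, and the re-added clamp keeps `NumCtx` at a minimum of 4. A hypothetical caller fragment, with variable names and channel handling assumed rather than taken from the repository:

```go
// Hypothetical caller: the context passed to GetRunner must eventually be
// canceled so the runner's reference count is decremented and it is released.
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

successCh, errCh := s.GetRunner(ctx, model, opts, nil)
select {
case runner := <-successCh:
	_ = runner // issue requests against the loaded runner
case err := <-errCh:
	slog.Error("could not load runner", "error", err)
}
```
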
@@ -110,11 +114,6 @@ func (s *Scheduler) Run(ctx context.Context) {
 	}()
 }
 
-const (
-	defaultContextLength = 4096
-	smallGpuContextLength = 2048
-)
-
 func (s *Scheduler) processPending(ctx context.Context) {
 	for {
 		select {
@@ -167,17 +166,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				gpus = s.getGpuFn()
 			}
 
-			if pending.origNumCtx == -1 {
-				if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
-					slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
-					pending.opts.NumCtx = smallGpuContextLength
-					pending.origNumCtx = smallGpuContextLength
-				} else {
-					pending.opts.NumCtx = defaultContextLength
-					pending.origNumCtx = defaultContextLength
-				}
-			}
-
 			if envconfig.MaxRunners() <= 0 {
 				// No user specified MaxRunners, so figure out what automatic setting to use
 				// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
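
The removed block chose each request's default context window from the GPU topology. As a standalone illustration of that pre-revert decision rule, using a hypothetical type and helper for reference only:

```go
// gpuInfo is a minimal stand-in for the GPU description consulted by the
// removed block (hypothetical type, for illustration only).
type gpuInfo struct {
	Library     string
	TotalMemory uint64
}

// preRevertDefaultCtx mirrors the rule this commit removes: 2048 tokens on a
// single small (<= 4 GiB) non-CPU device, otherwise 4096 tokens.
func preRevertDefaultCtx(gpus []gpuInfo) int {
	if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
		return 2048 // smallGpuContextLength
	}
	return 4096 // defaultContextLength
}
```
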
@@ -148,7 +148,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		successCh: make(chan *runnerRef, 1),
 		errCh: make(chan error, 1),
 	}
-	b.req.opts.NumCtx = 4096
 	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return b
 }