diff --git a/cmd/cmd.go b/cmd/cmd.go
index befe578d6..79ff87ac8 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1407,7 +1407,6 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
-				envVars["OLLAMA_CONTEXT_LENGTH"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
diff --git a/docs/faq.md b/docs/faq.md
index 327afc6e5..f418da47f 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
 
 ## How can I specify the context window size?
 
-By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
+By default, Ollama uses a context window size of 2048 tokens.
 
 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
 
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 To change this when using `ollama run`, use `/set parameter`:
 
 ```shell
-/set parameter num_ctx 8192
+/set parameter num_ctx 4096
 ```
 
 When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
   "model": "llama3.2",
   "prompt": "Why is the sky blue?",
   "options": {
-    "num_ctx": 8192
+    "num_ctx": 4096
   }
 }'
 ```
diff --git a/envconfig/config.go b/envconfig/config.go
index fcb0a6947..fc702198f 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -169,7 +169,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
+	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
 )
 
 func String(s string) func() string {
@@ -227,20 +227,6 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }
 
-func Int64(key string, defaultValue int64) func() int64 {
-	return func() int64 {
-		if s := Var(key); s != "" {
-			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
-				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
-			} else {
-				return n
-			}
-		}
-
-		return defaultValue
-	}
-}
-
 // Set aside VRAM per GPU
 var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 
@@ -269,7 +255,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
+		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
 		"OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
 
 		// Informational
diff --git a/envconfig/config_test.go b/envconfig/config_test.go
index 72bfb4df5..5694eb8a3 100644
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
 }
 
 func TestContextLength(t *testing.T) {
-	cases := map[string]int64{
-		"":     -1,
+	cases := map[string]uint{
+		"":     2048,
 		"4096": 4096,
 	}
 
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index dd77b574a..56121d41b 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -299,9 +299,6 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Hello!"},
 			},
 			Stream: &stream,
-			Options: map[string]any{
-				"num_ctx": 1024,
-			},
 		})
 
 		if w.Code != http.StatusOK {
@@ -324,9 +321,6 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Hello!"},
 			},
 			Stream: &stream,
-			Options: map[string]any{
-				"num_ctx": 1024,
-			},
 		})
 
 		if w.Code != http.StatusOK {
@@ -350,9 +344,6 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Help me write tests."},
 			},
 			Stream: &stream,
-			Options: map[string]any{
-				"num_ctx": 1024,
-			},
 		})
 
 		if w.Code != http.StatusOK {
diff --git a/server/sched.go b/server/sched.go
index d5b19fbfd..f3978796c 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
 // Default automatic value for parallel setting
 // Model will still need to fit in VRAM. If this setting won't fit
 // we'll back off down to 1 to try to get it to fit
-var defaultParallel = 2
+var defaultParallel = 4
 
 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
 
@@ -81,6 +81,10 @@ func InitScheduler(ctx context.Context) *Scheduler {
 
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
+
 	req := &LlmRequest{
 		ctx:             c,
 		model:           model,
@@ -110,11 +114,6 @@ func (s *Scheduler) Run(ctx context.Context) {
 	}()
 }
 
-const (
-	defaultContextLength  = 4096
-	smallGpuContextLength = 2048
-)
-
 func (s *Scheduler) processPending(ctx context.Context) {
 	for {
 		select {
@@ -167,17 +166,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						gpus = s.getGpuFn()
 					}
 
-					if pending.origNumCtx == -1 {
-						if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
-							slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
-							pending.opts.NumCtx = smallGpuContextLength
-							pending.origNumCtx = smallGpuContextLength
-						} else {
-							pending.opts.NumCtx = defaultContextLength
-							pending.origNumCtx = defaultContextLength
-						}
-					}
-
 					if envconfig.MaxRunners() <= 0 {
 						// No user specified MaxRunners, so figure out what automatic setting to use
 						// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
diff --git a/server/sched_test.go b/server/sched_test.go
index 1b620329c..274e18cec 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -148,7 +148,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
-	b.req.opts.NumCtx = 4096
 	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return b
 }