Mirror of https://github.com/ollama/ollama.git, synced 2025-05-11 02:16:36 +02:00
Revert "increase default context length to 4096 (#10364)"
This reverts commit 424f648632
.
This commit is contained in:
parent
5cfc1c39f3
commit
dd93e1af85
7 changed files with 12 additions and 49 deletions
@@ -1407,7 +1407,6 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
-				envVars["OLLAMA_CONTEXT_LENGTH"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
 
 ## How can I specify the context window size?
 
-By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
+By default, Ollama uses a context window size of 2048 tokens.
 
 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
 
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 To change this when using `ollama run`, use `/set parameter`:
 
 ```shell
-/set parameter num_ctx 8192
+/set parameter num_ctx 4096
 ```
 
 When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
   "model": "llama3.2",
   "prompt": "Why is the sky blue?",
   "options": {
-    "num_ctx": 8192
+    "num_ctx": 4096
   }
 }'
 ```
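
The FAQ examples above use the CLI and curl; the same per-request override works from any HTTP client. A minimal Go sketch, assuming a local server on the default port 11434 and reusing the model, prompt, and `num_ctx` option from the FAQ example (`"stream": false` requests a single JSON response):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Mirror of the curl example: the per-request context window is set
	// through options.num_ctx; stream=false returns one JSON object.
	body, _ := json.Marshal(map[string]any{
		"model":  "llama3.2",
		"prompt": "Why is the sky blue?",
		"stream": false,
		"options": map[string]any{"num_ctx": 4096},
	})

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```
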
@@ -169,7 +169,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
+	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
 )
 
 func String(s string) func() string {
@@ -227,20 +227,6 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }
 
-func Int64(key string, defaultValue int64) func() int64 {
-	return func() int64 {
-		if s := Var(key); s != "" {
-			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
-				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
-			} else {
-				return n
-			}
-		}
-
-		return defaultValue
-	}
-}
-
 // Set aside VRAM per GPU
 var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 
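
The removed Int64 helper was only needed while ContextLength used -1 as an "unset" sentinel; with the default back to a plain 2048, the existing Uint helper suffices. For reference, that helper presumably follows the same shape as the removed code. A minimal sketch by analogy, assuming the package's `Var`, `log/slog`, and `strconv` are in scope (the real implementation may differ):

```go
// Sketch of the Uint-style parser that ContextLength now uses, written by
// analogy with the removed Int64 helper above (illustrative, not verbatim).
func Uint(key string, defaultValue uint) func() uint {
	return func() uint {
		if s := Var(key); s != "" {
			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
			} else {
				return uint(n)
			}
		}

		return defaultValue
	}
}
```
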
@@ -269,7 +255,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
+		"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
 		"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
 
 		// Informational
@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
 }
 
 func TestContextLength(t *testing.T) {
-	cases := map[string]int64{
-		"": -1,
+	cases := map[string]uint{
+		"": 2048,
 		"4096": 4096,
 	}
 
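
Only the case table is visible in this hunk; the loop that drives it sits outside the diff context. A plausible sketch of such a table-driven test, assuming the standard `t.Setenv` pattern rather than the file's exact code:

```go
func TestContextLength(t *testing.T) {
	cases := map[string]uint{
		"":     2048,
		"4096": 4096,
	}

	for k, v := range cases {
		t.Run(k, func(t *testing.T) {
			// t.Setenv restores the previous value when the subtest ends.
			t.Setenv("OLLAMA_CONTEXT_LENGTH", k)
			if got := ContextLength(); got != v {
				t.Errorf("OLLAMA_CONTEXT_LENGTH=%q: expected %d, got %d", k, v, got)
			}
		})
	}
}
```
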
@@ -299,9 +299,6 @@ func TestGenerateChat(t *testing.T) {
 			{Role: "user", Content: "Hello!"},
 		},
 		Stream: &stream,
-		Options: map[string]any{
-			"num_ctx": 1024,
-		},
 	})
 
 	if w.Code != http.StatusOK {
@@ -324,9 +321,6 @@ func TestGenerateChat(t *testing.T) {
 			{Role: "user", Content: "Hello!"},
 		},
 		Stream: &stream,
-		Options: map[string]any{
-			"num_ctx": 1024,
-		},
 	})
 
 	if w.Code != http.StatusOK {
@@ -350,9 +344,6 @@ func TestGenerateChat(t *testing.T) {
 			{Role: "user", Content: "Help me write tests."},
 		},
 		Stream: &stream,
-		Options: map[string]any{
-			"num_ctx": 1024,
-		},
 	})
 
 	if w.Code != http.StatusOK {
@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
 // Default automatic value for parallel setting
 // Model will still need to fit in VRAM. If this setting won't fit
 // we'll back off down to 1 to try to get it to fit
-var defaultParallel = 2
+var defaultParallel = 4
 
 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
 
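
The comment block describes the intent behind defaultParallel: start at the default and back off toward 1 until the model fits in VRAM. Purely as an illustration of that rule, with a hypothetical fit predicate standing in for the scheduler's real check:

```go
// pickParallel illustrates the back-off described in the comment above:
// start at defaultParallel and step down toward 1 until the model fits.
// fits is a hypothetical predicate, not the scheduler's actual VRAM check.
func pickParallel(fits func(n int) bool) int {
	n := defaultParallel // 4 after this revert
	for n > 1 && !fits(n) {
		n--
	}
	return n
}
```
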
@@ -81,6 +81,10 @@ func InitScheduler(ctx context.Context) *Scheduler {
 
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
+
 	req := &LlmRequest{
 		ctx: c,
 		model: model,
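
The comment on GetRunner notes that the caller's context controls the runner's lifetime, and the re-added clamp keeps `NumCtx` at a minimum of 4. A hypothetical caller fragment, with variable names and channel handling assumed rather than taken from the repository:

```go
// Hypothetical caller: the context passed to GetRunner must eventually be
// canceled so the runner's reference count is decremented and it is released.
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

successCh, errCh := s.GetRunner(ctx, model, opts, nil)
select {
case runner := <-successCh:
	_ = runner // issue requests against the loaded runner
case err := <-errCh:
	slog.Error("could not load runner", "error", err)
}
```
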
@@ -110,11 +114,6 @@ func (s *Scheduler) Run(ctx context.Context) {
 	}()
 }
 
-const (
-	defaultContextLength = 4096
-	smallGpuContextLength = 2048
-)
-
 func (s *Scheduler) processPending(ctx context.Context) {
 	for {
 		select {
@@ -167,17 +166,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				gpus = s.getGpuFn()
 			}
 
-			if pending.origNumCtx == -1 {
-				if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
-					slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
-					pending.opts.NumCtx = smallGpuContextLength
-					pending.origNumCtx = smallGpuContextLength
-				} else {
-					pending.opts.NumCtx = defaultContextLength
-					pending.origNumCtx = defaultContextLength
-				}
-			}
-
 			if envconfig.MaxRunners() <= 0 {
 				// No user specified MaxRunners, so figure out what automatic setting to use
 				// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
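
The removed block chose each request's default context window from the GPU topology. As a standalone illustration of that pre-revert decision rule, using a hypothetical type and helper for reference only:

```go
// gpuInfo is a minimal stand-in for the GPU description consulted by the
// removed block (hypothetical type, for illustration only).
type gpuInfo struct {
	Library     string
	TotalMemory uint64
}

// preRevertDefaultCtx mirrors the rule this commit removes: 2048 tokens on a
// single small (<= 4 GiB) non-CPU device, otherwise 4096 tokens.
func preRevertDefaultCtx(gpus []gpuInfo) int {
	if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
		return 2048 // smallGpuContextLength
	}
	return 4096 // defaultContextLength
}
```
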
@@ -148,7 +148,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		successCh: make(chan *runnerRef, 1),
 		errCh: make(chan error, 1),
 	}
-	b.req.opts.NumCtx = 4096
 	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return b
 }