Revert "increase default context length to 4096 (#10364)"

This reverts commit 424f648632.
Devon Rifkin 2025-04-28 16:54:11 -07:00 committed by GitHub
parent 5cfc1c39f3
commit dd93e1af85
7 changed files with 12 additions and 49 deletions
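
The user-visible effect of this revert is that num_ctx falls back to the previous server default instead of 4096. A client that depends on a specific context window should pin it per request rather than relying on the default. A minimal sketch using the Go API client; the model name and the 4096 value are placeholders, not part of this commit:

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.ChatRequest{
		Model:    "llama3", // placeholder model name
		Messages: []api.Message{{Role: "user", Content: "Hello!"}},
		// Pin the context window explicitly instead of relying on the
		// server default, which this revert changes back.
		Options: map[string]any{"num_ctx": 4096},
	}

	if err := client.Chat(context.Background(), req, func(r api.ChatResponse) error {
		fmt.Print(r.Message.Content)
		return nil
	}); err != nil {
		log.Fatal(err)
	}
}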


@@ -299,9 +299,6 @@ func TestGenerateChat(t *testing.T) {
 			{Role: "user", Content: "Hello!"},
 		},
 		Stream: &stream,
-		Options: map[string]any{
-			"num_ctx": 1024,
-		},
 	})
 	if w.Code != http.StatusOK {
@@ -324,9 +321,6 @@ func TestGenerateChat(t *testing.T) {
 			{Role: "user", Content: "Hello!"},
 		},
 		Stream: &stream,
-		Options: map[string]any{
-			"num_ctx": 1024,
-		},
 	})
 	if w.Code != http.StatusOK {
@@ -350,9 +344,6 @@ func TestGenerateChat(t *testing.T) {
 			{Role: "user", Content: "Help me write tests."},
 		},
 		Stream: &stream,
-		Options: map[string]any{
-			"num_ctx": 1024,
-		},
 	})
 	if w.Code != http.StatusOK {
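
All three hunks in this test file drop the same explicit Options map. The 1024-token pin appears to have been added alongside the 4096 default to keep the handler tests' context allocation small; with the default reverted, the requests can simply fall back to the server default again.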


@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
 // Default automatic value for parallel setting
 // Model will still need to fit in VRAM. If this setting won't fit
 // we'll back off down to 1 to try to get it to fit
-var defaultParallel = 2
+var defaultParallel = 4

 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
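
defaultParallel returns to 4 at the same time as the larger default window goes away. The two settings trade off against each other: each parallel slot needs its own context window backed by KV cache, which is presumably why the original change halved parallelism when it doubled the default context. A back-of-the-envelope sketch, assuming the pre-change default of 2048 and that total KV capacity scales with numCtx * numParallel; the helper is illustrative, not scheduler code:

// effectiveNumCtx is an illustrative helper, not from the codebase:
// the total context a runner must back with KV cache when each of
// numParallel slots gets its own numCtx-token window.
func effectiveNumCtx(numCtx, numParallel int) int {
	return numCtx * numParallel
}

// Before this revert: effectiveNumCtx(4096, 2) == 8192
// After this revert:  effectiveNumCtx(2048, 4) == 8192 (assuming the old 2048 default)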
@@ -81,6 +81,10 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
+
 	req := &LlmRequest{
 		ctx:   c,
 		model: model,
@@ -110,11 +114,6 @@ func (s *Scheduler) Run(ctx context.Context) {
 	}()
 }

-const (
-	defaultContextLength  = 4096
-	smallGpuContextLength = 2048
-)
-
 func (s *Scheduler) processPending(ctx context.Context) {
 	for {
 		select {
@@ -167,17 +166,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				gpus = s.getGpuFn()
 			}

-			if pending.origNumCtx == -1 {
-				if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
-					slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
-					pending.opts.NumCtx = smallGpuContextLength
-					pending.origNumCtx = smallGpuContextLength
-				} else {
-					pending.opts.NumCtx = defaultContextLength
-					pending.origNumCtx = defaultContextLength
-				}
-			}
-
 			if envconfig.MaxRunners() <= 0 {
 				// No user specified MaxRunners, so figure out what automatic setting to use
 				// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
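
The largest removed block is the small-GPU heuristic that #10364 introduced. Restated as a standalone function for readability; this is a sketch of the behavior the revert deletes, and the struct is a pared-down stand-in for the scheduler's GPU descriptor:

// gpuInfo is a minimal stand-in for the scheduler's GPU type, keeping
// only the fields the removed heuristic inspected.
type gpuInfo struct {
	Library     string // "cpu", "cuda", ...
	TotalMemory uint64 // bytes
}

const (
	defaultContextLength  = 4096
	smallGpuContextLength = 2048
)

// defaultContextLengthFor restates the removed hunk: a single non-CPU
// GPU with at most 4 GiB of total memory was limited to a 2048-token
// default window; everything else defaulted to 4096. After the revert,
// requests simply inherit the client's NumCtx (clamped to >= 4 in GetRunner).
func defaultContextLengthFor(gpus []gpuInfo) int {
	if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
		return smallGpuContextLength
	}
	return defaultContextLength
}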


@@ -148,7 +148,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		successCh: make(chan *runnerRef, 1),
 		errCh:     make(chan error, 1),
 	}
-	b.req.opts.NumCtx = 4096
 	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return b
 }
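
With the helper no longer forcing NumCtx to 4096, a scheduler test that depends on a particular context size would need to pin it on the scenario itself. Hypothetical usage; the model name and VRAM estimate are placeholders:

// hypothetical: pin the window explicitly now that the helper no longer sets it
b := newScenarioRequest(t, ctx, "model-a", 10*1024*1024, nil)
b.req.opts.NumCtx = 4096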