mirror of
https://github.com/ollama/ollama.git
synced 2025-05-10 18:06:33 +02:00
Revert "increase default context length to 4096 (#10364)"
This reverts commit 424f648632
.
This commit is contained in:
parent
5cfc1c39f3
commit
dd93e1af85
7 changed files with 12 additions and 49 deletions
|
@ -1407,7 +1407,6 @@ func NewCLI() *cobra.Command {
|
|||
envVars["OLLAMA_LLM_LIBRARY"],
|
||||
envVars["OLLAMA_GPU_OVERHEAD"],
|
||||
envVars["OLLAMA_LOAD_TIMEOUT"],
|
||||
envVars["OLLAMA_CONTEXT_LENGTH"],
|
||||
})
|
||||
default:
|
||||
appendEnvDocs(cmd, envs)
|
||||
|
|
|
@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
|
|||
|
||||
## How can I specify the context window size?
|
||||
|
||||
By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
|
||||
By default, Ollama uses a context window size of 2048 tokens.
|
||||
|
||||
This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
|
||||
|
||||
|
@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
|
|||
To change this when using `ollama run`, use `/set parameter`:
|
||||
|
||||
```shell
|
||||
/set parameter num_ctx 8192
|
||||
/set parameter num_ctx 4096
|
||||
```
|
||||
|
||||
When using the API, specify the `num_ctx` parameter:
|
||||
|
@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
|
|||
"model": "llama3.2",
|
||||
"prompt": "Why is the sky blue?",
|
||||
"options": {
|
||||
"num_ctx": 8192
|
||||
"num_ctx": 4096
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
|
|
@ -169,7 +169,7 @@ var (
|
|||
// Enable the new Ollama engine
|
||||
NewEngine = Bool("OLLAMA_NEW_ENGINE")
|
||||
// ContextLength sets the default context length
|
||||
ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
|
||||
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
|
||||
)
|
||||
|
||||
func String(s string) func() string {
|
||||
|
@ -227,20 +227,6 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
|
|||
}
|
||||
}
|
||||
|
||||
func Int64(key string, defaultValue int64) func() int64 {
|
||||
return func() int64 {
|
||||
if s := Var(key); s != "" {
|
||||
if n, err := strconv.ParseInt(s, 10, 64); err != nil {
|
||||
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
|
||||
} else {
|
||||
return n
|
||||
}
|
||||
}
|
||||
|
||||
return defaultValue
|
||||
}
|
||||
}
|
||||
|
||||
// Set aside VRAM per GPU
|
||||
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
|
||||
|
||||
|
@ -269,7 +255,7 @@ func AsMap() map[string]EnvVar {
|
|||
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
|
||||
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
|
||||
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
|
||||
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
|
||||
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
|
||||
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
|
||||
|
||||
// Informational
|
||||
|
|
|
@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestContextLength(t *testing.T) {
|
||||
cases := map[string]int64{
|
||||
"": -1,
|
||||
cases := map[string]uint{
|
||||
"": 2048,
|
||||
"4096": 4096,
|
||||
}
|
||||
|
||||
|
|
|
@ -299,9 +299,6 @@ func TestGenerateChat(t *testing.T) {
|
|||
{Role: "user", Content: "Hello!"},
|
||||
},
|
||||
Stream: &stream,
|
||||
Options: map[string]any{
|
||||
"num_ctx": 1024,
|
||||
},
|
||||
})
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
|
@ -324,9 +321,6 @@ func TestGenerateChat(t *testing.T) {
|
|||
{Role: "user", Content: "Hello!"},
|
||||
},
|
||||
Stream: &stream,
|
||||
Options: map[string]any{
|
||||
"num_ctx": 1024,
|
||||
},
|
||||
})
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
|
@ -350,9 +344,6 @@ func TestGenerateChat(t *testing.T) {
|
|||
{Role: "user", Content: "Help me write tests."},
|
||||
},
|
||||
Stream: &stream,
|
||||
Options: map[string]any{
|
||||
"num_ctx": 1024,
|
||||
},
|
||||
})
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
|
|
|
@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
|
|||
// Default automatic value for parallel setting
|
||||
// Model will still need to fit in VRAM. If this setting won't fit
|
||||
// we'll back off down to 1 to try to get it to fit
|
||||
var defaultParallel = 2
|
||||
var defaultParallel = 4
|
||||
|
||||
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
|
||||
|
||||
|
@ -81,6 +81,10 @@ func InitScheduler(ctx context.Context) *Scheduler {
|
|||
|
||||
// context must be canceled to decrement ref count and release the runner
|
||||
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
|
||||
req := &LlmRequest{
|
||||
ctx: c,
|
||||
model: model,
|
||||
|
@ -110,11 +114,6 @@ func (s *Scheduler) Run(ctx context.Context) {
|
|||
}()
|
||||
}
|
||||
|
||||
const (
|
||||
defaultContextLength = 4096
|
||||
smallGpuContextLength = 2048
|
||||
)
|
||||
|
||||
func (s *Scheduler) processPending(ctx context.Context) {
|
||||
for {
|
||||
select {
|
||||
|
@ -167,17 +166,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
|||
gpus = s.getGpuFn()
|
||||
}
|
||||
|
||||
if pending.origNumCtx == -1 {
|
||||
if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
|
||||
slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
|
||||
pending.opts.NumCtx = smallGpuContextLength
|
||||
pending.origNumCtx = smallGpuContextLength
|
||||
} else {
|
||||
pending.opts.NumCtx = defaultContextLength
|
||||
pending.origNumCtx = defaultContextLength
|
||||
}
|
||||
}
|
||||
|
||||
if envconfig.MaxRunners() <= 0 {
|
||||
// No user specified MaxRunners, so figure out what automatic setting to use
|
||||
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
|
||||
|
|
|
@ -148,7 +148,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
|
|||
successCh: make(chan *runnerRef, 1),
|
||||
errCh: make(chan error, 1),
|
||||
}
|
||||
b.req.opts.NumCtx = 4096
|
||||
b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
|
||||
return b
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue