mirror of
https://github.com/ollama/ollama.git
synced 2025-05-11 10:26:53 +02:00
refined test timing
adjust timing on some tests so they don't timeout on small/slow GPUs
This commit is contained in:
parent
5e8ff556cb
commit
68dfc6236a
4 changed files with 59 additions and 34 deletions
|
@ -19,17 +19,19 @@ func TestMultiModelConcurrency(t *testing.T) {
|
|||
var (
|
||||
req = [2]api.GenerateRequest{
|
||||
{
|
||||
Model: "orca-mini",
|
||||
Prompt: "why is the ocean blue?",
|
||||
Stream: &stream,
|
||||
Model: "orca-mini",
|
||||
Prompt: "why is the ocean blue?",
|
||||
Stream: &stream,
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
Options: map[string]interface{}{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}, {
|
||||
Model: "tinydolphin",
|
||||
Prompt: "what is the origin of the us thanksgiving holiday?",
|
||||
Stream: &stream,
|
||||
Model: "tinydolphin",
|
||||
Prompt: "what is the origin of the us thanksgiving holiday?",
|
||||
Stream: &stream,
|
||||
KeepAlive: &api.Duration{Duration: 10 * time.Second},
|
||||
Options: map[string]interface{}{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
|
@ -43,7 +45,7 @@ func TestMultiModelConcurrency(t *testing.T) {
|
|||
)
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(req))
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
|
||||
defer cancel()
|
||||
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
|
@ -56,32 +58,46 @@ func TestMultiModelConcurrency(t *testing.T) {
|
|||
for i := 0; i < len(req); i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
DoGenerate(ctx, t, client, req[i], resp[i], 30*time.Second, 10*time.Second)
|
||||
DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
|
||||
req, resp := GenerateRequests()
|
||||
reqLimit := len(req)
|
||||
iterLimit := 5
|
||||
|
||||
vram := os.Getenv("OLLAMA_MAX_VRAM")
|
||||
if vram != "" {
|
||||
max, err := strconv.ParseUint(vram, 10, 64)
|
||||
require.NoError(t, err)
|
||||
// Don't hammer on small VRAM cards...
|
||||
if max < 4*1024*1024*1024 {
|
||||
reqLimit = min(reqLimit, 2)
|
||||
iterLimit = 2
|
||||
}
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
req, resp := GenerateRequests()
|
||||
// Get the server running (if applicable) warm the model up with a single initial request
|
||||
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)
|
||||
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(req))
|
||||
for i := 0; i < len(req); i++ {
|
||||
wg.Add(reqLimit)
|
||||
for i := 0; i < reqLimit; i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
for j := 0; j < 5; j++ {
|
||||
for j := 0; j < iterLimit; j++ {
|
||||
slog.Info("Starting", "req", i, "iter", j)
|
||||
// On slower GPUs it can take a while to process the 4 concurrent requests
|
||||
// On slower GPUs it can take a while to process the concurrent requests
|
||||
// so we allow a much longer initial timeout
|
||||
DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
|
||||
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue