mirror of
https://github.com/ollama/ollama.git
synced 2025-05-11 10:26:53 +02:00
parent
7d6eb0d4c3
commit
05cd82ef94
33 changed files with 94 additions and 94 deletions
|
@ -7,13 +7,13 @@ import (
|
|||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
)
|
||||
|
||||
// This algorithm looks for a complete fit to determine if we need to unload other models
|
||||
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
|
||||
func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
|
||||
// Split up the GPUs by type and try them
|
||||
var estimatedVRAM uint64
|
||||
for _, gpus := range allGpus.ByLibrary() {
|
||||
|
@ -67,7 +67,7 @@ type MemoryEstimate struct {
|
|||
|
||||
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
|
||||
// The GPUs provided must all be the same Library
|
||||
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
|
||||
func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
|
||||
// Graph size for a partial offload, applies to all GPUs
|
||||
var graphPartialOffload uint64
|
||||
|
||||
|
@ -157,7 +157,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
|||
gpuAllocations := make([]uint64, len(gpus))
|
||||
type gs struct {
|
||||
i int
|
||||
g *gpu.GpuInfo
|
||||
g *discover.GpuInfo
|
||||
}
|
||||
gpusWithSpace := []gs{}
|
||||
for i := range gpus {
|
||||
|
|
|
@ -10,7 +10,7 @@ import (
|
|||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/discover"
|
||||
)
|
||||
|
||||
func TestEstimateGPULayers(t *testing.T) {
|
||||
|
@ -50,7 +50,7 @@ func TestEstimateGPULayers(t *testing.T) {
|
|||
}
|
||||
|
||||
// Simple CPU scenario
|
||||
gpus := []gpu.GpuInfo{
|
||||
gpus := []discover.GpuInfo{
|
||||
{
|
||||
Library: "cpu",
|
||||
},
|
||||
|
@ -72,7 +72,7 @@ func TestEstimateGPULayers(t *testing.T) {
|
|||
|
||||
// Dual CUDA scenario with assymetry
|
||||
gpuMinimumMemory := uint64(2048)
|
||||
gpus = []gpu.GpuInfo{
|
||||
gpus = []discover.GpuInfo{
|
||||
{
|
||||
Library: "cuda",
|
||||
MinimumMemory: gpuMinimumMemory,
|
||||
|
|
|
@ -26,9 +26,9 @@ import (
|
|||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/build"
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/llama"
|
||||
"github.com/ollama/ollama/runners"
|
||||
)
|
||||
|
@ -61,8 +61,8 @@ type llmServer struct {
|
|||
estimate MemoryEstimate
|
||||
totalLayers uint64
|
||||
// gpuCount int
|
||||
gpus gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
|
||||
loadDuration time.Duration // Record how long it took the model to load
|
||||
gpus discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
|
||||
loadDuration time.Duration // Record how long it took the model to load
|
||||
loadProgress float32
|
||||
|
||||
sem *semaphore.Weighted
|
||||
|
@ -90,7 +90,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
|
|||
|
||||
// NewLlamaServer will run a server for the given GPUs
|
||||
// The gpu list must be a single family.
|
||||
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
|
||||
func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
|
||||
var err error
|
||||
var cpuRunner string
|
||||
var estimate MemoryEstimate
|
||||
|
@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
var systemFreeMemory uint64
|
||||
var systemSwapFreeMemory uint64
|
||||
|
||||
systemInfo := gpu.GetSystemInfo()
|
||||
systemInfo := discover.GetSystemInfo()
|
||||
systemTotalMemory = systemInfo.System.TotalMemory
|
||||
systemFreeMemory = systemInfo.System.FreeMemory
|
||||
systemSwapFreeMemory = systemInfo.System.FreeSwap
|
||||
|
@ -106,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
|
||||
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
|
||||
if opts.NumGPU == 0 {
|
||||
gpus = gpu.GetCPUInfo()
|
||||
gpus = discover.GetCPUInfo()
|
||||
}
|
||||
if len(gpus) == 1 && gpus[0].Library == "cpu" {
|
||||
cpuRunner = runners.ServerForCpu()
|
||||
|
@ -122,7 +122,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
case gpus[0].Library != "metal" && estimate.Layers == 0:
|
||||
// Don't bother loading into the GPU if no layers can fit
|
||||
cpuRunner = runners.ServerForCpu()
|
||||
gpus = gpu.GetCPUInfo()
|
||||
gpus = discover.GetCPUInfo()
|
||||
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
|
||||
opts.NumGPU = estimate.Layers
|
||||
}
|
||||
|
@ -281,7 +281,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
}
|
||||
|
||||
if strings.HasPrefix(servers[i], "cpu") {
|
||||
gpus = gpu.GetCPUInfo()
|
||||
gpus = discover.GetCPUInfo()
|
||||
}
|
||||
|
||||
// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue