next build (#8539)

* add build to .dockerignore * test: only build one arch * add build to .gitignore * fix ccache path * filter amdgpu targets * only filter if autodetecting * Don't clobber gpu list for default runner This ensures the GPU specific environment variables are set properly * explicitly set CXX compiler for HIP * Update build_windows.ps1 This isn't complete, but is close. Dependencies are missing, and it only builds the "default" preset. * build: add ollama subdir * add .git to .dockerignore * docs: update development.md * update build_darwin.sh * remove unused scripts * llm: add cwd and build/lib/ollama to library paths * default DYLD_LIBRARY_PATH to LD_LIBRARY_PATH in runner on macOS * add additional cmake output vars for msvc * interim edits to make server detection logic work with dll directories like lib/ollama/cuda_v12 * remove unncessary filepath.Dir, cleanup * add hardware-specific directory to path * use absolute server path * build: linux arm * cmake install targets * remove unused files * ml: visit each library path once * build: skip cpu variants on arm * build: install cpu targets * build: fix workflow * shorter names * fix rocblas install * docs: clean up development.md * consistent build dir removal in development.md * silence -Wimplicit-function-declaration build warnings in ggml-cpu * update readme * update development readme * llm: update library lookup logic now that there is one runner (#8587) * tweak development.md * update docs * add windows cuda/rocm tests --------- Co-authored-by: jmorganca <jmorganca@gmail.com> Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
2025-05-11 18:36:41 +02:00 · 2025-01-29 15:03:38 -08:00 · 2025-01-29 15:03:38 -08:00 · dcfb7a105c
commit dcfb7a105c
parent 2ef3c803a1
542 changed files with 5796 additions and 11469 deletions
--- a/llm/server.go
+++ b/llm/server.go
@ -29,7 +29,6 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/runners"
 )

 type LlamaServer interface {
@ -91,8 +90,6 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 // The gpu list must be a single family.
 func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
-	var cpuRunner string
-	var estimate MemoryEstimate
 	var systemTotalMemory uint64
 	var systemFreeMemory uint64
 	var systemSwapFreeMemory uint64
@ -107,12 +104,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	if opts.NumGPU == 0 {
 		gpus = discover.GetCPUInfo()
 	}
-	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		cpuRunner = runners.ServerForCpu()
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
-	} else {
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)

+	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+	if len(gpus) > 1 || gpus[0].Library != "cpu" {
 		switch {
 		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
 			// disable partial offloading when model is greater than total system memory as this
@ -120,7 +114,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			opts.NumGPU = 0
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
-			cpuRunner = runners.ServerForCpu()
 			gpus = discover.GetCPUInfo()
 		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 			opts.NumGPU = estimate.Layers
@ -140,36 +133,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter

 	estimate.log()

-	// Loop through potential servers
-	finalErr := errors.New("no suitable llama servers found")
-
-	availableServers := runners.GetAvailableServers()
-
-	var servers []string
-	if cpuRunner != "" {
-		servers = []string{cpuRunner}
-	} else {
-		servers = runners.ServersForGpu(gpus[0].RunnerName()) // All GPUs in the list are matching Library and Variant
-	}
-	demandLib := envconfig.LLMLibrary()
-	if demandLib != "" {
-		serverPath := availableServers[demandLib]
-		if serverPath == "" {
-			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
-		} else {
-			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
-			servers = []string{demandLib}
-			if strings.HasPrefix(demandLib, "cpu") || (!(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") && demandLib == runners.BuiltinName()) {
-				// Omit the GPU flag to silence the warning
-				opts.NumGPU = -1
-			}
-		}
-	}
-
-	if len(servers) == 0 {
-		return nil, fmt.Errorf("no servers found for %v", gpus)
-	}
-
 	params := []string{
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
@ -270,21 +233,49 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--multiuser-cache")
 	}

-	for i := range servers {
-		builtin := servers[i] == runners.BuiltinName()
-		server := availableServers[servers[i]]
-		if server == "" {
-			// Shouldn't happen
-			finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
-			slog.Error("server list inconsistent", "error", finalErr)
+	// get available libraries
+	if err != nil {
+		return nil, fmt.Errorf("could not get libollama dir: %w", err)
+	}
+
+	entries, err := os.ReadDir(discover.LibOllamaPath)
+	if err != nil {
+		return nil, fmt.Errorf("could not read libollama dir: %w", err)
+	}
+
+	libs := make(map[string]string)
+	for _, entry := range entries {
+		if entry.IsDir() {
+			libs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name())
+		}
+	}
+
+	lib := gpus[0].RunnerName()
+	requested := envconfig.LLMLibrary()
+	if libs[requested] != "" {
+		slog.Info("using requested gpu library", "requested", requested)
+		lib = requested
+	}
+
+	var compatible []string
+	for k := range libs {
+		// exact match first
+		if k == lib {
+			compatible = append([]string{k}, compatible...)
 			continue
 		}

-		if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) {
-			gpus = discover.GetCPUInfo()
+		// then match the family (e.g. 'cuda')
+		if strings.Split(k, "_")[0] == strings.Split(lib, "_")[0] {
+			compatible = append(compatible, k)
 		}
+	}
+	slog.Debug("compatible gpu libraries", "compatible", compatible)

-		// Find an availableServers  port, retry on each iteration in case the failure was a port conflict race
+	// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
+	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
+	// without any LD_LIBRARY_PATH flags
+	for {
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
 			var l *net.TCPListener
@ -305,25 +296,45 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		if runtime.GOOS == "windows" {
 			pathEnv = "PATH"
 		}
-		// Start with the server directory for the LD_LIBRARY_PATH/PATH
-		libraryPaths := []string{filepath.Dir(server)}

+		var libraryPaths []string
 		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-			// favor our bundled library dependencies over system libraries
 			libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
 		}

+		if len(compatible) > 0 {
+			c := compatible[0]
+			if libpath, ok := libs[c]; ok {
+				slog.Debug("adding gpu library", "path", libpath)
+				libraryPaths = append(libraryPaths, libpath)
+			}
+		}
+
 		// Note: we always put the dependency path first
 		// since this was the exact version we compiled/linked against
 		if gpus[0].DependencyPath != nil {
+			slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
 			// assume gpus from the same library have the same dependency path
 			libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
 		}

+		// finally, add the root library path
+		libraryPaths = append(libraryPaths, discover.LibOllamaPath)
+
+		exe, err := os.Executable()
+		if err != nil {
+			return nil, fmt.Errorf("unable to lookup executable path: %w", err)
+		}
+
+		exe, err = filepath.EvalSymlinks(exe)
+		if err != nil {
+			return nil, fmt.Errorf("unable to evaluate symlinks for executable path: %w", err)
+		}
+
 		// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
 		s := &llmServer{
 			port:        port,
-			cmd:         exec.Command(server, finalParams...),
+			cmd:         exec.Command(exe, finalParams...),
 			status:      NewStatusWriter(os.Stderr),
 			options:     opts,
 			modelPath:   model,
@ -394,17 +405,17 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		}

 		if err = s.cmd.Start(); err != nil {
-			// Detect permission denied and augment the message about noexec
-			if errors.Is(err, os.ErrPermission) {
-				finalErr = fmt.Errorf("unable to start server %w.  %s may have noexec set.  Set OLLAMA_TMPDIR for server to a writable executable directory", err, server)
-				continue
-			}
-			msg := ""
+			var msg string
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
 			}
-			err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
-			finalErr = err
+			err := fmt.Errorf("error starting runner: %v %s", err, msg)
+			if len(compatible) == 0 {
+				return nil, err
+			}
+
+			slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible)
+			compatible = compatible[1:]
 			continue
 		}

@ -413,7 +424,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			err := s.cmd.Wait()
 			// Favor a more detailed message over the process exit status
 			if err != nil && s.status != nil && s.status.LastErrMsg != "" {
-				slog.Debug("llama runner terminated", "error", err)
+				slog.Error("llama runner terminated", "error", err)
 				if strings.Contains(s.status.LastErrMsg, "unknown model") {
 					s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
 				}
@ -425,9 +436,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter

 		return s, nil
 	}
-
-	slog.Error("unable to load any llama server", "error", finalErr)
-	return nil, finalErr
 }

 type ServerStatus int