mirror of
https://github.com/ollama/ollama.git
synced 2025-05-11 18:36:41 +02:00
Detect AMD GPU info via sysfs and block old cards
This wires up some new logic to start using sysfs to discover AMD GPU information and detects old cards we can't yet support so we can fallback to CPU mode.
This commit is contained in:
parent
1c8435ffa9
commit
6d84f07505
5 changed files with 151 additions and 34 deletions
86
gpu/gpu.go
86
gpu/gpu.go
|
@ -149,43 +149,63 @@ func GetGPUInfo() GpuInfo {
|
|||
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||
}
|
||||
}
|
||||
} else if gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
||||
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
||||
if memInfo.err != nil {
|
||||
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
|
||||
// Only one GPU detected and it appears to be an integrated GPU - skip it
|
||||
slog.Info("ROCm unsupported integrated GPU detected")
|
||||
} else if memInfo.count > 0 {
|
||||
if memInfo.igpu_index >= 0 {
|
||||
// We have multiple GPUs reported, and one of them is an integrated GPU
|
||||
// so we have to set the env var to bypass it
|
||||
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
|
||||
val := os.Getenv("ROCR_VISIBLE_DEVICES")
|
||||
if val == "" {
|
||||
devices := []string{}
|
||||
for i := 0; i < int(memInfo.count); i++ {
|
||||
if i == int(memInfo.igpu_index) {
|
||||
continue
|
||||
} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
||||
ver, err := AMDDriverVersion()
|
||||
if err == nil {
|
||||
slog.Info("AMD Driver: " + ver)
|
||||
}
|
||||
gfx := AMDGFXVersions()
|
||||
tooOld := false
|
||||
for _, v := range gfx {
|
||||
if v.Major < 9 {
|
||||
slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
|
||||
tooOld = true
|
||||
break
|
||||
}
|
||||
|
||||
// TODO - remap gfx strings for unsupporetd minor/patch versions to supported for the same major
|
||||
// e.g. gfx1034 works if we map it to gfx1030 at runtime
|
||||
|
||||
}
|
||||
if !tooOld {
|
||||
// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
|
||||
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
||||
if memInfo.err != nil {
|
||||
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
|
||||
// Only one GPU detected and it appears to be an integrated GPU - skip it
|
||||
slog.Info("ROCm unsupported integrated GPU detected")
|
||||
} else if memInfo.count > 0 {
|
||||
if memInfo.igpu_index >= 0 {
|
||||
// We have multiple GPUs reported, and one of them is an integrated GPU
|
||||
// so we have to set the env var to bypass it
|
||||
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
|
||||
val := os.Getenv("ROCR_VISIBLE_DEVICES")
|
||||
if val == "" {
|
||||
devices := []string{}
|
||||
for i := 0; i < int(memInfo.count); i++ {
|
||||
if i == int(memInfo.igpu_index) {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, strconv.Itoa(i))
|
||||
}
|
||||
devices = append(devices, strconv.Itoa(i))
|
||||
val = strings.Join(devices, ",")
|
||||
os.Setenv("ROCR_VISIBLE_DEVICES", val)
|
||||
}
|
||||
val = strings.Join(devices, ",")
|
||||
os.Setenv("ROCR_VISIBLE_DEVICES", val)
|
||||
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
|
||||
}
|
||||
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
|
||||
resp.Library = "rocm"
|
||||
var version C.rocm_version_resp_t
|
||||
C.rocm_get_version(*gpuHandles.rocm, &version)
|
||||
verString := C.GoString(version.str)
|
||||
if version.status == 0 {
|
||||
resp.Variant = "v" + verString
|
||||
} else {
|
||||
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
|
||||
}
|
||||
C.free(unsafe.Pointer(version.str))
|
||||
}
|
||||
resp.Library = "rocm"
|
||||
var version C.rocm_version_resp_t
|
||||
C.rocm_get_version(*gpuHandles.rocm, &version)
|
||||
verString := C.GoString(version.str)
|
||||
if version.status == 0 {
|
||||
resp.Variant = "v" + verString
|
||||
} else {
|
||||
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
|
||||
}
|
||||
C.free(unsafe.Pointer(version.str))
|
||||
}
|
||||
}
|
||||
if resp.Library == "" {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue