-
Notifications
You must be signed in to change notification settings - Fork 0
/
nvml_system.go
105 lines (100 loc) · 3.28 KB
/
nvml_system.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
package main
import (
pb "GCS-Info-Catch/proto"
"errors"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"log"
"strconv"
)
// NvmlUtilizationRate serves one NVML query over a gRPC server stream.
//
// It initializes NVML, resolves every GPU index listed in req.GetIndexID()
// (split by stringTrimHandler), and for each device collects the GPU
// utilization, the used-memory percentage, the temperature and whether any
// compute process is running. The collected slices are sent to the client in
// a single NvmlInfoRespondMsg. NVML is shut down before the method returns.
//
// A value that cannot be obtained is recorded as the sentinel 99999 in the
// corresponding slice, and the remaining metrics for that GPU are skipped.
// NOTE(review): because of that skip, the result slices can have different
// lengths after a per-device failure; confirm that clients tolerate this.
//
// It returns nil on success, an error carrying the NVML error string when
// NVML cannot be initialized, or the stream error when sending fails.
func (g *GCSInfoCatchServer) NvmlUtilizationRate(req *pb.NvmlInfoReuqestMsg, stream pb.GcsInfoCatchServiceDocker_NvmlUtilizationRateServer) error {
	// Sentinel stored in a result slice when the real value is unavailable.
	const invalidValue = 99999

	log.Printf("Get GRPC request, indexId is %v\n", req.GetIndexID())
	ret := nvml.Init()
	if ret != nvml.SUCCESS {
		log.Printf("Unable to initialize NVML:%v\n", nvml.ErrorString(ret))
		return errors.New(nvml.ErrorString(ret))
	}
	log.Println("nvml.Init() ok")
	defer func() {
		// A shutdown failure is only logged; the RPC result is already decided.
		if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
			log.Printf("Unable to shutdown NVML:%v\n", nvml.ErrorString(ret))
		}
	}()
	/*count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Printf("Unable to get device count:%v\n", nvml.ErrorString(ret))
		return errors.New(nvml.ErrorString(ret))
	}*/
	var indexID []int32
	var utilizationRate []uint32
	var memRate []uint64
	var temperature []uint32
	var occupied []uint32

	// The request may carry several indices separated by ",".
	indexArr := stringTrimHandler(req.GetIndexID())
	for _, gpuIndex := range indexArr {
		gpuIndexInt, err := strconv.Atoi(gpuIndex)
		if err != nil {
			// Do not fall back to device 0 for a malformed index; report it
			// the same way as a device that cannot be opened.
			log.Printf("Invalid GPU index %q:%v\n", gpuIndex, err)
			indexID = append(indexID, invalidValue)
			continue
		}
		device, ret := nvml.DeviceGetHandleByIndex(gpuIndexInt)
		if ret != nvml.SUCCESS {
			log.Printf("Unable to get device at index:%v\n", nvml.ErrorString(ret))
			indexID = append(indexID, invalidValue) // 99999 marks a failure
			continue
		}
		// Record the GPU index.
		indexID = append(indexID, int32(gpuIndexInt))

		// Record the GPU utilization.
		rate, ret := device.GetUtilizationRates()
		if ret != nvml.SUCCESS {
			log.Printf("Unable to get device GetUtilizationRates at index:%v\n", nvml.ErrorString(ret))
			utilizationRate = append(utilizationRate, invalidValue)
			continue
		}
		utilizationRate = append(utilizationRate, rate.Gpu)

		// Record the used-memory percentage.
		memInfo, ret := device.GetMemoryInfo()
		if ret != nvml.SUCCESS {
			log.Printf("Unable to get device memory used at index:%v\n", nvml.ErrorString(ret))
			memRate = append(memRate, invalidValue)
			continue
		}
		if memInfo.Total == 0 {
			// Guard the division below: a zero total would panic.
			log.Printf("Device at index %d reports zero total memory\n", gpuIndexInt)
			memRate = append(memRate, invalidValue)
		} else {
			memRate = append(memRate, memInfo.Used*100/memInfo.Total)
		}

		// Record the GPU temperature. The reading must be appended to the
		// temperature slice itself, not derived from utilizationRate.
		temp, ret := device.GetTemperature(nvml.TemperatureSensors(0))
		if ret != nvml.SUCCESS {
			log.Printf("Unable to get device GetTemperature at index:%v\n", nvml.ErrorString(ret))
			temperature = append(temperature, invalidValue)
			continue
		}
		temperature = append(temperature, temp)

		// Record whether the GPU is occupied by any compute process.
		process, ret := device.GetComputeRunningProcesses()
		if ret != nvml.SUCCESS {
			log.Printf("Unable to get device GetComputeRunningProcesses at index:%v\n", nvml.ErrorString(ret))
			occupied = append(occupied, invalidValue)
			continue
		}
		if len(process) > 0 {
			occupied = append(occupied, 1) // 1: occupied
		} else {
			occupied = append(occupied, 0) // 0: not occupied
		}
	}
	log.Printf("GPUIndex get %v\n", indexID)
	log.Printf("utilizationRate get %v memRate get %v\n", utilizationRate, memRate)
	//log.Printf("temperature get %v\n", temperature)
	log.Printf("occupied get %v\n", occupied)

	if err := stream.Send(&pb.NvmlInfoRespondMsg{
		IndexID:         indexID,
		UtilizationRate: utilizationRate,
		MemRate:         memRate,
		Temperature:     temperature,
		Occupied:        occupied,
	}); err != nil {
		log.Printf("Stream send error:%v", err)
		// Return the stream's own error so its cause is not replaced by a
		// generic NVML "unknown" message.
		return err
	}
	log.Println("grpc stream send ok")
	return nil
}