Skip to content

Commit

Permalink
Tests and bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Lars T Hansen committed Mar 7, 2025
1 parent 72badae commit f7e0f18
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 2 deletions.
4 changes: 2 additions & 2 deletions util/formats/oldfmt/decode_samples.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ Reader:
m[key] = value
}

if _, found := m["version"]; !found {
if _, found := m["v"]; !found {
continue
}
t, found := m["time"]
Expand Down Expand Up @@ -132,8 +132,8 @@ Reader:
if err != nil {
continue Reader
}
envelope.Samples = append(envelope.Samples, sample)
}
envelope.Samples = append(envelope.Samples, sample)
}
if envelope != nil {
consume(envelope)
Expand Down
111 changes: 111 additions & 0 deletions util/formats/oldfmt_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// Test the decoders.

package formats

import (
"os"
"strings"
"testing"

"formats/oldfmt"
)

// TestSysinfo decodes testdata/sysinfo.json with oldfmt.ConsumeSysinfo and
// checks the envelopes handed to the callback.  The first record is verified
// field by field (including the GPU card at index 1); for the second record
// only the version and timestamp are checked.
func TestSysinfo(t *testing.T) {
	f, err := os.Open("testdata/sysinfo.json")
	if err != nil {
		t.Fatal(err)
	}
	defer f.Close()
	// There are two records in the file.
	var iter int
	err = oldfmt.ConsumeSysinfo(f, func(info *oldfmt.SysinfoEnvelope) {
		switch iter {
		case 0:
			assert(t, info.Version == "0.13.100", "#0 version")
			assert(t, info.Timestamp == "2025-03-01T00:00:01+01:00", "#0 time")
			assert(t, info.Hostname == "ml1.hpc.uio.no", "#0 host")
			assert(t, strings.HasPrefix(info.Description, "2x14 (hyperthreaded) Intel(R) Xeon(R)"), "#0 desc")
			assert(t, info.CpuCores == 56, "#0 cores")
			assert(t, info.MemGB == 125, "#0 memory")
			assert(t, info.GpuCards == 3, "#0 gpu-cards")
			assert(t, info.GpuMemGB == 33, "#0 gpu-mem")
			// The length assertion is fatal, so indexing below cannot go out of range.
			assert(t, len(info.GpuInfo) == 3, "#0 gpu-info len")
			g := info.GpuInfo[1]
			assert(t, g.BusAddress == "00000000:3B:00.0", "#0 addr")
			assert(t, g.Index == 1, "#0 index")
			assert(t, g.UUID == "GPU-be013a01-364d-ca23-f871-206fe3f259ba", "#0 UUID")
			assert(t, g.Manufacturer == "NVIDIA", "#0 manufacturer")
			assert(t, g.Model == "NVIDIA GeForce RTX 2080 Ti", "#0 model")
			assert(t, g.Architecture == "Turing", "#0 arch")
			assert(t, g.Driver == "550.127.08", "#0 driver")
			assert(t, g.Firmware == "12.4", "#0 firmware")
			assert(t, g.MemKB == 11534336, "#0 card mem")
			assert(t, g.PowerLimit == 250, "#0 power limit")
			assert(t, g.MaxPowerLimit == 280, "#0 max power limit")
			assert(t, g.MinPowerLimit == 100, "#0 min power limit")
			assert(t, g.MaxCEClock == 2100, "#0 max ce clock")
			assert(t, g.MaxMemClock == 7000, "#0 max memory clock")
		case 1:
			assert(t, info.Version == "0.13.200", "#1 version")
			assert(t, info.Timestamp == "2025-02-28T00:00:01+01:00", "#1 time")
			// The rest tested adequately above
		}
		iter++
	})
	// Check the decoder error before the record count: if decoding stopped
	// early, the error is the useful diagnostic and must not be hidden behind
	// an "Iteration count" failure.
	if err != nil {
		t.Fatal(err)
	}
	assert(t, iter == 2, "Iteration count")
}

// TestSamples decodes testdata/samples.csv with oldfmt.ConsumeCSVSamples and
// checks the envelopes handed to the callback.  For every envelope the
// timestamp, hostname and number of samples are verified; for the last
// envelope the sample at index 3 is additionally verified field by field.
func TestSamples(t *testing.T) {
	f, err := os.Open("testdata/samples.csv")
	if err != nil {
		t.Fatal(err)
	}
	defer f.Close()
	// There are three *groups* of records in the file.
	// FIXME: Currently none of them have cpuload or gpuload information.
	var iter int
	err = oldfmt.ConsumeCSVSamples(f, func(info *oldfmt.SampleEnvelope) {
		switch iter {
		case 0:
			assert(t, info.Timestamp == "2025-02-27T01:00:01+01:00", "#0 time")
			assert(t, info.Hostname == "c1-6.fox", "#0 host")
			assert(t, len(info.Samples) == 4, "#0 samples")
		case 1:
			assert(t, info.Timestamp == "2025-02-27T01:00:01+01:00", "#1 time")
			assert(t, info.Hostname == "gpu-11.fox", "#1 host")
			assert(t, len(info.Samples) == 8, "#1 samples")
		case 2:
			assert(t, info.Timestamp == "2025-02-27T01:05:01+01:00", "#2 time")
			assert(t, info.Hostname == "gpu-11.fox", "#2 host")
			// The length assertion is fatal, so indexing below cannot go out of range.
			assert(t, len(info.Samples) == 8, "#2 samples")
			s := info.Samples[3]
			assert(t, s.User == "ec-dhananjt", "#2 user")
			assert(t, s.Cmd == "python", "#2 cmd")
			assert(t, s.JobId == 1345347, "#2 job")
			assert(t, s.Pid == 2164020, "#2 pid")
			assert(t, s.ParentPid == 2163996, "#2 ppid")
			assert(t, s.CpuPct == 898.6, "#2 cpu%")
			assert(t, s.CpuKib == 77012712, "#2 cpukib")
			assert(t, s.RssAnonKib == 70950544, "#2 rss")
			assert(t, s.Gpus == "3", "#2 gpus")
			assert(t, s.GpuPct == 47, "#2 gpu%")
			assert(t, s.GpuMemPct == 7, "#2 gpumem%")
			assert(t, s.GpuKib == 23418880, "#2 gpukib")
			assert(t, s.CpuTimeSec == 5369665, "#2 cputime")
		}
		iter++
	})
	// Check the decoder error before the group count: if decoding stopped
	// early, the error is the useful diagnostic and must not be hidden behind
	// an "Iteration count" failure.
	if err != nil {
		t.Fatal(err)
	}
	assert(t, iter == 3, "Iteration count")
}

// assert fails the test immediately with msg when the condition c is false.
// It is marked as a test helper so that the failure is attributed to the
// line of the caller rather than to this function.
func assert(t *testing.T, c bool, msg string) {
	t.Helper()
	if !c {
		t.Fatal(msg)
	}
}
20 changes: 20 additions & 0 deletions util/formats/testdata/samples.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=c1-6.fox,user=ec-tsauren,cmd=python3,job=1351930,ppid=2200718,cpu%=51.3,cpukib=194835888,rssanonkib=74255936,cputime_sec=9534,rolledup=255
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=c1-6.fox,user=ec-tsauren,cmd=python3,job=1351930,pid=2200663,ppid=2200654,cpu%=53.5,cpukib=1039564,rssanonkib=532600,cputime_sec=11213
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=c1-6.fox,user=ec-lgcharpe,cmd=pt_main_thread,job=1352526,pid=2202575,ppid=2200641,cpu%=87.4,cpukib=166299984,rssanonkib=163705808,cputime_sec=57058
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=c1-6.fox,user=ec-marcink,cmd=mpihello,pid=2165818,ppid=1,cpu%=1.5,cpukib=6020,rssanonkib=1024,cputime_sec=40613
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=gpu-11.fox,user=ec-dhananjt,cmd=python,job=1345348,ppid=2164018,cpu%=118.9,cpukib=757555664,rssanonkib=691719512,cputime_sec=1010,rolledup=9
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=gpu-11.fox,user=ec-sindrre,cmd=python,job=1350861,pid=2877884,ppid=2877803,cpu%=0.4,cpukib=1010364,rssanonkib=549792,cputime_sec=502
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=gpu-11.fox,user=ec-dhananjt,cmd=python,job=1345347,pid=2164020,ppid=2163996,cpu%=898.6,cpukib=74012704,rssanonkib=68236740,gpus=3,gpu%=44,gpumem%=5,gpukib=23418880,cputime_sec=5367095
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=gpu-11.fox,user=ec-dhananjt,cmd=python,job=1345347,ppid=2164020,cpu%=98.2,cpukib=843997864,rssanonkib=780110372,cputime_sec=8290,rolledup=9
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=gpu-11.fox,user=ec-sindrre,cmd=ollama,job=1350861,pid=2877829,ppid=2877803,cpu%=4.6,cpukib=607000,rssanonkib=537008,cputime_sec=117271
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=gpu-11.fox,user=ec-sindrre,cmd=ollama_llama_se,job=1350861,ppid=2877829,cpu%=105.2,cpukib=1084072,rssanonkib=601468,"gpus=4,5,6,0",gpu%=70,gpumem%=45,gpukib=68816896,cputime_sec=4316,rolledup=1
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=gpu-11.fox,user=ec-dhananjt,cmd=python,job=1345348,pid=2164018,ppid=2163998,cpu%=917.6,cpukib=75652164,rssanonkib=69142664,gpus=2,gpu%=47,gpumem%=6,gpukib=23418880,cputime_sec=5486448
v=0.13.200,time=2025-02-27T01:00:01+01:00,host=gpu-11.fox,user=ec-sindrre,cmd=nvidia-smi,job=1350861,pid=2877827,ppid=2877803,cpu%=0.3,cpukib=14588,rssanonkib=14096,cputime_sec=337
v=0.13.200,time=2025-02-27T01:05:01+01:00,host=gpu-11.fox,user=ec-dhananjt,cmd=python,job=1345348,ppid=2164018,cpu%=114.7,cpukib=761488280,rssanonkib=695407112,cputime_sec=1319,rolledup=9
v=0.13.200,time=2025-02-27T01:05:01+01:00,host=gpu-11.fox,user=ec-dhananjt,cmd=python,job=1345347,ppid=2164020,cpu%=98.1,cpukib=848105108,rssanonkib=784025960,cputime_sec=8573,rolledup=9
v=0.13.200,time=2025-02-27T01:05:01+01:00,host=gpu-11.fox,user=ec-sindrre,cmd=ollama,job=1350861,pid=2877829,ppid=2877803,cpu%=4.6,cpukib=606416,rssanonkib=538532,cputime_sec=117595
v=0.13.200,time=2025-02-27T01:05:01+01:00,host=gpu-11.fox,user=ec-dhananjt,cmd=python,job=1345347,pid=2164020,ppid=2163996,cpu%=898.6,cpukib=77012712,rssanonkib=70950544,gpus=3,gpu%=47,gpumem%=7,gpukib=23418880,cputime_sec=5369665
v=0.13.200,time=2025-02-27T01:05:01+01:00,host=gpu-11.fox,user=ec-sindrre,cmd=ollama_llama_se,job=1350861,ppid=2877829,cpu%=107.1,cpukib=805776,rssanonkib=464144,"gpus=4,0,6,5",gpu%=102,gpukib=64337920,cputime_sec=4305,rolledup=1
v=0.13.200,time=2025-02-27T01:05:01+01:00,host=gpu-11.fox,user=ec-sindrre,cmd=python,job=1350861,pid=2877884,ppid=2877803,cpu%=0.4,cpukib=1010364,rssanonkib=549792,cputime_sec=503
v=0.13.200,time=2025-02-27T01:05:01+01:00,host=gpu-11.fox,user=ec-dhananjt,cmd=python,job=1345348,pid=2164018,ppid=2163998,cpu%=917.6,cpukib=76652164,rssanonkib=70141132,gpus=2,gpu%=51,gpumem%=8,gpukib=23418880,cputime_sec=5488962
v=0.13.200,time=2025-02-27T01:05:01+01:00,host=gpu-11.fox,user=ec-sindrre,cmd=nvidia-smi,job=1350861,pid=2877827,ppid=2877803,cpu%=0.3,cpukib=14588,rssanonkib=14096,cputime_sec=338
27 changes: 27 additions & 0 deletions util/formats/testdata/sysinfo.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"version": "0.13.100",
"timestamp": "2025-03-01T00:00:01+01:00",
"hostname": "ml1.hpc.uio.no",
"description": "2x14 (hyperthreaded) Intel(R) Xeon(R) Gold 5120 CPU @ 2.20GHz, 125 GiB, 3x NVIDIA GeForce RTX 2080 Ti @ 11GiB",
"cpu_cores": 56,
"mem_gb": 125,
"gpu_cards": 3,
"gpumem_gb": 33,
"gpu_info": [
{"bus_addr":"00000000:18:00.0", "index":0, "uuid":"GPU-35080357-601c-7113-ec05-f6ca1e58a91e",
"manufacturer":"NVIDIA", "model":"NVIDIA GeForce RTX 2080 Ti", "arch":"Turing", "driver":"550.127.08", "firmware":"12.4",
"mem_size_kib":11534336,
"power_limit_watt":250, "max_power_limit_watt":280, "min_power_limit_watt":100,
"max_ce_clock_mhz":2100, "max_mem_clock_mhz":7000},
{"bus_addr":"00000000:3B:00.0", "index":1, "uuid":"GPU-be013a01-364d-ca23-f871-206fe3f259ba",
"manufacturer":"NVIDIA", "model":"NVIDIA GeForce RTX 2080 Ti", "arch":"Turing", "driver":"550.127.08", "firmware":"12.4",
"mem_size_kib":11534336,
"power_limit_watt":250, "max_power_limit_watt":280, "min_power_limit_watt":100,
"max_ce_clock_mhz":2100, "max_mem_clock_mhz":7000},
{"bus_addr":"00000000:86:00.0", "index":2, "uuid":"GPU-daa9f6ac-c8bf-87be-8adc-89b1e7d3f38a",
"manufacturer":"NVIDIA", "model":"NVIDIA GeForce RTX 2080 Ti", "arch":"Turing", "driver":"550.127.08", "firmware":"12.4",
"mem_size_kib":11534336,
"power_limit_watt":250, "max_power_limit_watt":280, "min_power_limit_watt":100,
"max_ce_clock_mhz":2100, "max_mem_clock_mhz":7000}]
}
{"version":"0.13.200","timestamp":"2025-02-28T00:00:01+01:00","hostname":"c1-6.fox","description":"2x64 AMD EPYC 7702 64-Core Processor, 503 GiB","cpu_cores":128,"mem_gb":503}

0 comments on commit f7e0f18

Please sign in to comment.