diff --git a/docs/superpowers/contracts/usage-report-hz-cpu-fix-20260604.json b/docs/superpowers/contracts/usage-report-hz-cpu-fix-20260604.json new file mode 100644 index 0000000..1062dbb --- /dev/null +++ b/docs/superpowers/contracts/usage-report-hz-cpu-fix-20260604.json @@ -0,0 +1,15 @@ +{ + "title": "Fix HZ-as-CPU off-by-one in usage_report and restore the native helper", + "objective": "The usage report must attribute CPU time from atop's utime/stime columns, not its HZ field. Success means short-lived, near-zero-CPU processes (xset, dd, chronyc, sleep) no longer appear in the Top CPU table, real consumers rank correctly, and both the Python parser and the restored native C aggregator agree on the numbers.", + "acceptance_criteria": [ + "_parse_prc skips the HZ field; xset/dd/chronyc report ~0 CPU-seconds.", + "The native C helper builds from linux_configuration/C/atop_agg and emits the same CPU figures as the Python path.", + "A regression test feeds a raw PRC line including the HZ field and fails against the pre-fix indices.", + "usage_report.py runs end-to-end and the Top CPU table is led by genuine consumers (e.g. SkyrimSE)." + ], + "out_of_scope": [ + "Rewriting the digital_wellbeing polling daemons that generate the fork storm.", + "Achieving 100% line coverage of the C helper's defensive OOM/hash-full paths." 
+ ], + "verifier": "python3 usage_report.py --date 20260604; make test in linux_configuration/C/atop_agg; python3 -m pytest linux_configuration/tests/test_usage_report_since.py" +} diff --git a/docs/superpowers/evidence/usage-report-hz-cpu-fix-20260604.json b/docs/superpowers/evidence/usage-report-hz-cpu-fix-20260604.json new file mode 100644 index 0000000..2149504 --- /dev/null +++ b/docs/superpowers/evidence/usage-report-hz-cpu-fix-20260604.json @@ -0,0 +1,40 @@ +{ + "intent": "Stop the usage report from charging atop's per-record HZ field as CPU time, which made short-lived processes (xset, dd, chronyc, sleep) appear as the top CPU consumers (xset reported 67h of CPU in a 5h40m window). After the fix the CPU table reflects real consumers (SkyrimSE, zstd, the video-capture pipeline) and the fork storm shows only in the accurate PID-count column.", + "scope": [ + "linux_configuration/scripts/periodic_background/system-maintenance/bin/_usage_report_parsing.py", + "linux_configuration/C/atop_agg/ (restored native helper with the same fix)", + "linux_configuration/tests/test_usage_report_since.py (regression tests)", + "Non-goal: rewriting the digital_wellbeing daemons that cause the fork storm" + ], + "changes": [ + "_parse_prc now reads utime/stime at after+2/after+3, skipping atop's HZ field that sits between state and utime; bumped _PRC_MIN_LEN 11 to 12.", + "_atop_agg_binary returns None (Python fallback) when the C source tree is absent, instead of trusting an orphaned cached binary; removed the stale ~/.cache/usage_report/atop_agg.", + "Restored C/atop_agg from git history into linux_configuration/C/atop_agg with the identical HZ-skip fix (tokens[10]/[11]), guard bumped to n<12, redundant PRM length check removed, and test fixtures corrected to include the HZ field.", + "Added Python regression tests asserting HZ is not summed as CPU and that a missing C source falls back to Python." 
+ ], + "verification": [ + { + "command": "python3 usage_report.py --date 20260604 --no-clipboard --quiet", + "result": "pass", + "evidence": "Top CPU now led by SkyrimSE.exe 933s; xset/dd/chronyc dropped out entirely (real CPU ~0). Cross-checked against atop directly with corrected field indices." + }, + { + "command": "make test (linux_configuration/C/atop_agg)", + "result": "pass", + "evidence": "atop_agg tests: OK. Rebuilt binary emits xset cpu_ticks=0 vs 24427000 before." + }, + { + "command": "python3 -m pytest test_usage_report_since.py -k 'parse_prc or atop_agg_binary'", + "result": "pass", + "evidence": "4 passed. Buggy indices would yield 107 ticks vs the asserted 10, so the regression test fails against the old code." + } + ], + "risks": [ + "Native fast path needs a C compiler; without cc the report now falls back to the (slower) Python parser rather than a stale binary.", + "C helper coverage remains below 100% on defensive OOM/hash-full paths (pre-existing; the suite is not coverage-gated for linux_configuration)." + ], + "rollback": [ + "git checkout the parsing module and remove linux_configuration/C/atop_agg to revert.", + "Re-run usage_report.py --date 20260604 and confirm whether xset reappears with inflated CPU." + ] +} diff --git a/linux_configuration/C/atop_agg/.gitignore b/linux_configuration/C/atop_agg/.gitignore new file mode 100644 index 0000000..4b2a881 --- /dev/null +++ b/linux_configuration/C/atop_agg/.gitignore @@ -0,0 +1,9 @@ +# Build artifacts — atop_agg is rebuilt locally (and into ~/.cache/usage_report +# by usage_report.py); never commit the compiled binary or coverage output. 
+atop_agg +test_atop_agg +*.o +*.gcda +*.gcno +coverage.info +coverage_html/ diff --git a/linux_configuration/C/atop_agg/Makefile b/linux_configuration/C/atop_agg/Makefile new file mode 100644 index 0000000..7317260 --- /dev/null +++ b/linux_configuration/C/atop_agg/Makefile @@ -0,0 +1,33 @@ +CC := gcc +CFLAGS := -O2 -std=c11 -D_POSIX_C_SOURCE=200809L -Wall -Wextra -Wno-unused-parameter +COV := -O0 -g --coverage -std=c11 -D_POSIX_C_SOURCE=200809L -Wall -Wextra -Wno-unused-parameter -DATOP_AGG_NO_MAIN + +SRC := atop_agg.c +HDR := atop_agg.h +BIN := atop_agg + +.PHONY: all clean rebuild test coverage + +all: $(BIN) + +$(BIN): $(SRC) $(HDR) + $(CC) $(CFLAGS) -o $@ $(SRC) + +test_atop_agg: test_atop_agg.c atop_agg.c atop_agg.h + $(CC) $(COV) -o test_atop_agg test_atop_agg.c atop_agg.c + +test: test_atop_agg + ./test_atop_agg + +coverage: test_atop_agg + ./test_atop_agg + lcov --capture --directory . --output-file coverage.info --no-external + lcov --remove coverage.info '*/test_atop_agg.c' --output-file coverage.info + genhtml coverage.info --output-directory coverage_html + @echo "Coverage report at coverage_html/index.html" + +clean: + rm -f $(BIN) test_atop_agg *.o *.gcda *.gcno coverage.info + rm -rf coverage_html + +rebuild: clean all diff --git a/linux_configuration/C/atop_agg/atop_agg.c b/linux_configuration/C/atop_agg/atop_agg.c new file mode 100644 index 0000000..dd1d9c6 --- /dev/null +++ b/linux_configuration/C/atop_agg/atop_agg.c @@ -0,0 +1,478 @@ +/* + * atop_agg — fast per-PID aggregator for `atop -P PRC,PRM` output. + * + * Reads atop parseable output on stdin, folds it into per-PID CPU-tick + * and RSS trackers, and prints a compact TSV summary on stdout that a + * higher-level driver (Python) then name-folds into human-readable + * tables. This avoids the ~3s Python parse cost on a typical day's + * 1.7M-line atop dump; the C hot loop completes in well under a second + * so the pipeline runs at atop's own ~2s wall-clock floor. 
+ * + * Output TSV lines (fields are tab-separated, written here as \t): + * W\tstart_epoch\tend_epoch\tdistinct_samples\tmedian_interval + * N\tname\tcpu_ticks\tpeak_kb\tsum_avg_kb\trss_samples\tpids + * (one N row per process name; PIDs are folded by name before output) + */ +#include "atop_agg.h" + +#include +#include +#include +#include +#include + +/* + * A real-world day of atop on a dev box can see >700k distinct PIDs + * because every short-lived compiler/shell subprocess gets a fresh ID. + * 2M slots keeps the load factor below ~40% for that workload, keeping + * linear-probe chains short without dynamic resizing. + */ +#define HASH_CAP_BITS 21 +#define HASH_CAP (1u << HASH_CAP_BITS) +#define HASH_MASK (HASH_CAP - 1u) +#define MAX_EPOCHS 4096 +#define MAX_TOKENS 64 + +/* Knuth multiplicative hash → index in an open-addressed table. */ +static unsigned int hash_pid(int pid) +{ + unsigned int k = (unsigned int)pid; + return (k * 2654435761u) >> (32 - HASH_CAP_BITS); +} + +static PidCpu *cpu_slot(State *s, int pid) +{ + unsigned int h = hash_pid(pid); + for (unsigned int probes = 0; probes < HASH_CAP; probes++, h++) + { + PidCpu *slot = &s->cpu[h & HASH_MASK]; + if (slot->pid == pid) + { + return slot; + } + if (slot->pid == 0) + { + slot->pid = pid; + slot->first_ticks = -1; + slot->last_ticks = 0; + slot->samples = 0; + slot->name[0] = '\0'; + return slot; + } + } + /* Table full — drop the sample rather than loop forever. */ + return NULL; +} + +static PidRam *ram_slot(State *s, int pid) +{ + unsigned int h = hash_pid(pid); + for (unsigned int probes = 0; probes < HASH_CAP; probes++, h++) + { + PidRam *slot = &s->ram[h & HASH_MASK]; + if (slot->pid == pid) + { + return slot; + } + if (slot->pid == 0) + { + slot->pid = pid; + slot->peak_kb = 0; + slot->sum_kb = 0; + slot->samples = 0; + slot->name[0] = '\0'; + return slot; + } + } + return NULL; +} + +static void add_epoch(State *s, long epoch) +{ + /* Linear scan — there are only a few dozen distinct epochs per log.
*/ + for (int i = 0; i < s->n_epochs; i++) + { + if (s->epochs[i] == epoch) + { + return; + } + } + if (s->n_epochs < MAX_EPOCHS) + { + s->epochs[s->n_epochs++] = epoch; + } +} + +/* + * Tokenise a whitespace-separated line in place. Fills *tokens* with + * pointers into *line* and returns the token count. A process name + * wrapped in parentheses is rejoined into a single token with spaces + * preserved (atop emits `(Web Content)` as three whitespace-split + * tokens, which we merge back). + */ +int tokenize_line(char *line, char **tokens, int max_tokens) +{ + int n = 0; + char *p = line; + while (*p && n < max_tokens) + { + while (*p == ' ' || *p == '\t') + { + p++; + } + if (!*p || *p == '\n') + { + break; + } + char *start = p; + if (*p == '(') + { + /* Consume through the matching ')', preserving interior spaces. */ + while (*p && *p != ')') + { + p++; + } + if (*p == ')') + { + p++; + } + } + else + { + while (*p && *p != ' ' && *p != '\t' && *p != '\n') + { + p++; + } + } + if (*p) + { + *p = '\0'; + p++; + } + tokens[n++] = start; + } + return n; +} + +/* + * Copy *src* into *dst* (capacity *cap*), stripping a leading '(' and + * trailing ')' if both are present. Always null-terminates. If the + * resulting name is empty, writes "unknown". + */ +void copy_name(char *dst, size_t cap, const char *src) +{ + size_t len = strlen(src); + size_t start = 0; + if (len >= 2 && src[0] == '(' && src[len - 1] == ')') + { + start = 1; + len -= 2; + } + if (len == 0) + { + const char *fallback = "unknown"; + size_t flen = strlen(fallback); + if (flen >= cap) + { + flen = cap - 1; + } + memcpy(dst, fallback, flen); + dst[flen] = '\0'; + return; + } + if (len >= cap) + { + len = cap - 1; + } + memcpy(dst, src + start, len); + dst[len] = '\0'; +} + +/* + * Parse one PRC/PRM line and update *s*. Unknown labels and malformed + * records are silently skipped (atop emits a stable schema, but guard + * against future changes and header/separator lines). 
+ */ +void process_line(char *line, State *s) +{ + char *tokens[MAX_TOKENS]; + int n = tokenize_line(line, tokens, MAX_TOKENS); + /* Both PRC and PRM need >= 12 fields: the 6-field generic prefix, pid, + (name), state, atop's per-label extra field (HZ for PRC / pagesize for + PRM), then the first data column we read at index 10/11. */ + if (n < 12) + { + return; + } + const char *label = tokens[0]; + int is_prc = (label[0] == 'P' && label[1] == 'R' && label[2] == 'C' && label[3] == '\0'); + int is_prm = (label[0] == 'P' && label[1] == 'R' && label[2] == 'M' && label[3] == '\0'); + if (!is_prc && !is_prm) + { + return; + } + long epoch = strtol(tokens[2], NULL, 10); + int pid = (int)strtol(tokens[6], NULL, 10); + if (pid <= 0) + { + return; + } + const char *name_tok = tokens[7]; + if (is_prc) + { + /* atop inserts its clock-tick rate (HZ) at tokens[9], between the + state field and utime/stime, so the CPU columns live at [10]/[11]. + Reading [9] charged a constant HZ (100) as CPU to every record — + the bug this fixes. */ + long utime = strtol(tokens[10], NULL, 10); + long stime = strtol(tokens[11], NULL, 10); + long ticks = utime + stime; + add_epoch(s, epoch); + PidCpu *slot = cpu_slot(s, pid); + if (slot == NULL) + { + return; + } + if (slot->first_ticks < 0) + { + slot->first_ticks = ticks; + } + slot->last_ticks = ticks; + slot->samples++; + copy_name(slot->name, sizeof(slot->name), name_tok); + return; + } + /* PRM: rsize_kb sits at tokens[11] (after state, pagesize, vsize); the + n < 12 length guard at the top already guarantees it is present. 
*/ + long rsize_kb = strtol(tokens[11], NULL, 10); + PidRam *slot = ram_slot(s, pid); + if (slot == NULL) + { + return; + } + if (rsize_kb > slot->peak_kb) + { + slot->peak_kb = rsize_kb; + } + slot->sum_kb += rsize_kb; + slot->samples++; + copy_name(slot->name, sizeof(slot->name), name_tok); +} + +static int cmp_long(const void *a, const void *b) +{ + long la = *(const long *)a; + long lb = *(const long *)b; + if (la < lb) + { + return -1; + } + if (la > lb) + { + return 1; + } + return 0; +} + +/* FNV-1a 32-bit over a NUL-terminated string; used to key the name table. */ +static unsigned int fnv1a(const char *s) +{ + unsigned int h = 2166136261u; + while (*s) + { + h ^= (unsigned char)*s++; + h *= 16777619u; + } + return h; +} + +/* + * Per-name aggregate, built in a second pass over cpu/ram tables so that + * the caller only has to parse a few thousand output rows instead of one + * row per PID. The name table is deliberately oversized (64k slots for an + * expected few-thousand names) to keep linear-probe chains short. + */ +#define NAME_CAP_BITS 16 +#define NAME_CAP (1u << NAME_CAP_BITS) +#define NAME_MASK (NAME_CAP - 1u) + +typedef struct +{ + char name[ATOP_AGG_NAME_MAX]; + long cpu_ticks; + int cpu_pids; + long peak_kb; + long sum_avg_kb; + int rss_samples; + int ram_pids; + char used; +} NameAgg; + +static NameAgg *name_slot(NameAgg *table, const char *name) +{ + unsigned int h = fnv1a(name); + for (unsigned int probes = 0; probes < NAME_CAP; probes++, h++) + { + NameAgg *slot = &table[h & NAME_MASK]; + if (!slot->used) + { + slot->used = 1; + /* copy_name already enforced \0-termination on the source. */ + size_t i = 0; + while (name[i] && i + 1 < sizeof(slot->name)) + { + slot->name[i] = name[i]; + i++; + } + slot->name[i] = '\0'; + return slot; + } + if (strcmp(slot->name, name) == 0) + { + return slot; + } + } + return NULL; +} + +/* Write the aggregated summary to *out* in the documented TSV schema. 
*/ +void emit_results(State *s, FILE *out) +{ + long start_epoch = 0; + long end_epoch = 0; + long median_interval = 0; + if (s->n_epochs > 0) + { + qsort(s->epochs, (size_t)s->n_epochs, sizeof(long), cmp_long); + start_epoch = s->epochs[0]; + end_epoch = s->epochs[s->n_epochs - 1]; + if (s->n_epochs >= 2) + { + long deltas[MAX_EPOCHS]; + for (int i = 0; i < s->n_epochs - 1; i++) + { + deltas[i] = s->epochs[i + 1] - s->epochs[i]; + } + qsort(deltas, (size_t)(s->n_epochs - 1), sizeof(long), cmp_long); + median_interval = deltas[(s->n_epochs - 1) / 2]; + } + } + fprintf(out, "W\t%ld\t%ld\t%d\t%ld\n", start_epoch, end_epoch, s->n_epochs, median_interval); + + NameAgg *names = calloc(NAME_CAP, sizeof(NameAgg)); + if (!names) + { + return; + } + for (unsigned int i = 0; i < HASH_CAP; i++) + { + PidCpu *slot = &s->cpu[i]; + if (slot->pid == 0) + { + continue; + } + long delta = slot->last_ticks; + if (slot->samples >= 2) + { + delta = slot->last_ticks - slot->first_ticks; + if (delta < 0) + { + delta = 0; + } + } + NameAgg *na = name_slot(names, slot->name); + if (!na) + { + continue; + } + na->cpu_ticks += delta; + na->cpu_pids++; + } + for (unsigned int i = 0; i < HASH_CAP; i++) + { + PidRam *slot = &s->ram[i]; + if (slot->pid == 0) + { + continue; + } + long avg_kb = slot->samples ? slot->sum_kb / slot->samples : 0; + NameAgg *na = name_slot(names, slot->name); + if (!na) + { + continue; + } + if (slot->peak_kb > na->peak_kb) + { + na->peak_kb = slot->peak_kb; + } + na->sum_avg_kb += avg_kb; + na->rss_samples++; + na->ram_pids++; + } + for (unsigned int i = 0; i < NAME_CAP; i++) + { + NameAgg *na = &names[i]; + if (!na->used) + { + continue; + } + int pids = na->cpu_pids > na->ram_pids ? 
na->cpu_pids : na->ram_pids; + fprintf(out, "N\t%s\t%ld\t%ld\t%ld\t%d\t%d\n", na->name, na->cpu_ticks, na->peak_kb, + na->sum_avg_kb, na->rss_samples, pids); + } + free(names); +} + +State *state_new(void) +{ + State *s = calloc(1, sizeof(State)); + if (!s) + { + return NULL; + } + s->cpu = calloc(HASH_CAP, sizeof(PidCpu)); + s->ram = calloc(HASH_CAP, sizeof(PidRam)); + s->epochs = calloc(MAX_EPOCHS, sizeof(long)); + if (!s->cpu || !s->ram || !s->epochs) + { + state_free(s); + return NULL; + } + s->n_epochs = 0; + return s; +} + +void state_free(State *s) +{ + if (!s) + { + return; + } + free(s->cpu); + free(s->ram); + free(s->epochs); + free(s); +} + +#ifndef ATOP_AGG_NO_MAIN +int main(void) +{ + State *s = state_new(); + if (!s) + { + fprintf(stderr, "atop_agg: out of memory\n"); + return 1; + } + char *line = NULL; + size_t cap = 0; + ssize_t got; + while ((got = getline(&line, &cap, stdin)) != -1) + { + process_line(line, s); + } + free(line); + emit_results(s, stdout); + state_free(s); + return 0; +} +#endif diff --git a/linux_configuration/C/atop_agg/atop_agg.h b/linux_configuration/C/atop_agg/atop_agg.h new file mode 100644 index 0000000..6503199 --- /dev/null +++ b/linux_configuration/C/atop_agg/atop_agg.h @@ -0,0 +1,42 @@ +#ifndef ATOP_AGG_H +#define ATOP_AGG_H + +#include + +/* NAME_MAX capped to keep slot size compact; typical atop comm is 15 chars. 
*/ +#define ATOP_AGG_NAME_MAX 40 + +typedef struct +{ + int pid; + char name[ATOP_AGG_NAME_MAX]; + long first_ticks; + long last_ticks; + int samples; +} PidCpu; + +typedef struct +{ + int pid; + char name[ATOP_AGG_NAME_MAX]; + long peak_kb; + long sum_kb; + int samples; +} PidRam; + +typedef struct +{ + PidCpu *cpu; + PidRam *ram; + long *epochs; + int n_epochs; +} State; + +State *state_new(void); +void state_free(State *s); +int tokenize_line(char *line, char **tokens, int max_tokens); +void copy_name(char *dst, size_t cap, const char *src); +void process_line(char *line, State *s); +void emit_results(State *s, FILE *out); + +#endif diff --git a/linux_configuration/C/atop_agg/run.sh b/linux_configuration/C/atop_agg/run.sh new file mode 100755 index 0000000..fd2a98d --- /dev/null +++ b/linux_configuration/C/atop_agg/run.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# Build and demo atop_agg on today's atop log. +set -euo pipefail +cd "$(dirname "$0")" +make +LOG="${1:-/var/log/atop/atop_$(date +%Y%m%d)}" +if [[ ! -f "$LOG" ]]; then + echo "No atop log at $LOG; pass a path as arg 1." >&2 + exit 1 +fi +echo "Aggregating $LOG ..." >&2 +atop -r "$LOG" -P PRC,PRM | ./atop_agg | head -20 diff --git a/linux_configuration/C/atop_agg/test_atop_agg.c b/linux_configuration/C/atop_agg/test_atop_agg.c new file mode 100644 index 0000000..ca5f2f3 --- /dev/null +++ b/linux_configuration/C/atop_agg/test_atop_agg.c @@ -0,0 +1,229 @@ +/* + * Unit tests for atop_agg helpers. Compiled with --coverage; covers the + * parsing and aggregation paths of atop_agg.c, but not its defensive OOM / + * hash-table-full branches nor main (compiled out by -DATOP_AGG_NO_MAIN).
+ */ +#include "atop_agg.h" + +#include +#include +#include +#include + +static int failures = 0; + +#define CHECK(cond) \ + do \ + { \ + if (!(cond)) \ + { \ + fprintf(stderr, "FAIL %s:%d: %s\n", __FILE__, __LINE__, #cond); \ + failures++; \ + } \ + } while (0) + +static void test_copy_name(void) +{ + char buf[16]; + copy_name(buf, sizeof(buf), "(bash)"); + CHECK(strcmp(buf, "bash") == 0); + + copy_name(buf, sizeof(buf), "bash"); + CHECK(strcmp(buf, "bash") == 0); + + copy_name(buf, sizeof(buf), "()"); + CHECK(strcmp(buf, "unknown") == 0); + + copy_name(buf, sizeof(buf), ""); + CHECK(strcmp(buf, "unknown") == 0); + + /* Truncation. */ + copy_name(buf, sizeof(buf), "(veryverylongnameabc)"); + CHECK(strlen(buf) == sizeof(buf) - 1); + + /* Fallback truncation: buf too small for "unknown" itself. */ + char tiny[4]; + copy_name(tiny, sizeof(tiny), ""); + CHECK(strcmp(tiny, "unk") == 0); +} + +static void test_tokenize(void) +{ + char line[] = "PRC host 1000 2026/01/01 12:00:00 600 123 (bash) S 100 10 20\n"; + char *toks[32]; + int n = tokenize_line(line, toks, 32); + CHECK(n == 12); + CHECK(strcmp(toks[0], "PRC") == 0); + CHECK(strcmp(toks[7], "(bash)") == 0); + CHECK(strcmp(toks[9], "100") == 0); /* HZ field atop inserts before utime */ + CHECK(strcmp(toks[10], "10") == 0); /* utime */ + CHECK(strcmp(toks[11], "20") == 0); /* stime */ + + /* Multi-word parenthesised name. */ + char line2[] = "PRM host 1000 d t 600 200 (Web Content) S 4096 1 2 0 0\n"; + char *t2[32]; + int n2 = tokenize_line(line2, t2, 32); + CHECK(n2 >= 12); + CHECK(strncmp(t2[7], "(Web Content)", 13) == 0); + + /* Empty / whitespace-only line. */ + char empty[] = " \n"; + char *t3[4]; + CHECK(tokenize_line(empty, t3, 4) == 0); + + /* Max-tokens cap respected. */ + char big[] = "a b c d e f g h i j k"; + char *t4[3]; + CHECK(tokenize_line(big, t4, 3) == 3); + + /* Unclosed paren at EOL — consumed to end. 
*/ + char unclosed[] = "(abc"; + char *t5[2]; + int n5 = tokenize_line(unclosed, t5, 2); + CHECK(n5 == 1); + CHECK(strcmp(t5[0], "(abc") == 0); +} + +static void test_process_and_emit(void) +{ + State *s = state_new(); + assert(s != NULL); + + /* Two PRC samples for PID 100: first utime+stime=30, last=100. + Delta should be 70. The "100" after the state is atop's HZ field. */ + char prc1[] = "PRC h 1000 d t 600 100 (cc1) S 100 10 20\n"; + char prc2[] = "PRC h 1600 d t 600 100 (cc1) S 100 70 30\n"; + process_line(prc1, s); + process_line(prc2, s); + + /* One PRM sample for PID 100: rss=4096 kB. */ + char prm1[] = "PRM h 1000 d t 600 100 (cc1) S 4096 100 4096 0 0\n"; + process_line(prm1, s); + + /* PRC sample for PID 200 seen only once → delta == last_ticks. */ + char prc3[] = "PRC h 1000 d t 600 200 (short) S 100 5 5\n"; + process_line(prc3, s); + + /* Header / separator / unknown label should be ignored. */ + char header[] = "# comment line\n"; + process_line(header, s); + char sep[] = "SEP\n"; + process_line(sep, s); + char other[] = "CPU h 1000 d t 600 0 0 0 0 0 0 0 0\n"; + process_line(other, s); + + /* Malformed: pid <= 0 (12 tokens so it passes the length guard and + actually reaches the pid check). */ + char bad_pid[] = "PRC h 1000 d t 600 0 (x) S 100 1 1\n"; + process_line(bad_pid, s); + + /* PRC short (< 12 tokens) should hit the shared length guard, not crash. */ + char prc_short[] = "PRC h 1000 d t 600 300 (y) S 1\n"; + process_line(prc_short, s); + + /* PRM short (< 12 tokens) hits the same shared length guard. */ + char prm_short[] = "PRM h 1000 d t 600 300 (y) S 4096 1\n"; + process_line(prm_short, s); + + /* Emit and sanity-check the output. */ + char *buf = NULL; + size_t sz = 0; + FILE *out = open_memstream(&buf, &sz); + assert(out != NULL); + emit_results(s, out); + fclose(out); + CHECK(strstr(buf, "W\t1000\t1600\t2\t600\n") != NULL); + /* cc1: cpu delta 70 (pid 100 two samples) + 0 pids column via max(cpu,ram). 
+ Peak RSS 4096, sum_avg 4096, rss_samples 1, pids max(1,1)=1. */ + CHECK(strstr(buf, "N\tcc1\t70\t4096\t4096\t1\t1\n") != NULL); + /* short: single-sample pid 200 → delta == 10; no RAM, so peak/sum/rss=0. */ + CHECK(strstr(buf, "N\tshort\t10\t0\t0\t0\t1\n") != NULL); + free(buf); + state_free(s); +} + +static void test_empty_and_single_epoch(void) +{ + State *s = state_new(); + /* No input at all → window line with zeroes. */ + char *buf = NULL; + size_t sz = 0; + FILE *out = open_memstream(&buf, &sz); + emit_results(s, out); + fclose(out); + CHECK(strstr(buf, "W\t0\t0\t0\t0\n") != NULL); + free(buf); + state_free(s); + + /* Exactly one epoch → median interval stays 0. */ + s = state_new(); + char prc[] = "PRC h 500 d t 600 50 (a) S 100 1 1\n"; + process_line(prc, s); + buf = NULL; + sz = 0; + out = open_memstream(&buf, &sz); + emit_results(s, out); + fclose(out); + CHECK(strstr(buf, "W\t500\t500\t1\t0\n") != NULL); + free(buf); + state_free(s); +} + +static void test_delta_clamped_to_zero(void) +{ + /* Counter reset: last < first → delta must clamp to 0. */ + State *s = state_new(); + char a[] = "PRC h 100 d t 600 77 (x) S 100 50 50\n"; + char b[] = "PRC h 700 d t 600 77 (x) S 100 10 10\n"; + process_line(a, s); + process_line(b, s); + char *buf = NULL; + size_t sz = 0; + FILE *out = open_memstream(&buf, &sz); + emit_results(s, out); + fclose(out); + CHECK(strstr(buf, "N\tx\t0\t") != NULL); + free(buf); + state_free(s); +} + +static void test_hash_collision(void) +{ + /* Force two PIDs into adjacent slots (Knuth hash rarely collides on + small integers, but we sweep a range to exercise the linear-probe + branch). 
*/ + State *s = state_new(); + for (int pid = 1; pid <= 2000; pid++) + { + char line[128]; + snprintf(line, sizeof(line), "PRC h 1000 d t 600 %d (p) S 100 1 1\n", pid); + process_line(line, s); + snprintf(line, sizeof(line), "PRM h 1000 d t 600 %d (p) S 4096 1 1 0 0\n", pid); + process_line(line, s); + } + state_free(s); +} + +static void test_state_free_null(void) +{ + /* Freeing NULL must be safe. */ + state_free(NULL); +} + +int main(void) +{ + test_copy_name(); + test_tokenize(); + test_process_and_emit(); + test_empty_and_single_epoch(); + test_delta_clamped_to_zero(); + test_hash_collision(); + test_state_free_null(); + if (failures > 0) + { + fprintf(stderr, "%d test failures\n", failures); + return 1; + } + printf("atop_agg tests: OK\n"); + return 0; +} diff --git a/linux_configuration/scripts/periodic_background/system-maintenance/bin/_usage_report_parsing.py b/linux_configuration/scripts/periodic_background/system-maintenance/bin/_usage_report_parsing.py index a6ab040..25639f0 100644 --- a/linux_configuration/scripts/periodic_background/system-maintenance/bin/_usage_report_parsing.py +++ b/linux_configuration/scripts/periodic_background/system-maintenance/bin/_usage_report_parsing.py @@ -25,10 +25,13 @@ from _usage_report_types import ( # atop parseable output layout (atop 2.x, same on Arch/Debian/Ubuntu): # 0 label, 1 host, 2 epoch, 3 YYYY/MM/DD, 4 HH:MM:SS, 5 interval_s, # then per-process fields starting at index 6. -# PRC per-proc: pid name(parens) state utime_ticks stime_ticks ... +# PRC per-proc: pid name(parens) state HZ utime_ticks stime_ticks ... +# NOTE: atop inserts its clock-tick rate (HZ) between `state` and `utime` +# (the PRC analogue of the pagesize field PRM inserts before its memory +# columns); utime/stime therefore live two and three slots past `state`. _PRC_PID_IDX = 6 _PRC_NAME_IDX = 7 -_PRC_MIN_LEN = 11 +_PRC_MIN_LEN = 12 # PRM per-proc: pid name state pagesz_b vsize_kb rsize_kb ... 
_PRM_PID_IDX = 6 _PRM_NAME_IDX = 7 @@ -61,13 +64,39 @@ def _run(cmd: list[str]) -> str: return proc.stdout -def _iter_atop_lines(log: Path, labels: str) -> Iterator[str]: +def _atop_read_cmd( + log: Path, + labels: str, + begin: str | None, + end: str | None, +) -> list[str]: + """Build an `atop -r` command, optionally bounded by begin/end times. + + *begin*/*end* are atop `-b`/`-e` arguments (`[YYYYMMDD]hhmm[ss]`) used to + restrict replay to a sub-window of the day's log, so a "since last report" + run does not double-count the part of the first day already reported. + """ + cmd = ["atop", "-r", str(log)] + if begin is not None: + cmd += ["-b", begin] + if end is not None: + cmd += ["-e", end] + cmd += ["-P", labels] + return cmd + + +def _iter_atop_lines( + log: Path, + labels: str, + begin: str | None = None, + end: str | None = None, +) -> Iterator[str]: """Stream `atop -r LOG -P LABELS` stdout line-by-line. Uses `Popen` so the report can show progress while atop is still decoding its binary log, rather than buffering the whole output. """ - cmd = ["atop", "-r", str(log), "-P", labels] + cmd = _atop_read_cmd(log, labels, begin, end) with subprocess.Popen( cmd, stdout=subprocess.PIPE, @@ -112,10 +141,13 @@ def _parse_prc(parts: list[str], pid_cpu: dict[int, _PidCpu]) -> None: except (ValueError, IndexError): return name, after = _parse_name(parts, _PRC_NAME_IDX) - # After name comes: state utime stime ... + # After name comes: state HZ utime stime ... (HZ is atop's clock-tick + # rate; skipping it is what keeps a constant 100 from being charged as + # CPU to every record — the bug that made cpu-seconds collapse to PID + # count for short-lived processes). 
try: - utime = int(parts[after + 1]) - stime = int(parts[after + 2]) + utime = int(parts[after + 2]) + stime = int(parts[after + 3]) except (ValueError, IndexError): return pid_cpu.setdefault(pid, _PidCpu()).observe(name, utime + stime) @@ -153,6 +185,8 @@ def _window_from_epochs(epochs: set[int]) -> _Window: distinct_samples=len(ordered), interval_s=interval, seconds=ordered[-1] - ordered[0], + start_epoch=ordered[0], + end_epoch=ordered[-1], ) @@ -163,12 +197,18 @@ def _atop_agg_binary() -> Path | None: is unavailable, in which case callers use the pure-Python parser. """ src_c = _ATOP_AGG_SRC_DIR / "atop_agg.c" - if _ATOP_AGG_CACHE_BIN.exists() and ( - not src_c.exists() - or src_c.stat().st_mtime <= _ATOP_AGG_CACHE_BIN.stat().st_mtime + if not src_c.exists(): + # Source tree is gone (relocated/extracted): never trust an orphaned + # cached binary whose provenance we can no longer verify against + # source — a stale build can silently carry parsing bugs. Fall back to + # the pure-Python parser instead. 
+ return None + if ( + _ATOP_AGG_CACHE_BIN.exists() + and src_c.stat().st_mtime <= _ATOP_AGG_CACHE_BIN.stat().st_mtime ): return _ATOP_AGG_CACHE_BIN - if not src_c.exists() or shutil.which("cc") is None: + if shutil.which("cc") is None: return None _ATOP_AGG_CACHE_BIN.parent.mkdir(parents=True, exist_ok=True) make_cmd = ["make", "-s", "-C", str(_ATOP_AGG_SRC_DIR), "atop_agg"] @@ -218,6 +258,8 @@ def _window_from_native(parts: list[str]) -> _Window: distinct_samples=n_epochs, interval_s=int(interval_s), seconds=end_epoch - start_epoch, + start_epoch=start_epoch, + end_epoch=end_epoch, ) @@ -225,12 +267,14 @@ def _aggregate_atop_native( log: Path, progress: _Progress, binary: Path, + begin: str | None = None, + end: str | None = None, ) -> tuple[dict[str, ProcAgg], _Window]: """Aggregate via `atop | atop_agg`; return `(by_name, window)`.""" progress.start_stage("atop: parse PRC+PRM (native)") agg_map: dict[str, ProcAgg] = {} window = _Window() - atop_cmd = ["atop", "-r", str(log), "-P", "PRC,PRM"] + atop_cmd = _atop_read_cmd(log, "PRC,PRM", begin, end) agg_cmd = [str(binary)] with ( subprocess.Popen( @@ -265,16 +309,21 @@ def _aggregate_atop_native( def aggregate_atop( log: Path, progress: _Progress, + begin: str | None = None, + end: str | None = None, ) -> tuple[dict[str, ProcAgg], _Window]: """Stream PRC+PRM records, fold them into `{name: ProcAgg}`, return window. Prefers the native `atop_agg` C helper (auto-built into ``~/.cache/usage_report/``) for ~7x speedup on full-day logs, falling back to an inline Python parser when the helper is unavailable. + + *begin*/*end* are optional atop `-b`/`-e` arguments that bound replay to a + sub-window of the day's log (used by the "since last report" mode). 
""" binary = _atop_agg_binary() if binary is not None: - return _aggregate_atop_native(log, progress, binary) + return _aggregate_atop_native(log, progress, binary, begin, end) progress.start_stage("atop: parse PRC+PRM") pid_cpu: dict[int, _PidCpu] = {} pid_ram: dict[int, _PidRam] = {} @@ -285,7 +334,7 @@ def aggregate_atop( # 10-min-interval log. The fraction is only used for the progress bar, # so a rough calibration is fine; it caps at 99% if we underestimate. est_total_bytes = log_size * 11 or 1 - for raw in _iter_atop_lines(log, "PRC,PRM"): + for raw in _iter_atop_lines(log, "PRC,PRM", begin, end): bytes_seen += len(raw) + 1 if not raw or raw[0] == "#" or raw.startswith("RESET") or raw == "SEP": continue @@ -365,11 +414,33 @@ def _pid_comm_name(pid: int) -> str | None: return Path(comm).name if comm else None +def _pmon_row_epoch(parts: list[str]) -> float | None: + """Local-time epoch of a pmon row from its `date`/`time` columns, or None. + + pmon timestamps are naive local time (`YYYYMMDD HH:MM:SS`); `.astimezone()` + attaches the local offset so the result is comparable to a `begin_epoch` + derived the same way. + """ + try: + stamp = _dt.datetime.strptime( + f"{parts[0]} {parts[1]}", + "%Y%m%d %H:%M:%S", + ).astimezone() + except (ValueError, IndexError): + return None + return stamp.timestamp() + + def aggregate_pmon( log: Path, progress: _Progress, + begin_epoch: float | None = None, ) -> tuple[dict[str, GpuAgg], int]: - """Return `({program: GpuAgg}, sample_count)` from the pmon *log*.""" + """Return `({program: GpuAgg}, sample_count)` from the pmon *log*. + + When *begin_epoch* is set, rows timestamped before it are skipped so the + first day of a "since last report" window starts at the previous run time. 
+ """ progress.start_stage("pmon log scan") agg: dict[str, GpuAgg] = {} samples = 0 @@ -385,6 +456,10 @@ def aggregate_pmon( parts = _pmon_fields(line) if parts is None or len(parts) < _PMON_MIN_FIELDS: continue + if begin_epoch is not None: + row_epoch = _pmon_row_epoch(parts) + if row_epoch is not None and row_epoch < begin_epoch: + continue samples += _ingest_pmon_row(parts, agg) progress.update(1.0) return agg, samples @@ -414,3 +489,56 @@ def _ingest_pmon_row(parts: list[str], agg: dict[str, GpuAgg]) -> int: entry.peak_sm_pct = max(entry.peak_sm_pct, sm) entry.peak_mem_pct = max(entry.peak_mem_pct, mem) return 1 + + +def merge_proc_aggs(dst: dict[str, ProcAgg], src: dict[str, ProcAgg]) -> None: + """Fold one day's CPU/RAM aggregates (*src*) into the running *dst*. + + CPU-seconds and RSS sample counts add across days; peak RSS is the max; + PID counts add (each day contributes its own distinct PIDs). + """ + for name, item in src.items(): + entry = dst.setdefault(name, ProcAgg(name=name)) + entry.cpu_ticks += item.cpu_ticks + entry.peak_rss_kb = max(entry.peak_rss_kb, item.peak_rss_kb) + entry.rss_kb_sum += item.rss_kb_sum + entry.rss_samples += item.rss_samples + entry.extra_pids += item.pid_count + + +def merge_gpu_aggs(dst: dict[str, GpuAgg], src: dict[str, GpuAgg]) -> None: + """Fold one day's GPU aggregates (*src*) into the running *dst*.""" + for name, item in src.items(): + entry = dst.setdefault(name, GpuAgg(name=name)) + entry.sm_pct_sum += item.sm_pct_sum + entry.mem_pct_sum += item.mem_pct_sum + entry.samples += item.samples + entry.peak_sm_pct = max(entry.peak_sm_pct, item.peak_sm_pct) + entry.peak_mem_pct = max(entry.peak_mem_pct, item.peak_mem_pct) + entry.extra_pids += item.pid_count + + +def merge_windows(windows: list[_Window]) -> _Window: + """Combine per-day coverage *windows* into one spanning window. 
+ + Start/end span the earliest and latest samples; ``seconds`` sums the + per-day coverage (not wall-clock end-start) so the denominator for average + CPU% reflects only the time actually monitored, excluding gap days. + """ + real = [w for w in windows if w.distinct_samples] + if not real: + return _Window() + first = min(real, key=lambda w: w.start_epoch) + last = max(real, key=lambda w: w.end_epoch) + intervals = [w.interval_s for w in real if w.interval_s] + # Representative interval = the most common per-day interval, if any. + interval = max(set(intervals), key=intervals.count) if intervals else 0 + return _Window( + start=first.start, + end=last.end, + distinct_samples=sum(w.distinct_samples for w in real), + interval_s=interval, + seconds=sum(w.seconds for w in real), + start_epoch=first.start_epoch, + end_epoch=last.end_epoch, + ) diff --git a/linux_configuration/scripts/periodic_background/system-maintenance/bin/_usage_report_types.py b/linux_configuration/scripts/periodic_background/system-maintenance/bin/_usage_report_types.py index ebc2418..9386e6a 100644 --- a/linux_configuration/scripts/periodic_background/system-maintenance/bin/_usage_report_types.py +++ b/linux_configuration/scripts/periodic_background/system-maintenance/bin/_usage_report_types.py @@ -79,6 +79,15 @@ class ProcAgg: rss_kb_sum: int = 0 rss_samples: int = 0 pid_set: set[int] = field(default_factory=set) + # PID counts folded in when merging per-day aggregates. Tracked as a plain + # integer (not by extending `pid_set`) because the native parser stores a + # synthetic `range(n)` set whose union across days would collapse counts. 
+ extra_pids: int = 0 + + @property + def pid_count(self) -> int: + """Distinct PIDs seen, including those merged from other day windows.""" + return len(self.pid_set) + self.extra_pids @property def cpu_seconds(self) -> float: @@ -109,6 +118,13 @@ class GpuAgg: peak_sm_pct: float = 0.0 peak_mem_pct: float = 0.0 pid_set: set[int] = field(default_factory=set) + # PID counts folded in when merging per-day aggregates (see ProcAgg). + extra_pids: int = 0 + + @property + def pid_count(self) -> int: + """Distinct PIDs seen, including those merged from other day windows.""" + return len(self.pid_set) + self.extra_pids @property def gpu_seconds(self) -> float: @@ -190,3 +206,7 @@ class _Window: distinct_samples: int = 0 interval_s: int = 0 seconds: int = 0 + # Raw epoch bounds, kept so multiple per-day windows can be merged by + # min(start)/max(end) without re-parsing the ISO strings above. + start_epoch: int = 0 + end_epoch: int = 0 diff --git a/linux_configuration/scripts/periodic_background/system-maintenance/bin/usage_report.py b/linux_configuration/scripts/periodic_background/system-maintenance/bin/usage_report.py index 8f7cf2e..2388ecb 100755 --- a/linux_configuration/scripts/periodic_background/system-maintenance/bin/usage_report.py +++ b/linux_configuration/scripts/periodic_background/system-maintenance/bin/usage_report.py @@ -1,19 +1,24 @@ #!/usr/bin/env python3 -"""End-of-day resource usage report from atop + nvidia-smi pmon logs. +"""Resource usage report from atop + nvidia-smi pmon logs. -Parses the current-day (or given) `atop` binary log via `atop -P PRC,PRM -r` -and the per-process nvidia-smi pmon log, aggregates CPU seconds, peak/average -RSS, and GPU SM-% seconds per program, and prints a compact Markdown report -intended to be pasted into an LLM (Claude / Copilot) for further analysis. 
+Parses one or more daily `atop` binary logs via `atop -P PRC,PRM -r` and the +per-process nvidia-smi pmon logs, aggregates CPU seconds, peak/average RSS, and +GPU SM-% seconds per program, and prints a compact Markdown report intended to +be pasted into an LLM (Claude / Copilot) for further analysis. -Run with no arguments to report on today's logs: +Run with no arguments to report on **everything since the last report**: the +previous run's timestamp is persisted, and each run covers the whole window +from then until now, spanning as many daily logs as needed (so skipped days are +never lost). After a successful report the timestamp is advanced to "now". - usage_report.py # today - usage_report.py --date 20260419 # specific day + usage_report.py # since the last report (multi-day) + usage_report.py --since 20260419 # ad hoc: from a date to now, no state + usage_report.py --date 20260419 # one specific day (ad hoc, no state) usage_report.py --top 20 # keep 20 rows per table + usage_report.py --no-update-state # don't advance the saved timestamp usage_report.py > report.md # redirect to a file -The output intentionally front-loads metadata (hostname, window, sample +The output intentionally front-loads metadata (hostname, period, window, sample count, HZ, machine specs) so the LLM never has to guess context. 
""" @@ -21,7 +26,9 @@ from __future__ import annotations import argparse from collections import defaultdict +from dataclasses import dataclass import datetime as _dt +import json import os from pathlib import Path import platform @@ -34,7 +41,14 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: from collections.abc import Iterable -from _usage_report_parsing import _run, aggregate_atop, aggregate_pmon +from _usage_report_parsing import ( + _run, + aggregate_atop, + aggregate_pmon, + merge_gpu_aggs, + merge_proc_aggs, + merge_windows, +) from _usage_report_types import ( _HZ, _PMON_INTERVAL_S, @@ -52,6 +66,12 @@ _SEC_PER_DAY = 86_400 _SEC_PER_HOUR = 3600 _SEC_PER_MIN = 60 +# Persisted marker of when the last report was generated. Lives under +# ~/.local/share (durable app state), not ~/.cache, so clearing caches does not +# silently reset the "since last report" window back to today-only. +_STATE_DIR = Path.home() / ".local/share/usage_report" +_STATE_FILE = _STATE_DIR / "last_report.json" + def _host_profile() -> dict[str, str]: """Collect a small bag of identifying facts about the host.""" @@ -127,7 +147,7 @@ def _cpu_table(aggs: Iterable[ProcAgg], window_s: int, top: int) -> list[str]: f"{idx} | {_md_escape(item.name)} | " f"{item.cpu_seconds:,.0f}s ({_fmt_h(item.cpu_seconds)}) | " f"{single:.1f}% | {box:.1f}% | " - f"{item.peak_rss_mb:,.0f} MiB | {len(item.pid_set)} |", + f"{item.peak_rss_mb:,.0f} MiB | {item.pid_count} |", ) return rows @@ -151,7 +171,7 @@ def _dedupe_ram(aggs: Iterable[ProcAgg]) -> list[tuple[ProcAgg, list[str]]]: buckets[key].append(item) result: list[tuple[ProcAgg, list[str]]] = [] for bucket in buckets.values(): - bucket.sort(key=lambda a: (a.cpu_ticks, len(a.pid_set)), reverse=True) + bucket.sort(key=lambda a: (a.cpu_ticks, a.pid_count), reverse=True) rep = bucket[0] siblings = [b.name for b in bucket[1:]] result.append((rep, siblings)) @@ -186,7 +206,7 @@ def _ram_table(aggs: Iterable[ProcAgg], top: int) -> list[str]: 
f"{item.peak_rss_mb:,.0f} MiB | " f"{item.avg_rss_mb:,.0f} MiB | " f"{item.cpu_seconds:,.0f}s | " - f"{len(item.pid_set)} | {sib} |", + f"{item.pid_count} | {sib} |", ) return rows @@ -212,7 +232,7 @@ def _gpu_table(aggs: dict[str, GpuAgg], total_samples: int, top: int) -> list[st f"{item.peak_sm_pct:.0f}% | " f"{item.peak_mem_pct:.0f}% | " f"{item.samples} ({presence:.0f}%) | " - f"{len(item.pid_set)} |", + f"{item.pid_count} |", ) return rows @@ -227,11 +247,15 @@ def _fingerprint_section() -> list[str]: ] -def _methodology_section(atop_log: Path, pmon_log: Path, window: _Window) -> list[str]: +def _methodology_section( + atop_desc: str, + pmon_desc: str, + window: _Window, +) -> list[str]: window_note = ( f"- **Coverage window**: {_fmt_h(window.seconds)} " - f"(from first to last atop sample; window may be shorter than wall " - f"clock since the next atop tick has not yet fired)." + f"(sum of per-day atop coverage from first to last sample; excludes " + f"any gap days where atop was not logging, and the final partial tick)." ) interval_note = ( f"- **atop sample interval (observed)**: {window.interval_s}s" @@ -266,8 +290,8 @@ def _methodology_section(atop_log: Path, pmon_log: Path, window: _Window) -> lis return [ "## Methodology", "", - f"- **atop log**: `{atop_log}` (binary, replay with `atop -r`)", - f"- **pmon log**: `{pmon_log}` (`nvidia-smi pmon -d {_PMON_INTERVAL_S}`)", + f"- **atop log(s)**: {atop_desc}", + f"- **pmon log(s)**: {pmon_desc}", f"- **HZ**: {_HZ} ticks/s; **page size**: {_PAGE_KB} KiB", window_note, interval_note, @@ -293,34 +317,60 @@ def _compute_window(atop_log: Path, progress: _Progress) -> _Window: _LLM_PROMPT = [ - "> Below is a day's worth of aggregated resource usage for my Linux workstation.", - "> Identify which programs are the biggest hogs, flag anything that looks abnormal", - "> for a typical developer/gaming setup, and suggest concrete optimisations", - "> (config tweaks, process limits, alternative tools). 
Be specific.", + "> Below is aggregated resource usage for my Linux workstation over the", + "> reporting period shown above. Identify which programs are the biggest", + "> hogs, flag anything that looks abnormal for a typical developer/gaming", + "> setup, and suggest concrete optimisations (config tweaks, process limits,", + "> alternative tools). Be specific.", ] _REPORT_STAGES = 2 -def _build_report( - args: argparse.Namespace, - atop_log: Path, - pmon_log: Path, -) -> str: - progress = _Progress( - enabled=not args.quiet, - total_stages=_REPORT_STAGES, - ) - cpu_aggs, window = aggregate_atop(atop_log, progress) - if not window.seconds: - window.seconds = _SEC_PER_DAY - gpu_aggs, gpu_samples = aggregate_pmon(pmon_log, progress) - progress.finish() +@dataclass +class _Segment: + """One calendar day's resolved logs plus optional in-day start bounds. + *atop_begin* is an atop ``-b`` argument (``YYYYMMDDhhmmss``) and + *pmon_begin_epoch* the matching local epoch; both are set only for the first + day of a "since last report" window so re-runs do not double-count. + """ + + atop_log: Path + pmon_log: Path + atop_begin: str | None = None + pmon_begin_epoch: float | None = None + + +@dataclass +class _Aggregates: + """Merged CPU/GPU aggregates and coverage window for a reporting window. + + *days_with_data* is the number of daily logs that actually yielded atop + samples (gap days where the machine was off contribute nothing). 
+ """ + + cpu: dict[str, ProcAgg] + gpu: dict[str, GpuAgg] + window: _Window + gpu_samples: int + days_with_data: int + + +def _render_report( + aggs: _Aggregates, + *, + top: int, + atop_desc: str, + pmon_desc: str, + period_line: str, +) -> str: + """Assemble the Markdown report from already-aggregated data.""" + window = aggs.window gpu_section = ( - _gpu_table(gpu_aggs, gpu_samples, args.top) - if gpu_aggs + _gpu_table(aggs.gpu, aggs.gpu_samples, top) + if aggs.gpu else ["_No GPU pmon data found._"] ) generated = _dt.datetime.now().astimezone().isoformat(timespec="seconds") @@ -329,20 +379,21 @@ def _build_report( "# System resource usage report", "", f"- **Generated**: {generated}", + period_line, f"- **atop window**: {window.start} \u2192 {window.end}", f"- **atop samples**: {window.distinct_samples} distinct " f"timestamps (sample interval \u2248 {interval})", - f"- **GPU pmon samples**: {gpu_samples} (\u2248{_PMON_INTERVAL_S}s each)", + f"- **GPU pmon samples**: {aggs.gpu_samples} (\u2248{_PMON_INTERVAL_S}s each)", "", *_fingerprint_section(), - *_methodology_section(atop_log, pmon_log, window), + *_methodology_section(atop_desc, pmon_desc, window), "## Top CPU consumers", "", - *_cpu_table(cpu_aggs.values(), window.seconds, args.top), + *_cpu_table(aggs.cpu.values(), window.seconds, top), "", "## Top RAM consumers (by peak RSS, deduped by shared-memory bucket)", "", - *_ram_table(cpu_aggs.values(), args.top), + *_ram_table(aggs.cpu.values(), top), "", "## Top GPU consumers", "", @@ -356,12 +407,117 @@ def _build_report( return "\n".join(lines) + "\n" +def _aggregate_segments( + segments: list[_Segment], + progress: _Progress, +) -> _Aggregates: + """Aggregate and merge every existing daily log in *segments*. + + Missing daily logs (gap days) are skipped silently. 
+ """ + cpu_total: dict[str, ProcAgg] = {} + gpu_total: dict[str, GpuAgg] = {} + windows: list[_Window] = [] + gpu_samples = 0 + days_with_data = 0 + for seg in segments: + if seg.atop_log.exists(): + cpu, window = aggregate_atop(seg.atop_log, progress, seg.atop_begin) + merge_proc_aggs(cpu_total, cpu) + if window.distinct_samples: + windows.append(window) + days_with_data += 1 + gpu, samples = aggregate_pmon(seg.pmon_log, progress, seg.pmon_begin_epoch) + merge_gpu_aggs(gpu_total, gpu) + gpu_samples += samples + return _Aggregates( + cpu_total, + gpu_total, + merge_windows(windows), + gpu_samples, + days_with_data, + ) + + +def _describe_logs(paths: list[Path], how: str) -> str: + """One-line Markdown description of the log files actually consumed.""" + if not paths: + return f"_none found_ (`{how}`)" + if len(paths) == 1: + return f"`{paths[0]}` (`{how}`)" + return ( + f"{len(paths)} daily logs `{paths[0].name}` \u2026 `{paths[-1].name}` " + f"in `{paths[0].parent}` (`{how}`)" + ) + + +def _log_descriptions(segments: list[_Segment]) -> tuple[str, str]: + """Return ``(atop_desc, pmon_desc)`` for the logs present in *segments*.""" + atop_present = [seg.atop_log for seg in segments if seg.atop_log.exists()] + pmon_present = [seg.pmon_log for seg in segments if seg.pmon_log.exists()] + return ( + _describe_logs(atop_present, "atop -r"), + _describe_logs(pmon_present, f"nvidia-smi pmon -d {_PMON_INTERVAL_S}"), + ) + + def _resolve_logs(date: str) -> tuple[Path, Path]: atop_log = _ATOP_LOG_DIR / f"atop_{date}" pmon_log = _PMON_LOG_DIR / f"pmon-{date}.log" return atop_log, pmon_log +def _read_last_generated() -> _dt.datetime | None: + """Return the timestamp of the previous report run, or None if unknown.""" + try: + raw = _STATE_FILE.read_text(encoding="utf-8") + except OSError: + return None + try: + stamp = json.loads(raw)["last_generated"] + return _dt.datetime.fromisoformat(stamp).astimezone() + except (ValueError, KeyError, TypeError): + return None + + +def 
_write_last_generated(when: _dt.datetime) -> None: + """Persist *when* as the last-report timestamp for the next run.""" + _STATE_DIR.mkdir(parents=True, exist_ok=True) + payload = json.dumps({"last_generated": when.isoformat(timespec="seconds")}) + _STATE_FILE.write_text(payload + "\n", encoding="utf-8") + + +def _has_time_of_day(when: _dt.datetime) -> bool: + """True when *when* is past local midnight, so a begin bound is needed.""" + return bool(when.hour or when.minute or when.second or when.microsecond) + + +def _plan_segments(start: _dt.datetime, end: _dt.datetime) -> list[_Segment]: + """Resolve one `_Segment` per calendar day across ``[start, end]``. + + The first day is bounded at *start*'s time-of-day so a same-day re-run only + covers the slice since the previous report; later days are covered in full. + Returns an empty list when *start* is after *end* (e.g. a future state file). + """ + segments: list[_Segment] = [] + day = start.date() + while day <= end.date(): + atop_log, pmon_log = _resolve_logs(day.strftime("%Y%m%d")) + if day == start.date() and _has_time_of_day(start): + segments.append( + _Segment( + atop_log, + pmon_log, + start.strftime("%Y%m%d%H%M%S"), + start.timestamp(), + ), + ) + else: + segments.append(_Segment(atop_log, pmon_log)) + day += _dt.timedelta(days=1) + return segments + + _INSTALL_SCRIPT = Path(__file__).with_name("install_usage_monitoring.sh") @@ -406,13 +562,126 @@ def _copy_to_clipboard(text: str) -> None: ) -def main(argv: list[str] | None = None) -> int: - """Entry point; see module docstring for CLI.""" +def _emit(args: argparse.Namespace, report: str) -> None: + """Write the report to stdout and (unless suppressed) the clipboard.""" + sys.stdout.write(report) + if not args.no_clipboard: + _copy_to_clipboard(report) + + +def _period_line(start: _dt.datetime, end: _dt.datetime) -> str: + """Markdown bullet describing the requested reporting period.""" + span = _fmt_h(max((end - start).total_seconds(), 0.0)) + return ( + 
f"- **Reporting period**: {start.isoformat(timespec='seconds')} → " + f"{end.isoformat(timespec='seconds')} ({span})" + ) + + +def _is_single_day_mode(args: argparse.Namespace) -> bool: + """True when the user pinned an exact day or explicit log paths.""" + return ( + args.date is not None or args.atop_log is not None or args.pmon_log is not None + ) + + +def _should_advance_state(args: argparse.Namespace) -> bool: + """Advance the saved timestamp only for genuine since-last-report runs. + + An explicit ``--since`` is treated as a read-only ad-hoc query (like + ``--date``) so "let me look from date X" never silently re-baselines the + saved tracking point. + """ + return args.since is None and not args.no_update_state + + +def _run_single_day(args: argparse.Namespace, now: _dt.datetime) -> int: + """Report on one specific day (legacy behaviour); never touches state.""" + date = args.date or now.strftime("%Y%m%d") + atop_default, pmon_default = _resolve_logs(date) + atop_log = args.atop_log or atop_default + pmon_log = args.pmon_log or pmon_default + _preflight(atop_log) + segment = _Segment(atop_log, pmon_log) + progress = _Progress(enabled=not args.quiet, total_stages=_REPORT_STAGES) + aggs = _aggregate_segments([segment], progress) + progress.finish() + if not aggs.window.seconds: + aggs.window.seconds = _SEC_PER_DAY + atop_desc, pmon_desc = _log_descriptions([segment]) + _emit( + args, + _render_report( + aggs, + top=args.top, + atop_desc=atop_desc, + pmon_desc=pmon_desc, + period_line=f"- **Reporting period**: {date} (single day)", + ), + ) + return 0 + + +def _resolve_start(args: argparse.Namespace, now: _dt.datetime) -> _dt.datetime: + """Pick the window start: --since, else last report, else today midnight.""" + if args.since is not None: + return _dt.datetime.strptime(args.since, "%Y%m%d").astimezone() + last = _read_last_generated() + if last is not None: + return last + return now.replace(hour=0, minute=0, second=0, microsecond=0) + + +def 
_run_since(args: argparse.Namespace, now: _dt.datetime) -> int: + """Report on everything since the last run, spanning multiple daily logs.""" + if not shutil.which("atop"): + sys.exit(f"error: `atop` is not installed.\nrun: {_INSTALL_SCRIPT}") + start = _resolve_start(args, now) + segments = _plan_segments(start, now) + progress = _Progress( + enabled=not args.quiet, + total_stages=max(2 * len(segments), 1), + ) + aggs = _aggregate_segments(segments, progress) + progress.finish() + if aggs.days_with_data == 0: + sys.stderr.write( + f"no atop logs with data for {start.date()} … {now.date()}; " + "nothing to report.\n", + ) + if _should_advance_state(args): + _write_last_generated(now) + return 0 + if not aggs.window.seconds: + aggs.window.seconds = _SEC_PER_DAY + atop_desc, pmon_desc = _log_descriptions(segments) + _emit( + args, + _render_report( + aggs, + top=args.top, + atop_desc=atop_desc, + pmon_desc=pmon_desc, + period_line=_period_line(start, now), + ), + ) + if _should_advance_state(args): + _write_last_generated(now) + return 0 + + +def _build_parser() -> argparse.ArgumentParser: + """Construct the command-line argument parser.""" parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--date", - default=_dt.datetime.now().astimezone().strftime("%Y%m%d"), - help="YYYYMMDD to report on (default: today)", + default=None, + help="report on one specific day (YYYYMMDD); ad hoc, ignores state", + ) + parser.add_argument( + "--since", + default=None, + help="ad-hoc: report from this date (YYYYMMDD) to now; leaves state", ) parser.add_argument( "--top", @@ -424,35 +693,39 @@ def main(argv: list[str] | None = None) -> int: "--atop-log", type=Path, default=None, - help="override atop log path", + help="override atop log path (implies single-day mode)", ) parser.add_argument( "--pmon-log", type=Path, default=None, - help="override pmon log path", + help="override pmon log path (implies single-day mode)", ) parser.add_argument( "--no-clipboard", 
action="store_true", help="skip copying the report to the X clipboard", ) + parser.add_argument( + "--no-update-state", + action="store_true", + help="do not advance the saved last-report timestamp", + ) parser.add_argument( "--quiet", action="store_true", help="suppress the progress line on stderr", ) - args = parser.parse_args(argv) + return parser - atop_default, pmon_default = _resolve_logs(args.date) - atop_log = args.atop_log or atop_default - pmon_log = args.pmon_log or pmon_default - _preflight(atop_log) - report = _build_report(args, atop_log, pmon_log) - sys.stdout.write(report) - if not args.no_clipboard: - _copy_to_clipboard(report) - return 0 + +def main(argv: list[str] | None = None) -> int: + """Entry point; see module docstring for CLI.""" + args = _build_parser().parse_args(argv) + now = _dt.datetime.now().astimezone() + if _is_single_day_mode(args): + return _run_single_day(args, now) + return _run_since(args, now) if __name__ == "__main__": diff --git a/linux_configuration/tests/conftest.py b/linux_configuration/tests/conftest.py new file mode 100644 index 0000000..29ae2e5 --- /dev/null +++ b/linux_configuration/tests/conftest.py @@ -0,0 +1,21 @@ +"""Pytest bootstrap: make usage_report's ``bin/`` importable for these tests. + +The usage-report modules live in a non-package script directory and use +absolute imports (``from _usage_report_parsing import ...``), so the directory +must be on ``sys.path`` before the tests import them. 
+""" + +from __future__ import annotations + +from pathlib import Path +import sys + +_BIN = ( + Path(__file__).resolve().parents[1] + / "scripts" + / "periodic_background" + / "system-maintenance" + / "bin" +) +if str(_BIN) not in sys.path: + sys.path.insert(0, str(_BIN)) diff --git a/linux_configuration/tests/test_usage_report_pmon_names.py b/linux_configuration/tests/test_usage_report_pmon_names.py index 7905a09..eb22bcc 100644 --- a/linux_configuration/tests/test_usage_report_pmon_names.py +++ b/linux_configuration/tests/test_usage_report_pmon_names.py @@ -2,42 +2,26 @@ from __future__ import annotations -import importlib.util -from pathlib import Path -import sys from typing import TYPE_CHECKING +import _usage_report_parsing as parsing + if TYPE_CHECKING: import pytest -MODULE_PATH = ( - Path(__file__).resolve().parents[1] - / "scripts" - / "system-maintenance" - / "bin" - / "usage_report.py" -) -SPEC = importlib.util.spec_from_file_location("usage_report", MODULE_PATH) -if SPEC is None or SPEC.loader is None: - msg = "could not load usage_report module" - raise RuntimeError(msg) -usage_report = importlib.util.module_from_spec(SPEC) -sys.modules[SPEC.name] = usage_report -SPEC.loader.exec_module(usage_report) - def test_normalize_pmon_command_prefers_first_executable_token() -> None: """The parser should keep executable-like token, not trailing args.""" tokens = ["code-insiders", "--type=", "gpu-process", "Not"] - assert usage_report._normalize_pmon_command(tokens) == "code-insiders" + assert parsing._normalize_pmon_command(tokens) == "code-insiders" def test_normalize_pmon_command_skips_leading_option_tokens() -> None: """If the first token is an option, use the next non-option token.""" tokens = ["--type=", "code-insiders", "--flag"] - assert usage_report._normalize_pmon_command(tokens) == "code-insiders" + assert parsing._normalize_pmon_command(tokens) == "code-insiders" def test_ingest_pmon_row_uses_command_field_start_not_last_token() -> None: @@ -60,7 
+44,7 @@ def test_ingest_pmon_row_uses_command_field_start_not_last_token() -> None: ] agg: dict[str, object] = {} - consumed = usage_report._ingest_pmon_row(row, agg) + consumed = parsing._ingest_pmon_row(row, agg) assert consumed == 1 assert "code-insiders" in agg @@ -85,8 +69,8 @@ def test_ingest_pmon_row_falls_back_to_proc_comm_on_unknown( ] agg: dict[str, object] = {} - monkeypatch.setattr(usage_report, "_pid_comm_name", lambda _pid: "python") - consumed = usage_report._ingest_pmon_row(row, agg) + monkeypatch.setattr(parsing, "_pid_comm_name", lambda _pid: "python") + consumed = parsing._ingest_pmon_row(row, agg) assert consumed == 1 assert "python" in agg diff --git a/linux_configuration/tests/test_usage_report_since.py b/linux_configuration/tests/test_usage_report_since.py new file mode 100644 index 0000000..8246402 --- /dev/null +++ b/linux_configuration/tests/test_usage_report_since.py @@ -0,0 +1,479 @@ +"""Tests for the "since last report" multi-day aggregation in usage_report. + +Covers the helpers added to span and merge several daily logs: aggregate +merging, window merging, PID-count carry-over, pmon timestamp filtering, +atop command bounding, the persisted last-report state, day-segment planning, +and the run-mode dispatch logic. +""" + +from __future__ import annotations + +import argparse +import datetime as _dt +from pathlib import Path +from typing import TYPE_CHECKING + +import _usage_report_parsing as parsing +from _usage_report_types import GpuAgg, ProcAgg, _PidCpu, _Progress, _Window +import usage_report + +if TYPE_CHECKING: + import pytest + +# Aware timezone matching how the parser localizes naive timestamps, so epochs +# computed here line up with `_pmon_row_epoch`'s `.astimezone()` conversion. 
+_LOCAL_TZ = _dt.datetime.now().astimezone().tzinfo + + +def _at( + year: int, month: int, day: int, hour: int = 0, minute: int = 0 +) -> _dt.datetime: + """Build an aware local datetime for tests.""" + return _dt.datetime(year, month, day, hour, minute, tzinfo=_LOCAL_TZ) + + +# --------------------------------------------------------------------------- # +# PID-count carry-over (types) +# --------------------------------------------------------------------------- # +def test_proc_pid_count_combines_set_and_extra() -> None: + """`pid_count` adds the live set length and merged-in extras.""" + agg = ProcAgg("x", pid_set={1, 2, 3}, extra_pids=2) + + assert agg.pid_count == 5 + + +def test_gpu_pid_count_combines_set_and_extra() -> None: + """GpuAgg exposes the same combined PID count.""" + agg = GpuAgg("x", pid_set={9}, extra_pids=4) + + assert agg.pid_count == 5 + + +# --------------------------------------------------------------------------- # +# Aggregate merging (parsing) +# --------------------------------------------------------------------------- # +def test_merge_proc_aggs_sums_and_takes_peak() -> None: + """CPU/RSS sums accumulate, peak RSS is the max, PID counts add.""" + dst: dict[str, ProcAgg] = {} + parsing.merge_proc_aggs( + dst, + { + "a": ProcAgg( + "a", + cpu_ticks=100, + peak_rss_kb=200, + rss_kb_sum=50, + rss_samples=2, + pid_set={1, 2}, + ) + }, + ) + parsing.merge_proc_aggs( + dst, + { + "a": ProcAgg( + "a", + cpu_ticks=10, + peak_rss_kb=500, + rss_kb_sum=5, + rss_samples=1, + pid_set={3}, + ) + }, + ) + + entry = dst["a"] + assert entry.cpu_ticks == 110 + assert entry.peak_rss_kb == 500 + assert entry.rss_kb_sum == 55 + assert entry.rss_samples == 3 + assert entry.pid_count == 3 + + +def test_merge_gpu_aggs_sums_and_takes_peak() -> None: + """GPU sample sums accumulate and peaks take the max across days.""" + dst: dict[str, GpuAgg] = {} + parsing.merge_gpu_aggs( + dst, + { + "g": GpuAgg( + "g", + sm_pct_sum=30.0, + mem_pct_sum=10.0, + samples=3, 
+ peak_sm_pct=40.0, + peak_mem_pct=20.0, + pid_set={1}, + ) + }, + ) + parsing.merge_gpu_aggs( + dst, + { + "g": GpuAgg( + "g", + sm_pct_sum=5.0, + mem_pct_sum=2.0, + samples=1, + peak_sm_pct=80.0, + peak_mem_pct=15.0, + pid_set={2, 3}, + ) + }, + ) + + entry = dst["g"] + assert entry.sm_pct_sum == 35.0 + assert entry.samples == 4 + assert entry.peak_sm_pct == 80.0 + assert entry.peak_mem_pct == 20.0 + assert entry.pid_count == 3 + + +# --------------------------------------------------------------------------- # +# Window merging (parsing) +# --------------------------------------------------------------------------- # +def test_merge_windows_empty_returns_default() -> None: + """Merging no real windows yields the empty default window.""" + assert parsing.merge_windows([]).distinct_samples == 0 + assert parsing.merge_windows([_Window()]).distinct_samples == 0 + + +def test_merge_windows_spans_and_sums() -> None: + """Span uses min start / max end; samples and seconds sum; interval is modal.""" + w_empty = _Window() # distinct_samples == 0, must be ignored + w1 = _Window( + start="s1", + end="e1", + distinct_samples=5, + interval_s=600, + seconds=100, + start_epoch=1000, + end_epoch=2000, + ) + w2 = _Window( + start="s2", + end="e2", + distinct_samples=3, + interval_s=600, + seconds=50, + start_epoch=500, + end_epoch=3000, + ) + + merged = parsing.merge_windows([w_empty, w1, w2]) + + assert merged.start == "s2" # earliest start_epoch (500) + assert merged.end == "e2" # latest end_epoch (3000) + assert merged.distinct_samples == 8 + assert merged.seconds == 150 + assert merged.interval_s == 600 + + +# --------------------------------------------------------------------------- # +# pmon timestamp helpers (parsing) +# --------------------------------------------------------------------------- # +def test_pmon_row_epoch_parses_valid_row() -> None: + """A well-formed pmon row yields the matching local epoch.""" + row = ["20260604", "10:30:00", "0", "100", "G", "5", "1"] 
+ + assert parsing._pmon_row_epoch(row) == _at(2026, 6, 4, 10, 30).timestamp() + + +def test_pmon_row_epoch_returns_none_on_bad_input() -> None: + """Malformed or short rows return None rather than raising.""" + assert parsing._pmon_row_epoch([]) is None + assert parsing._pmon_row_epoch(["nope", "alsonope"]) is None + + +def _write_pmon(path: Path) -> None: + """Write a tiny pmon log with two rows ten minutes apart.""" + path.write_text( + "#Date Time gpu pid type sm mem enc dec jpg ofa command\n" + " 20260604 10:00:00 0 100 G 5 1 - - - - Xorg\n" + " 20260604 11:00:00 0 101 G 7 2 - - - - thorium\n", + encoding="utf-8", + ) + + +def test_aggregate_pmon_without_bound_keeps_all_rows(tmp_path: Path) -> None: + """No begin_epoch means every data row counts.""" + log = tmp_path / "pmon.log" + _write_pmon(log) + + _, samples = parsing.aggregate_pmon(log, _Progress(enabled=False, total_stages=1)) + + assert samples == 2 + + +def test_aggregate_pmon_filters_rows_before_begin(tmp_path: Path) -> None: + """Rows timestamped before begin_epoch are skipped.""" + log = tmp_path / "pmon.log" + _write_pmon(log) + cutoff = _at(2026, 6, 4, 10, 30).timestamp() + + agg, samples = parsing.aggregate_pmon( + log, + _Progress(enabled=False, total_stages=1), + begin_epoch=cutoff, + ) + + assert samples == 1 + assert "thorium" in agg + assert "Xorg" not in agg + + +# --------------------------------------------------------------------------- # +# atop command bounding (parsing) +# --------------------------------------------------------------------------- # +def test_atop_read_cmd_unbounded() -> None: + """Without bounds the command is a plain replay.""" + cmd = parsing._atop_read_cmd( + Path("/var/log/atop/atop_20260604"), "PRC,PRM", None, None + ) + + assert cmd == ["atop", "-r", "/var/log/atop/atop_20260604", "-P", "PRC,PRM"] + + +def test_atop_read_cmd_with_begin_and_end() -> None: + """Begin/end inject -b/-e before the -P selector.""" + cmd = parsing._atop_read_cmd(Path("/x"), "PRC", 
"202606041400", "202606042000") + + assert cmd == [ + "atop", + "-r", + "/x", + "-b", + "202606041400", + "-e", + "202606042000", + "-P", + "PRC", + ] + + +# --------------------------------------------------------------------------- # +# Persisted last-report state (usage_report) +# --------------------------------------------------------------------------- # +def test_state_round_trip( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """A written timestamp reads back as an equal aware datetime.""" + state = tmp_path / "state" / "last_report.json" + monkeypatch.setattr(usage_report, "_STATE_DIR", state.parent) + monkeypatch.setattr(usage_report, "_STATE_FILE", state) + when = _at(2026, 6, 2, 9, 0) + + usage_report._write_last_generated(when) + + assert usage_report._read_last_generated() == when + + +def test_state_missing_file_returns_none( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """No state file yet means "unknown", so the caller falls back to today.""" + monkeypatch.setattr(usage_report, "_STATE_FILE", tmp_path / "absent.json") + + assert usage_report._read_last_generated() is None + + +def test_state_corrupt_file_returns_none( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Corrupt or partial JSON is treated as unknown, not a crash.""" + bad = tmp_path / "bad.json" + bad.write_text("{ not json", encoding="utf-8") + monkeypatch.setattr(usage_report, "_STATE_FILE", bad) + assert usage_report._read_last_generated() is None + + bad.write_text("{}", encoding="utf-8") # valid JSON, missing key + assert usage_report._read_last_generated() is None + + +# --------------------------------------------------------------------------- # +# Day-segment planning (usage_report) +# --------------------------------------------------------------------------- # +def test_has_time_of_day() -> None: + """Midnight needs no begin bound; any later time does.""" + assert usage_report._has_time_of_day(_at(2026, 6, 4, 14, 
30)) is True + assert usage_report._has_time_of_day(_at(2026, 6, 4, 0, 0)) is False + + +def test_plan_segments_single_day_midnight_unbounded() -> None: + """A start at local midnight covers the whole first day (no -b bound).""" + segments = usage_report._plan_segments(_at(2026, 6, 4), _at(2026, 6, 4, 12)) + + assert len(segments) == 1 + assert segments[0].atop_begin is None + assert segments[0].pmon_begin_epoch is None + + +def test_plan_segments_bounds_only_first_day() -> None: + """A mid-day start bounds the first day only; later days are full.""" + start = _at(2026, 6, 2, 14, 0) + segments = usage_report._plan_segments(start, _at(2026, 6, 4, 10, 0)) + + assert len(segments) == 3 + assert segments[0].atop_begin == "20260602140000" + assert segments[0].pmon_begin_epoch == start.timestamp() + assert all(seg.atop_begin is None for seg in segments[1:]) + assert segments[-1].atop_log.name == "atop_20260604" + + +def test_plan_segments_start_after_end_is_empty() -> None: + """A future state file (start past end) yields no segments.""" + assert usage_report._plan_segments(_at(2026, 6, 5), _at(2026, 6, 4)) == [] + + +# --------------------------------------------------------------------------- # +# Start resolution and mode dispatch (usage_report) +# --------------------------------------------------------------------------- # +def _args(**overrides: object) -> argparse.Namespace: + """Build a Namespace with the usage_report CLI defaults.""" + base: dict[str, object] = { + "date": None, + "since": None, + "atop_log": None, + "pmon_log": None, + } + base.update(overrides) + return argparse.Namespace(**base) + + +def test_resolve_start_prefers_since(monkeypatch: pytest.MonkeyPatch) -> None: + """--since wins over any saved state and starts at local midnight.""" + monkeypatch.setattr(usage_report, "_read_last_generated", lambda: _at(2026, 1, 1)) + start = usage_report._resolve_start(_args(since="20260604"), _at(2026, 6, 4, 12)) + + assert start.date() == _dt.date(2026, 6, 
4) + assert (start.hour, start.minute) == (0, 0) + + +def test_resolve_start_uses_last_report(monkeypatch: pytest.MonkeyPatch) -> None: + """Without --since, the saved last-report timestamp is the start.""" + last = _at(2026, 6, 2, 9, 0) + monkeypatch.setattr(usage_report, "_read_last_generated", lambda: last) + + assert usage_report._resolve_start(_args(), _at(2026, 6, 4, 12)) == last + + +def test_resolve_start_first_run_is_today_midnight( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """First-ever run (no state) covers today from local midnight.""" + monkeypatch.setattr(usage_report, "_read_last_generated", lambda: None) + now = _at(2026, 6, 4, 12, 30) + + assert usage_report._resolve_start(_args(), now) == _at(2026, 6, 4, 0, 0) + + +def test_is_single_day_mode() -> None: + """Pinning a date or explicit log path selects single-day mode.""" + assert usage_report._is_single_day_mode(_args(date="20260604")) is True + assert usage_report._is_single_day_mode(_args(atop_log=Path("/x"))) is True + assert usage_report._is_single_day_mode(_args(pmon_log=Path("/x"))) is True + assert usage_report._is_single_day_mode(_args()) is False + + +def test_should_advance_state_only_for_default_run() -> None: + """Only a plain since-last-report run re-baselines the saved timestamp.""" + assert usage_report._should_advance_state(_args(no_update_state=False)) is True + assert usage_report._should_advance_state(_args(no_update_state=True)) is False + # --since is an ad-hoc query and must never advance state. 
+ assert ( + usage_report._should_advance_state( + _args(since="20260510", no_update_state=False), + ) + is False + ) + + +# --------------------------------------------------------------------------- # +# Report fragments (usage_report) +# --------------------------------------------------------------------------- # +def test_period_line_contains_both_bounds() -> None: + """The period bullet shows start, end, and the span.""" + line = usage_report._period_line(_at(2026, 6, 2, 9), _at(2026, 6, 4, 9)) + + assert "2026-06-02T09:00:00" in line + assert "2026-06-04T09:00:00" in line + assert "→" in line + + +def test_describe_logs_counts() -> None: + """Log description switches between none / single / multiple wording.""" + assert "none found" in usage_report._describe_logs([], "atop -r") + assert usage_report._describe_logs( + [Path("/var/log/atop/atop_20260604")], "atop -r" + ).startswith( + "`/var/log/atop/atop_20260604`", + ) + many = usage_report._describe_logs( + [Path("/v/atop_20260601"), Path("/v/atop_20260604")], + "atop -r", + ) + assert "2 daily logs" in many + + +# --------------------------------------------------------------------------- # +# PRC field parsing — HZ-field regression (parsing) +# --------------------------------------------------------------------------- # +def test_parse_prc_does_not_charge_hz_as_cpu() -> None: + """atop emits `... pid (name) state HZ utime stime`; the HZ column must be + skipped, never summed as CPU. + + Regression for the off-by-one that read HZ (100) as utime, which inflated + every process's CPU-seconds to its record/PID count (xset showing 67h). + """ + pid_cpu: dict[int, _PidCpu] = {} + # 6 generic fields, pid, (name), state, HZ=100, utime=7, stime=3, + tail. 
+ line = "PRC host 1000 2026/06/04 12:00:00 600 4242 (xset) E 100 7 3 0 0 0" + + parsing._parse_prc(line.split(), pid_cpu) + + entry = pid_cpu[4242] + assert entry.name == "xset" + assert entry.delta_ticks == 10 # utime+stime, never the HZ constant (100) + + +def test_parse_prc_skips_hz_with_multiword_name() -> None: + """The HZ skip stays aligned when the name spans several tokens.""" + pid_cpu: dict[int, _PidCpu] = {} + line = "PRC h 1000 d t 600 99 (Web Content) S 100 40 2 0 0" + + parsing._parse_prc(line.split(), pid_cpu) + + assert pid_cpu[99].name == "Web Content" + assert pid_cpu[99].delta_ticks == 42 # 40+2, HZ(100) skipped + + +def test_parse_prc_too_short_is_ignored() -> None: + """A truncated PRC record (missing stime) is skipped, not a crash.""" + pid_cpu: dict[int, _PidCpu] = {} + # Tokens run out at utime — no stime at after+3, so the record is dropped. + line = "PRC h 1000 d t 600 7 (x) S 100 5" + + parsing._parse_prc(line.split(), pid_cpu) + + assert pid_cpu == {} + + +# --------------------------------------------------------------------------- # +# Native helper selection (parsing) +# --------------------------------------------------------------------------- # +def test_atop_agg_binary_missing_source_falls_back( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """A deleted C source tree yields None (Python fallback) even when a cached + binary exists — never trust an orphaned, unverifiable build.""" + monkeypatch.setattr(parsing, "_ATOP_AGG_SRC_DIR", tmp_path / "gone") + cache = tmp_path / "atop_agg" + cache.write_text("stale binary", encoding="utf-8") + monkeypatch.setattr(parsing, "_ATOP_AGG_CACHE_BIN", cache) + + assert parsing._atop_agg_binary() is None diff --git a/meta/pyproject.toml b/meta/pyproject.toml index d02e50e..713c442 100644 --- a/meta/pyproject.toml +++ b/meta/pyproject.toml @@ -204,7 +204,10 @@ exclude_dirs = ["tests", ".venv", "Bash/ffmpeg-build"] # PYTEST - Testing framework configuration # 
============================================================================ [tool.pytest.ini_options] -testpaths = ["python_pkg"] +# linux_configuration/tests covers the standalone usage_report scripts. It adds +# no python_pkg coverage (those tests don't import python_pkg), so running it +# alongside leaves the 100%-on-python_pkg gate untouched. +testpaths = ["python_pkg", "linux_configuration/tests"] python_files = ["test_*.py", "*_test.py"] python_classes = ["Test*"] python_functions = ["test_*"] diff --git a/meta/run.sh b/meta/run.sh index f445898..ec5d880 100755 --- a/meta/run.sh +++ b/meta/run.sh @@ -13,8 +13,8 @@ set -euo pipefail SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" -REPORT_SCRIPT="$SCRIPT_DIR/linux_configuration/scripts/system-maintenance/bin/usage_report.py" -ARTIFACT_INIT_SCRIPT="$SCRIPT_DIR/scripts/init_agent_artifacts.sh" +REPORT_SCRIPT="$SCRIPT_DIR/linux_configuration/scripts/periodic_background/system-maintenance/bin/usage_report.py" +ARTIFACT_INIT_SCRIPT="$SCRIPT_DIR/meta/scripts/init_agent_artifacts.sh" if [[ ! -f "$REPORT_SCRIPT" ]]; then echo "Error: usage_report.py not found at: $REPORT_SCRIPT" >&2 diff --git a/meta/scripts/pytest_changed_packages.py b/meta/scripts/pytest_changed_packages.py index 0b0603f..87791b0 100755 --- a/meta/scripts/pytest_changed_packages.py +++ b/meta/scripts/pytest_changed_packages.py @@ -9,6 +9,10 @@ invocation with whole-repo coverage measured against ``python_pkg``. Running all packages together (rather than just the touched ones) ensures that 100% branch coverage is maintained across the entire codebase on every commit, not just the files that happened to change. + +Standalone script suites outside ``python_pkg/`` (currently +``linux_configuration/tests``) are also run so their behaviour is gated, but +they are not coverage-measured (coverage stays scoped to ``python_pkg``). 
""" from __future__ import annotations @@ -21,6 +25,10 @@ import sys _TOTAL_MEM = "4G" +# Standalone script test suites outside python_pkg/ that should be gated but +# not coverage-measured. Skipped silently if the directory does not exist. +_EXTRA_TEST_DIRS = ("linux_configuration/tests",) + def main() -> int: """Entry point.""" @@ -35,6 +43,9 @@ def main() -> int: if not packages: return 0 + test_dirs = [f"python_pkg/{pkg}/tests" for pkg in packages] + test_dirs += [d for d in _EXTRA_TEST_DIRS if Path(d).is_dir()] + cmd = [ sys.executable, "-m", @@ -50,7 +61,7 @@ def main() -> int: # Override addopts from pyproject.toml to avoid double --cov flags. "-o", "addopts=--strict-markers --strict-config -ra", - *[f"python_pkg/{pkg}/tests" for pkg in packages], + *test_dirs, ] if shutil.which("systemd-run") is not None: