mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 18:03:07 +02:00
372 lines
10 KiB
Bash
Executable File
372 lines
10 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
|
|
# shellcheck source=../lib/common.sh
|
|
source "$SCRIPT_DIR/../lib/common.sh"
|
|
|
|
REPORT_DIR="${HOME}/.local/state/system-diagnostics"
|
|
REPORT_FILE="$REPORT_DIR/arch-performance-$(date +%Y%m%d_%H%M%S).log"
|
|
APPLY_SAFE_FIXES=false
|
|
INSTALL_TOOLS=false
|
|
|
|
declare -a FINDINGS=()
|
|
declare -a ACTIONS=()
|
|
|
|
usage() {
|
|
cat <<'EOF'
|
|
diagnose_arch_performance.sh - Diagnose common causes of Arch Linux slowness/instability
|
|
|
|
Usage:
|
|
diagnose_arch_performance.sh [OPTIONS]
|
|
|
|
Options:
|
|
--apply-safe-fixes Apply conservative fixes (requires sudo)
|
|
--install-tools Install optional diagnostics packages (requires sudo)
|
|
-h, --help Show help
|
|
|
|
Safe fixes applied when --apply-safe-fixes is used:
|
|
- Enable/start fstrim.timer if missing
|
|
- Resolve TLP vs power-profiles-daemon conflict (keeps power-profiles-daemon)
|
|
- Vacuum journal logs if they exceed 1GiB
|
|
|
|
Notes:
|
|
- Script does not reboot automatically.
|
|
- Some checks are informational and provide next-step commands.
|
|
EOF
|
|
}
|
|
|
|
parse_args() {
|
|
while [[ $# -gt 0 ]]; do
|
|
case "$1" in
|
|
--apply-safe-fixes)
|
|
APPLY_SAFE_FIXES=true
|
|
shift
|
|
;;
|
|
--install-tools)
|
|
INSTALL_TOOLS=true
|
|
shift
|
|
;;
|
|
-h | --help)
|
|
usage
|
|
exit 0
|
|
;;
|
|
*)
|
|
log_error "Unknown option: $1"
|
|
usage
|
|
exit 2
|
|
;;
|
|
esac
|
|
done
|
|
}
|
|
|
|
add_finding() {
|
|
FINDINGS+=("$1")
|
|
log_warn "$1"
|
|
}
|
|
|
|
add_action() {
|
|
ACTIONS+=("$1")
|
|
log_info "$1"
|
|
}
|
|
|
|
run_and_log() {
|
|
local header="$1"
|
|
shift
|
|
{
|
|
echo
|
|
echo "=== $header ==="
|
|
"$@" 2>&1 || true
|
|
} >>"$REPORT_FILE"
|
|
}
|
|
|
|
check_root_if_needed() {
|
|
if [[ $APPLY_SAFE_FIXES == "true" || $INSTALL_TOOLS == "true" ]]; then
|
|
require_root "$@"
|
|
fi
|
|
}
|
|
|
|
install_optional_tools() {
|
|
if [[ $INSTALL_TOOLS != "true" ]]; then
|
|
return
|
|
fi
|
|
|
|
local packages=(lm_sensors smartmontools nvtop iotop powertop)
|
|
log_info "Installing optional diagnostic packages: ${packages[*]}"
|
|
pacman -S --needed --noconfirm "${packages[@]}"
|
|
}
|
|
|
|
collect_basics() {
|
|
run_and_log "Kernel" uname -a
|
|
run_and_log "Uptime" uptime
|
|
run_and_log "Memory" free -h
|
|
run_and_log "Swap" swapon --show
|
|
run_and_log "CPU (lscpu)" lscpu
|
|
run_and_log "Disk Usage" df -h /
|
|
run_and_log "Boot Time" systemd-analyze
|
|
run_and_log "Failed Units" systemctl --failed --no-pager
|
|
run_and_log "Recent Errors (this boot)" journalctl -b -p err --no-pager -n 200
|
|
|
|
local cpu_count
|
|
cpu_count=$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
|
|
local load1
|
|
load1=$(awk '{print int($1)}' /proc/loadavg 2>/dev/null || echo 0)
|
|
if [[ ${load1:-0} -ge ${cpu_count:-1} ]]; then
|
|
add_finding "1-minute load average is at/above CPU thread count (${load1}/${cpu_count}); background tasks may be saturating the system."
|
|
fi
|
|
|
|
local failed_count
|
|
failed_count=$(systemctl --failed --no-legend 2>/dev/null | wc -l || true)
|
|
failed_count=${failed_count//[[:space:]]/}
|
|
if [[ ${failed_count:-0} -gt 0 ]]; then
|
|
add_finding "One or more systemd units are failed (${failed_count}); failed services can cause repeated retries and instability."
|
|
fi
|
|
|
|
local acpi_error_count
|
|
acpi_error_count=$(journalctl -b -p err --no-pager 2>/dev/null | grep -ic 'acpi' || true)
|
|
if [[ ${acpi_error_count:-0} -ge 5 ]]; then
|
|
add_finding "Frequent ACPI errors detected in current boot (${acpi_error_count}); BIOS/firmware update may improve stability."
|
|
fi
|
|
|
|
local top_snapshot
|
|
top_snapshot=$(ps -eo pid,comm,%cpu,%mem --sort=-%cpu | head -n 12 || true)
|
|
{
|
|
echo
|
|
echo "=== Top CPU Processes ==="
|
|
echo "$top_snapshot"
|
|
} >>"$REPORT_FILE"
|
|
|
|
local xorg_cpu
|
|
xorg_cpu=$(ps -C Xorg -o %cpu= | awk '{sum+=$1} END {printf "%.0f", sum+0}' || echo 0)
|
|
if [[ ${xorg_cpu:-0} -ge 20 ]]; then
|
|
add_finding "Xorg is using high CPU (${xorg_cpu}%); desktop/compositor/GPU driver path may be a primary slowdown source."
|
|
fi
|
|
}
|
|
|
|
check_cpu_governor() {
|
|
local gov_files
|
|
gov_files=$(find /sys/devices/system/cpu -maxdepth 3 -name scaling_governor 2>/dev/null || true)
|
|
|
|
if [[ -z $gov_files ]]; then
|
|
add_action "CPU governor files not found (may be unsupported on this platform)."
|
|
return
|
|
fi
|
|
|
|
local summary
|
|
summary=$(awk '{count[$1]++} END {for (g in count) printf "%s:%d ", g, count[g]}' $gov_files 2>/dev/null || true)
|
|
echo "CPU governor summary: ${summary:-unknown}" >>"$REPORT_FILE"
|
|
|
|
if grep -q '^powersave$' $gov_files 2>/dev/null; then
|
|
add_finding "CPU governor includes 'powersave' on one or more cores; this can make high-end hardware feel slow."
|
|
fi
|
|
}
|
|
|
|
check_thermal_state() {
|
|
if has_cmd sensors; then
|
|
run_and_log "Temperatures (sensors)" sensors
|
|
else
|
|
add_action "Install lm_sensors and run 'sensors' to verify thermal throttling."
|
|
fi
|
|
|
|
if has_cmd dmesg; then
|
|
local therm_hits
|
|
therm_hits=$(dmesg | grep -Ei 'throttl|thermal|overheat|cpu clock throttled' | tail -n 30 || true)
|
|
if [[ -n $therm_hits ]]; then
|
|
add_finding "Kernel logs show thermal/throttling related messages."
|
|
{
|
|
echo
|
|
echo "=== Thermal/Throttling dmesg excerpts ==="
|
|
echo "$therm_hits"
|
|
} >>"$REPORT_FILE"
|
|
fi
|
|
fi
|
|
}
|
|
|
|
check_power_services() {
|
|
local tlp_enabled="false"
|
|
local ppd_enabled="false"
|
|
|
|
if systemctl is-enabled tlp.service >/dev/null 2>&1; then
|
|
tlp_enabled="true"
|
|
fi
|
|
if systemctl is-enabled power-profiles-daemon.service >/dev/null 2>&1; then
|
|
ppd_enabled="true"
|
|
fi
|
|
|
|
echo "Power services: tlp=${tlp_enabled}, power-profiles-daemon=${ppd_enabled}" >>"$REPORT_FILE"
|
|
|
|
if [[ $tlp_enabled == "true" && $ppd_enabled == "true" ]]; then
|
|
add_finding "Both TLP and power-profiles-daemon are enabled; they often conflict and cause inconsistent performance."
|
|
fi
|
|
|
|
if [[ $tlp_enabled == "false" && $ppd_enabled == "false" ]]; then
|
|
add_action "No power management daemon is enabled; consider installing/enabling power-profiles-daemon for predictable AC/battery behavior."
|
|
fi
|
|
}
|
|
|
|
check_storage_health() {
|
|
run_and_log "Block Devices" lsblk -o NAME,MODEL,ROTA,SIZE,TYPE,MOUNTPOINT,FSTYPE
|
|
|
|
if has_cmd fstrim; then
|
|
run_and_log "fstrim dry-run" fstrim -av --dry-run
|
|
fi
|
|
|
|
if systemctl is-enabled fstrim.timer >/dev/null 2>&1; then
|
|
add_action "fstrim.timer is enabled (good for SSD performance longevity)."
|
|
else
|
|
add_finding "fstrim.timer is not enabled; SSD maintenance trimming may be missing."
|
|
fi
|
|
|
|
if has_cmd smartctl; then
|
|
local root_disk
|
|
root_disk=$(findmnt -n -o SOURCE / | sed 's/[0-9]*$//' | sed 's/p$//' || true)
|
|
if [[ -n ${root_disk:-} && -b $root_disk ]]; then
|
|
run_and_log "SMART Summary ($root_disk)" smartctl -H "$root_disk"
|
|
fi
|
|
else
|
|
add_action "Install smartmontools and run SMART health checks for your SSD/NVMe."
|
|
fi
|
|
}
|
|
|
|
check_memory_pressure() {
|
|
local mem_total mem_available swap_total swap_free
|
|
mem_total=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
|
|
mem_available=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
|
|
swap_total=$(awk '/SwapTotal/ {print $2}' /proc/meminfo)
|
|
swap_free=$(awk '/SwapFree/ {print $2}' /proc/meminfo)
|
|
|
|
if [[ ${swap_total:-0} -gt 0 ]]; then
|
|
local swap_used
|
|
swap_used=$((swap_total - swap_free))
|
|
local swap_pct
|
|
swap_pct=$((swap_used * 100 / swap_total))
|
|
echo "Swap usage: ${swap_pct}%" >>"$REPORT_FILE"
|
|
if [[ $swap_pct -ge 35 && ${mem_available:-0} -gt $((mem_total / 3)) ]]; then
|
|
add_finding "High swap usage while RAM is still available; this can cause stutter."
|
|
add_action "Consider lowering swappiness (temporary: sudo sysctl vm.swappiness=10)."
|
|
fi
|
|
fi
|
|
|
|
if [[ -f /proc/pressure/memory ]]; then
|
|
run_and_log "Memory PSI" cat /proc/pressure/memory
|
|
fi
|
|
}
|
|
|
|
check_gpu_state() {
|
|
if has_cmd nvidia-smi; then
|
|
run_and_log "NVIDIA State" nvidia-smi
|
|
local pstate util power
|
|
pstate=$(nvidia-smi --query-gpu=pstate --format=csv,noheader 2>/dev/null | head -n 1 | xargs || true)
|
|
util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null | head -n 1 | xargs || true)
|
|
power=$(nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits 2>/dev/null | head -n 1 | xargs || true)
|
|
|
|
echo "NVIDIA pstate: ${pstate:-unknown}" >>"$REPORT_FILE"
|
|
echo "NVIDIA util: ${util:-unknown}%" >>"$REPORT_FILE"
|
|
echo "NVIDIA power: ${power:-unknown}W" >>"$REPORT_FILE"
|
|
|
|
if [[ ${pstate:-} == "P0" && ${util:-100} -le 5 ]]; then
|
|
add_finding "NVIDIA GPU is in P0 high-performance state while mostly idle; this can increase heat and trigger thermal limits."
|
|
add_action "If laptop has hybrid graphics, prefer iGPU mode for desktop workloads and use dGPU on demand."
|
|
fi
|
|
else
|
|
run_and_log "PCI VGA Devices" lspci -nnk | grep -A3 -Ei 'vga|3d|display'
|
|
fi
|
|
}
|
|
|
|
check_journal_size() {
|
|
local journal_line
|
|
journal_line=$(journalctl --disk-usage 2>/dev/null || true)
|
|
echo "Journal usage: ${journal_line:-unknown}" >>"$REPORT_FILE"
|
|
|
|
if [[ $journal_line =~ ([0-9]+\.?[0-9]*)\ (G|M) ]]; then
|
|
local value unit
|
|
value="${BASH_REMATCH[1]}"
|
|
unit="${BASH_REMATCH[2]}"
|
|
if [[ $unit == "G" ]]; then
|
|
add_finding "Systemd journal is large (${value}G); excessive logs can waste I/O and disk space."
|
|
fi
|
|
fi
|
|
}
|
|
|
|
apply_safe_fixes() {
|
|
if [[ $APPLY_SAFE_FIXES != "true" ]]; then
|
|
return
|
|
fi
|
|
|
|
log_info "Applying safe fixes..."
|
|
|
|
if ! systemctl is-enabled fstrim.timer >/dev/null 2>&1; then
|
|
systemctl enable --now fstrim.timer
|
|
add_action "Enabled and started fstrim.timer."
|
|
fi
|
|
|
|
if systemctl is-enabled tlp.service >/dev/null 2>&1 && systemctl is-enabled power-profiles-daemon.service >/dev/null 2>&1; then
|
|
systemctl disable --now tlp.service
|
|
add_action "Disabled tlp.service to avoid conflict with power-profiles-daemon."
|
|
fi
|
|
|
|
local journal_line
|
|
journal_line=$(journalctl --disk-usage 2>/dev/null || true)
|
|
if [[ $journal_line =~ ([0-9]+\.?[0-9]*)\ G ]]; then
|
|
journalctl --vacuum-size=300M
|
|
add_action "Vacuumed systemd journal to 300M."
|
|
fi
|
|
}
|
|
|
|
print_summary() {
|
|
echo
|
|
echo "=============================="
|
|
echo " Arch Performance Diagnostics"
|
|
echo "=============================="
|
|
echo "Report: $REPORT_FILE"
|
|
echo
|
|
|
|
if [[ ${#FINDINGS[@]} -eq 0 ]]; then
|
|
log_ok "No high-confidence bottlenecks detected by automated checks."
|
|
else
|
|
log_warn "Likely issues found (${#FINDINGS[@]}):"
|
|
local item
|
|
for item in "${FINDINGS[@]}"; do
|
|
echo " - $item"
|
|
done
|
|
fi
|
|
|
|
if [[ ${#ACTIONS[@]} -gt 0 ]]; then
|
|
echo
|
|
log_info "Actions/recommendations:"
|
|
local action
|
|
for action in "${ACTIONS[@]}"; do
|
|
echo " - $action"
|
|
done
|
|
fi
|
|
|
|
echo
|
|
echo "Recommended next command for deep per-process analysis:"
|
|
echo " sudo iotop -oPa"
|
|
echo " top"
|
|
echo " systemd-analyze blame"
|
|
}
|
|
|
|
main() {
|
|
parse_args "$@"
|
|
check_root_if_needed "$@"
|
|
|
|
mkdir -p "$REPORT_DIR"
|
|
log_info "Writing diagnostic report to: $REPORT_FILE"
|
|
|
|
collect_basics
|
|
install_optional_tools
|
|
check_cpu_governor
|
|
check_thermal_state
|
|
check_power_services
|
|
check_storage_health
|
|
check_memory_pressure
|
|
check_gpu_state
|
|
check_journal_size
|
|
apply_safe_fixes
|
|
print_summary
|
|
}
|
|
|
|
main "$@"
|