testsAndMisc/linux_configuration/scripts/fixes/diagnose_arch_performance.sh

372 lines
11 KiB
Bash
Executable File

#!/usr/bin/env bash
# Strict mode: stop on unhandled command failures (-e), on use of unset
# variables (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Directory containing this script; readlink -f canonicalises $0 first.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Shared helper library, located relative to this script.
# NOTE(review): log_info/log_warn/log_error/log_ok, has_cmd and require_root are
# called below but not defined in this file - presumably they come from here.
# shellcheck source=../lib/common.sh
source "$SCRIPT_DIR/../lib/common.sh"
# Directory under the invoking user's HOME where reports are written.
REPORT_DIR="${HOME}/.local/state/system-diagnostics"
# Report path for this run; the name embeds a date/time stamp taken at start-up.
REPORT_FILE="$REPORT_DIR/arch-performance-$(date +%Y%m%d_%H%M%S).log"
# Option flags; parse_args sets them to "true" for the matching CLI switch.
APPLY_SAFE_FIXES=false
INSTALL_TOOLS=false
# Messages accumulated by add_finding / add_action and listed by print_summary.
declare -a FINDINGS=()
declare -a ACTIONS=()
# Print the command-line help text on stdout, one argument per output line.
usage() {
  printf '%s\n' \
    'diagnose_arch_performance.sh - Diagnose common causes of Arch Linux slowness/instability' \
    'Usage:' \
    'diagnose_arch_performance.sh [OPTIONS]' \
    'Options:' \
    '--apply-safe-fixes Apply conservative fixes (requires sudo)' \
    '--install-tools Install optional diagnostics packages (requires sudo)' \
    '-h, --help Show help' \
    'Safe fixes applied when --apply-safe-fixes is used:' \
    '- Enable/start fstrim.timer if missing' \
    '- Resolve TLP vs power-profiles-daemon conflict (keeps power-profiles-daemon)' \
    '- Vacuum journal logs if they exceed 1GiB' \
    'Notes:' \
    '- Script does not reboot automatically.' \
    '- Some checks are informational and provide next-step commands.'
}
# Translate command-line switches into the APPLY_SAFE_FIXES / INSTALL_TOOLS flags.
# Arguments: the script's command-line arguments.
# Exits:     0 after printing help for -h/--help;
#            2 after logging an error and printing help for anything unrecognised.
parse_args() {
  local arg
  # No option takes a value, so each argument is handled on its own.
  for arg in "$@"; do
    case "$arg" in
      --apply-safe-fixes) APPLY_SAFE_FIXES=true ;;
      --install-tools) INSTALL_TOOLS=true ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        log_error "Unknown option: $arg"
        usage
        exit 2
        ;;
    esac
  done
}
# Remember a suspected problem for the final summary and log it as a warning.
# Arguments: $1 - finding text
add_finding() {
  local finding="$1"
  FINDINGS+=("$finding")
  log_warn "$finding"
}
# Remember an action/recommendation for the final summary and log it as info.
# Arguments: $1 - action text
add_action() {
  local note="$1"
  ACTIONS+=("$note")
  log_info "$note"
}
# Append a titled section to the report containing a command's stdout and stderr.
# Arguments: $1   - section title
#            $2.. - command and its arguments
# A failing command is tolerated: its output is still recorded and the
# failure is not propagated to the caller.
run_and_log() {
  local title="$1"
  shift
  {
    printf '\n=== %s ===\n' "$title"
    if ! "$@" 2>&1; then
      : # best-effort section: ignore the command's exit status
    fi
  } >> "$REPORT_FILE"
}
# Call require_root (forwarding all arguments) only when a privileged mode
# (--apply-safe-fixes or --install-tools) was requested.
check_root_if_needed() {
  if [[ $APPLY_SAFE_FIXES != "true" && $INSTALL_TOOLS != "true" ]]; then
    return 0
  fi
  require_root "$@"
}
# Install the optional diagnostic packages with pacman.
# Does nothing unless INSTALL_TOOLS is "true".
install_optional_tools() {
  [[ $INSTALL_TOOLS == "true" ]] || return 0

  local -a diag_pkgs=(lm_sensors smartmontools nvtop iotop powertop)
  log_info "Installing optional diagnostic packages: ${diag_pkgs[*]}"
  # --needed skips packages that are already up to date; --noconfirm avoids prompts.
  pacman -S --needed --noconfirm "${diag_pkgs[@]}"
}
# Capture baseline system information in the report and raise findings for
# CPU saturation, failed systemd units, frequent ACPI errors and a busy Xorg.
# Globals: REPORT_FILE (appended to); FINDINGS (via add_finding)
collect_basics() {
  # Raw command output, one report section per command.
  run_and_log "Kernel" uname -a
  run_and_log "Uptime" uptime
  run_and_log "Memory" free -h
  run_and_log "Swap" swapon --show
  run_and_log "CPU (lscpu)" lscpu
  run_and_log "Disk Usage" df -h /
  run_and_log "Boot Time" systemd-analyze
  run_and_log "Failed Units" systemctl --failed --no-pager
  run_and_log "Recent Errors (this boot)" journalctl -b -p err --no-pager -n 200

  # Compare the integer part of the 1-minute load average with the number
  # of online processors; either value falls back to a default on error.
  local online_cpus load_1min
  online_cpus=$(getconf _NPROCESSORS_ONLN 2> /dev/null || echo 1)
  load_1min=$(awk '{print int($1)}' /proc/loadavg 2> /dev/null || echo 0)
  if [[ ${load_1min:-0} -ge ${online_cpus:-1} ]]; then
    add_finding "1-minute load average is at/above CPU thread count (${load_1min}/${online_cpus}); background tasks may be saturating the system."
  fi

  # Count failed units; strip any whitespace wc may put around the number.
  local failed_units
  failed_units=$(systemctl --failed --no-legend 2> /dev/null | wc -l || true)
  failed_units=${failed_units//[[:space:]]/}
  if [[ ${failed_units:-0} -gt 0 ]]; then
    add_finding "One or more systemd units are failed (${failed_units}); failed services can cause repeated retries and instability."
  fi

  # Count error-priority journal lines from this boot that mention ACPI.
  # grep -c exits non-zero on zero matches, hence the "|| true".
  local acpi_hits
  acpi_hits=$(journalctl -b -p err --no-pager 2> /dev/null | grep -ic 'acpi' || true)
  if [[ ${acpi_hits:-0} -ge 5 ]]; then
    add_finding "Frequent ACPI errors detected in current boot (${acpi_hits}); BIOS/firmware update may improve stability."
  fi

  # Snapshot of the most CPU-hungry processes (header line plus 11 rows).
  local cpu_top
  cpu_top=$(ps -eo pid,comm,%cpu,%mem --sort=-%cpu | head -n 12 || true)
  printf '\n=== Top CPU Processes ===\n%s\n' "$cpu_top" >> "$REPORT_FILE"

  # Sum %CPU over all Xorg processes, rounded to a whole number.
  local xorg_load
  xorg_load=$(ps -C Xorg -o %cpu= | awk '{sum+=$1} END {printf "%.0f", sum+0}' || echo 0)
  if [[ ${xorg_load:-0} -ge 20 ]]; then
    add_finding "Xorg is using high CPU (${xorg_load}%); desktop/compositor/GPU driver path may be a primary slowdown source."
  fi
}
# Summarise the active CPU frequency governors and flag 'powersave'.
# Arguments: $1 - (optional) sysfs CPU directory to scan;
#                 defaults to /sys/devices/system/cpu
# Globals:   REPORT_FILE (appended to); FINDINGS/ACTIONS (via add_finding/add_action)
check_cpu_governor() {
  local cpu_root="${1:-/sys/devices/system/cpu}"

  # Collect the governor files into an array so each path stays a single word
  # (no reliance on unquoted word-splitting/globbing of a string).
  local -a gov_files=()
  mapfile -t gov_files < <(find "$cpu_root" -maxdepth 3 -name scaling_governor 2> /dev/null)

  if [[ ${#gov_files[@]} -eq 0 ]]; then
    add_action "CPU governor files not found (may be unsupported on this platform)."
    return
  fi

  # Build "governor:count" pairs across all files found.
  local summary
  summary=$(awk '{count[$1]++} END {for (g in count) printf "%s:%d ", g, count[g]}' "${gov_files[@]}" 2> /dev/null || true)
  echo "CPU governor summary: ${summary:-unknown}" >> "$REPORT_FILE"

  if grep -q '^powersave$' "${gov_files[@]}" 2> /dev/null; then
    add_finding "CPU governor includes 'powersave' on one or more cores; this can make high-end hardware feel slow."
  fi
}
# Log sensor readings when 'sensors' is available and look through dmesg for
# thermal/throttling messages.
# Globals: REPORT_FILE (appended to); FINDINGS/ACTIONS (via add_finding/add_action)
check_thermal_state() {
  if ! has_cmd sensors; then
    add_action "Install lm_sensors and run 'sensors' to verify thermal throttling."
  else
    run_and_log "Temperatures (sensors)" sensors
  fi

  has_cmd dmesg || return 0

  # Keep only the 30 most recent matching kernel messages.
  local throttle_lines
  throttle_lines=$(dmesg | grep -Ei 'throttl|thermal|overheat|cpu clock throttled' | tail -n 30 || true)
  [[ -n $throttle_lines ]] || return 0

  add_finding "Kernel logs show thermal/throttling related messages."
  printf '\n=== Thermal/Throttling dmesg excerpts ===\n%s\n' "$throttle_lines" >> "$REPORT_FILE"
}
# Record whether tlp.service and power-profiles-daemon.service are enabled and
# flag the two problematic combinations: both enabled, or neither enabled.
# Globals: REPORT_FILE (appended to); FINDINGS/ACTIONS (via add_finding/add_action)
check_power_services() {
  local tlp_state ppd_state

  if systemctl is-enabled tlp.service > /dev/null 2>&1; then
    tlp_state="true"
  else
    tlp_state="false"
  fi

  if systemctl is-enabled power-profiles-daemon.service > /dev/null 2>&1; then
    ppd_state="true"
  else
    ppd_state="false"
  fi

  printf 'Power services: tlp=%s, power-profiles-daemon=%s\n' "$tlp_state" "$ppd_state" >> "$REPORT_FILE"

  # Exactly one daemon enabled is the healthy case and needs no message.
  case "${tlp_state}/${ppd_state}" in
    true/true)
      add_finding "Both TLP and power-profiles-daemon are enabled; they often conflict and cause inconsistent performance."
      ;;
    false/false)
      add_action "No power management daemon is enabled; consider installing/enabling power-profiles-daemon for predictable AC/battery behavior."
      ;;
  esac
}
# Log block-device layout, check that periodic TRIM is enabled and, when
# smartctl is available, record a SMART health summary for the disk backing /.
# Globals: REPORT_FILE (appended to); FINDINGS/ACTIONS (via add_finding/add_action)
check_storage_health() {
  run_and_log "Block Devices" lsblk -o NAME,MODEL,ROTA,SIZE,TYPE,MOUNTPOINT,FSTYPE

  if has_cmd fstrim; then
    run_and_log "fstrim dry-run" fstrim -av --dry-run
  fi

  if systemctl is-enabled fstrim.timer > /dev/null 2>&1; then
    add_action "fstrim.timer is enabled (good for SSD performance longevity)."
  else
    add_finding "fstrim.timer is not enabled; SSD maintenance trimming may be missing."
  fi

  if has_cmd smartctl; then
    local root_source root_disk=""

    # -v (--nofsroot) drops the "[/subvol]" suffix findmnt appends to SOURCE
    # for btrfs subvolumes and bind mounts, leaving a plain device path.
    root_source=$(findmnt -n -v -o SOURCE / 2> /dev/null || true)

    if [[ -n $root_source ]]; then
      # Walk the device's ancestors (-s) as a flat list (-l) with full paths
      # (-p) and take the first entry of TYPE "disk". Unlike trimming digits
      # off the name, this also handles LUKS/LVM mapper devices and disk
      # names that end in "p".
      root_disk=$(lsblk -n -s -l -p -o NAME,TYPE "$root_source" 2> /dev/null \
        | awk '$2 == "disk" {print $1; exit}' || true)
    fi

    if [[ -n $root_disk && -b $root_disk ]]; then
      run_and_log "SMART Summary ($root_disk)" smartctl -H "$root_disk"
    else
      # Leave a trace in the report instead of skipping silently.
      echo "SMART check skipped: no usable whole-disk device for / (source='${root_source}', disk='${root_disk}')" >> "$REPORT_FILE"
    fi
  else
    add_action "Install smartmontools and run SMART health checks for your SSD/NVMe."
  fi
}
# Report swap utilisation from /proc/meminfo, flag heavy swap use while RAM is
# still available, and log memory PSI data when the kernel exposes it.
# Globals: REPORT_FILE (appended to); FINDINGS/ACTIONS (via add_finding/add_action)
check_memory_pressure() {
  local ram_total ram_avail swp_total swp_free
  ram_total=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
  ram_avail=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
  swp_total=$(awk '/SwapTotal/ {print $2}' /proc/meminfo)
  swp_free=$(awk '/SwapFree/ {print $2}' /proc/meminfo)

  # Skip the percentage maths entirely when no swap is configured.
  if [[ ${swp_total:-0} -gt 0 ]]; then
    local used_pct=$(((swp_total - swp_free) * 100 / swp_total))
    printf 'Swap usage: %s%%\n' "$used_pct" >> "$REPORT_FILE"

    # Swapping is only suspicious if more than a third of RAM is still available.
    local ram_third=$((ram_total / 3))
    if [[ $used_pct -ge 35 && ${ram_avail:-0} -gt $ram_third ]]; then
      add_finding "High swap usage while RAM is still available; this can cause stutter."
      add_action "Consider lowering swappiness (temporary: sudo sysctl vm.swappiness=10)."
    fi
  fi

  if ! [[ -f /proc/pressure/memory ]]; then
    return 0
  fi
  run_and_log "Memory PSI" cat /proc/pressure/memory
}
# Print the lspci entries (plus three lines of context each) that describe
# VGA / 3D / display controllers. Exits non-zero when nothing matches.
_list_pci_display_devices() {
  lspci -nnk | grep -A3 -Ei 'vga|3d|display'
}

# Record GPU state. With nvidia-smi present, log its status and flag a GPU that
# sits in the P0 performance state while nearly idle; otherwise log the PCI
# display devices.
# Globals: REPORT_FILE (appended to); FINDINGS/ACTIONS (via add_finding/add_action)
check_gpu_state() {
  if has_cmd nvidia-smi; then
    run_and_log "NVIDIA State" nvidia-smi

    local pstate util power
    pstate=$(nvidia-smi --query-gpu=pstate --format=csv,noheader 2> /dev/null | head -n 1 | xargs || true)
    util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2> /dev/null | head -n 1 | xargs || true)
    power=$(nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits 2> /dev/null | head -n 1 | xargs || true)

    echo "NVIDIA pstate: ${pstate:-unknown}" >> "$REPORT_FILE"
    echo "NVIDIA util: ${util:-unknown}%" >> "$REPORT_FILE"
    echo "NVIDIA power: ${power:-unknown}W" >> "$REPORT_FILE"

    # Only compare utilisation when it is a plain integer: the query can yield
    # an empty or non-numeric value, which would otherwise raise an arithmetic
    # error inside the numeric comparison. 10# forces base 10.
    if [[ ${pstate:-} == "P0" && ${util:-} =~ ^[0-9]+$ && $((10#$util)) -le 5 ]]; then
      add_finding "NVIDIA GPU is in P0 high-performance state while mostly idle; this can increase heat and trigger thermal limits."
      add_action "If laptop has hybrid graphics, prefer iGPU mode for desktop workloads and use dGPU on demand."
    fi
  else
    # The filter has to run INSIDE run_and_log. Piping run_and_log itself into
    # grep feeds grep nothing (the output goes to the report file), so grep
    # exits 1 and aborts the script under 'set -euo pipefail'.
    run_and_log "PCI VGA Devices" _list_pci_display_devices
  fi
}
# Record the systemd journal's disk usage and flag it once it reaches the
# gigabyte (or terabyte) range.
# Globals: REPORT_FILE (appended to); FINDINGS (via add_finding)
check_journal_size() {
  local journal_line
  journal_line=$(journalctl --disk-usage 2> /dev/null || true)
  echo "Journal usage: ${journal_line:-unknown}" >> "$REPORT_FILE"

  # journalctl prints the size with the unit attached to the number ("1.2G"),
  # so whitespace before the unit must be optional. The unit must also end the
  # token, so a word that merely starts with G/T cannot match.
  local size_re='([0-9]+(\.[0-9]+)?)[[:space:]]*([GT])([^[:alnum:]]|$)'
  if [[ $journal_line =~ $size_re ]]; then
    local value unit
    value="${BASH_REMATCH[1]}"
    unit="${BASH_REMATCH[3]}"
    add_finding "Systemd journal is large (${value}${unit}); excessive logs can waste I/O and disk space."
  fi
}
# Apply the conservative fixes advertised in the help text.
# Does nothing unless APPLY_SAFE_FIXES is "true".
#   - enable and start fstrim.timer when it is not enabled
#   - disable tlp.service when it and power-profiles-daemon are both enabled
#   - vacuum the journal down to 300M once it has reached the G/T range
# Globals: ACTIONS (via add_action)
apply_safe_fixes() {
  if [[ $APPLY_SAFE_FIXES != "true" ]]; then
    return
  fi
  log_info "Applying safe fixes..."

  if ! systemctl is-enabled fstrim.timer > /dev/null 2>&1; then
    systemctl enable --now fstrim.timer
    add_action "Enabled and started fstrim.timer."
  fi

  if systemctl is-enabled tlp.service > /dev/null 2>&1 \
    && systemctl is-enabled power-profiles-daemon.service > /dev/null 2>&1; then
    systemctl disable --now tlp.service
    add_action "Disabled tlp.service to avoid conflict with power-profiles-daemon."
  fi

  local journal_line
  journal_line=$(journalctl --disk-usage 2> /dev/null || true)
  # journalctl prints the size with the unit attached ("1.2G"); a pattern that
  # insists on a space before the unit never matches, so the vacuum would
  # never run. Whitespace is optional and the unit must end the token.
  local size_re='[0-9]+(\.[0-9]+)?[[:space:]]*[GT]([^[:alnum:]]|$)'
  if [[ $journal_line =~ $size_re ]]; then
    journalctl --vacuum-size=300M
    add_action "Vacuumed systemd journal to 300M."
  fi
}
# Print the end-of-run summary: report location, collected findings, collected
# actions/recommendations and suggested follow-up commands.
# Globals: REPORT_FILE, FINDINGS, ACTIONS (all read-only here)
print_summary() {
  local -r rule="=============================="

  printf '\n%s\n%s\n%s\n' "$rule" " Arch Performance Diagnostics" "$rule"
  printf 'Report: %s\n\n' "$REPORT_FILE"

  if [[ ${#FINDINGS[@]} -gt 0 ]]; then
    log_warn "Likely issues found (${#FINDINGS[@]}):"
    # printf repeats the format once per array element.
    printf ' - %s\n' "${FINDINGS[@]}"
  else
    log_ok "No high-confidence bottlenecks detected by automated checks."
  fi

  if [[ ${#ACTIONS[@]} -gt 0 ]]; then
    printf '\n'
    log_info "Actions/recommendations:"
    printf ' - %s\n' "${ACTIONS[@]}"
  fi

  printf '\n'
  printf '%s\n' \
    "Recommended next command for deep per-process analysis:" \
    " sudo iotop -oPa" \
    " top" \
    " systemd-analyze blame"
}
# Orchestrate a full run: parse options, enforce root when a privileged mode
# was requested, create the report directory, then execute every diagnostic
# step in a fixed order and finish with the summary.
# Arguments: the script's command-line arguments
main() {
  parse_args "$@"
  check_root_if_needed "$@"

  mkdir -p "$REPORT_DIR"
  log_info "Writing diagnostic report to: $REPORT_FILE"

  # Steps run strictly in this order; none of them takes arguments.
  local -a steps=(
    collect_basics
    install_optional_tools
    check_cpu_governor
    check_thermal_state
    check_power_services
    check_storage_health
    check_memory_pressure
    check_gpu_state
    check_journal_size
    apply_safe_fixes
    print_summary
  )
  local step
  for step in "${steps[@]}"; do
    "$step"
  done
}
# Script entry point: pass every command-line argument through to main unchanged.
main "$@"