testsAndMisc/linux_configuration/scripts/utils/analyze_repo.sh

#!/bin/bash
# Analyze a git repository for most-used keywords, functions, etc.
# Usage: ./analyze_repo.sh [repo_url_or_local_path] [output_dir] [--no-ignore]
#
# Examples:
#   ./analyze_repo.sh https://github.com/torvalds/linux    # Clone from URL
#   ./analyze_repo.sh /path/to/local/repo                  # Use local directory
#   ./analyze_repo.sh .                                    # Analyze current directory
#   ./analyze_repo.sh . /tmp/out --no-ignore               # Include node_modules, etc.

set -e

# Print "$1" as an absolute path. Relative paths are anchored at the current
# directory; the path does not need to exist and symlinks are not resolved.
_absolute_path() {
	case "$1" in
	/*) printf '%s\n' "$1" ;;
	*) printf '%s\n' "${PWD%/}/$1" ;;
	esac
}

# Parse arguments.
# The first non-option argument is the repository (URL or local path), the
# second is the output directory; further positional arguments are ignored.
# --no-ignore may appear anywhere and disables all exclusion filtering.
INPUT=""
WORK_DIR=""
RESPECT_GITIGNORE=true

for arg in "$@"; do
	case "$arg" in
	--no-ignore)
		RESPECT_GITIGNORE=false
		;;
	*)
		if [ -z "$INPUT" ]; then
			INPUT="$arg"
		elif [ -z "$WORK_DIR" ]; then
			WORK_DIR="$arg"
		fi
		;;
	esac
done

INPUT="${INPUT:-https://github.com/torvalds/linux}"
# The script later changes into the repository directory, so a relative output
# directory must be anchored to the invocation directory right now; every
# derived path (clone location, results directory) is built from WORK_DIR.
WORK_DIR=$(_absolute_path "${WORK_DIR:-/tmp/repo_analysis}")
TOP_N=50 # Number of top results to show

# Directories to exclude (unless --no-ignore is used).
# Written as an extended-regex alternation with literal dots escaped: it is
# interpolated into grep -Ev "/(...)/" filters, so any path that has one of
# these names as a directory component is dropped from find results.
EXCLUDE_DIRS="node_modules|\.git|vendor|\.venv|venv|__pycache__|\.cache|build|dist|\.next|\.nuxt|target|\.tox|\.eggs"

# Succeeds when the argument looks like a remote repository address
# (http://, https://, ssh:// or scp-style git@...), fails for anything else.
is_url() {
	case "$1" in
	http://* | https://* | git@* | ssh://*)
		return 0
		;;
	*)
		return 1
		;;
	esac
}

# Decide where the repository lives.
#   URL input   -> it will be cloned to $WORK_DIR/<repo name>
#   local input -> the directory is used in place (resolved to an absolute path)
# Anything that is neither a URL nor an existing directory is a fatal error.
IS_LOCAL=false
if is_url "$INPUT"; then
	REPO_URL="$INPUT"
	# basename with a suffix argument strips a trailing ".git" if present
	REPO_NAME=$(basename -- "$REPO_URL" .git)
	REPO_DIR="$WORK_DIR/$REPO_NAME"
else
	# Local path - resolve to absolute path
	IS_LOCAL=true
	if [ ! -d "$INPUT" ]; then
		# Diagnostics go to stderr so they are not mixed into captured output
		echo "Error: '$INPUT' is not a valid directory or URL" >&2
		exit 1
	fi
	# "--" keeps a directory name that starts with "-" from being read as an option
	REPO_DIR=$(cd -- "$INPUT" && pwd)
	REPO_NAME=$(basename -- "$REPO_DIR")
fi

# All generated reports for this repository are written below this directory
RESULTS_DIR="$WORK_DIR/results_${REPO_NAME}"

# Colors for output.
# These are ANSI escape sequences stored with a literal backslash (single
# quotes); they only become real escape characters when printed through
# `echo -e`, which is how every use in this script emits them.
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color (resets attributes after a colored span)

# Print a major section banner: a blank line, the title ($1) in green framed
# by two blue rules, then another blank line.
print_header() {
	local rule="${BLUE}════════════════════════════════════════════════════════════${NC}"

	printf '\n'
	echo -e "$rule"
	echo -e "${GREEN}  $1${NC}"
	echo -e "$rule"
	printf '\n'
}

# Print a minor section title ($1) in yellow, surrounded by blank lines.
print_subheader() {
	local title="${YELLOW}--- $1 ---${NC}"

	printf '\n'
	echo -e "$title"
	printf '\n'
}

# Succeeds when the current directory is inside a git repository according to
# `git rev-parse`; produces no output either way.
is_git_repo() {
	git rev-parse --is-inside-work-tree >/dev/null 2>&1
}

# List files under the current directory whose names match any of the given
# glob patterns, one path per line.
# Usage: find_files "*.c" or find_files "*.py" "*.pyx"
#
# Globals read: RESPECT_GITIGNORE, EXCLUDE_DIRS
# Selection strategy:
#   - filtering on, inside a git repo : `git ls-files` (tracked files plus
#     untracked files that are not ignored), paths relative without "./"
#   - filtering on, not a git repo    : `find`, minus EXCLUDE_DIRS directories
#   - filtering off (--no-ignore)     : plain `find`
find_files() {
	local patterns=("$@")
	local pattern

	if [ "$RESPECT_GITIGNORE" = true ] && is_git_repo; then
		# Tracked files + untracked (but not ignored) files; a file can show up
		# in both listings, hence the de-duplicating sort.
		{
			git ls-files -- "${patterns[@]}" 2>/dev/null
			git ls-files --others --exclude-standard -- "${patterns[@]}" 2>/dev/null
		} | sort -u
		return
	fi

	# Build: -name P1 -o -name P2 ... (grouped in parentheses by the caller below)
	local name_tests=()
	for pattern in "${patterns[@]}"; do
		if [ ${#name_tests[@]} -gt 0 ]; then
			name_tests+=(-o)
		fi
		name_tests+=(-name "$pattern")
	done

	if [ "$RESPECT_GITIGNORE" = true ]; then
		# Not a git repo - fall back to manual exclusion
		find . -type f \( "${name_tests[@]}" \) 2>/dev/null | grep -Ev "/($EXCLUDE_DIRS)/"
	else
		# No filtering - find all files
		find . -type f \( "${name_tests[@]}" \) 2>/dev/null
	fi
}

# Print how many files find_files reports for the given patterns
# (same arguments and exclusion rules as find_files).
count_files() {
	wc -l < <(find_files "$@")
}

#==============================================================================
# STEP 0: Install Missing Tools
#==============================================================================
# Check for the external tools this script uses and try to install whatever is
# missing with the first supported package manager found (pacman, apt-get,
# dnf, brew, in that order).
#
# Globals read: GREEN, YELLOW, RED, NC
# Side effects: may run sudo/package-manager/cargo commands.
# Exits the script with status 1 when tools are missing and no supported
# package manager is available. All working variables are function-local.
install_missing_tools() {
	local MISSING_TOOLS=()
	local MISSING_AUR=()
	local APT_PACKAGES=()
	local ALL_TOOLS=()
	local AUR_HELPER TEMP_DIR tool aur_tool

	# Check for required tools
	command -v git &>/dev/null || MISSING_TOOLS+=("git")
	command -v ctags &>/dev/null || MISSING_TOOLS+=("ctags")
	command -v cscope &>/dev/null || MISSING_TOOLS+=("cscope")
	command -v clang &>/dev/null || MISSING_TOOLS+=("clang")
	command -v ugrep &>/dev/null || MISSING_TOOLS+=("ugrep")

	# Check for AUR tools
	command -v tokei &>/dev/null || MISSING_AUR+=("tokei")
	command -v scc &>/dev/null || MISSING_AUR+=("scc")

	# Check for Rust 'counts' tool (install via cargo if missing).
	# This one is optional: fast_count has a Python fallback.
	if ! command -v counts &>/dev/null; then
		if command -v cargo &>/dev/null; then
			echo "Installing 'counts' via cargo (fast word counter)..."
			cargo install counts 2>/dev/null || echo "Warning: counts install failed, will use Python fallback"
		fi
	fi

	# If nothing is missing, return
	if [ ${#MISSING_TOOLS[@]} -eq 0 ] && [ ${#MISSING_AUR[@]} -eq 0 ]; then
		echo -e "${GREEN}All required tools are installed.${NC}"
		return 0
	fi

	echo -e "${YELLOW}Missing tools detected. Installing...${NC}"

	# Detect package manager
	if command -v pacman &>/dev/null; then
		# Arch Linux
		if [ ${#MISSING_TOOLS[@]} -gt 0 ]; then
			echo "Installing from official repos: ${MISSING_TOOLS[*]}"
			sudo pacman -S --needed --noconfirm "${MISSING_TOOLS[@]}"
		fi

		if [ ${#MISSING_AUR[@]} -gt 0 ]; then
			# Find or install AUR helper
			if command -v yay &>/dev/null; then
				AUR_HELPER="yay"
			elif command -v paru &>/dev/null; then
				AUR_HELPER="paru"
			else
				echo "No AUR helper found. Installing yay..."
				sudo pacman -S --needed --noconfirm base-devel git
				TEMP_DIR=$(mktemp -d)
				git clone https://aur.archlinux.org/yay.git "$TEMP_DIR/yay"
				# Subshell so the build's cd does not change our working directory
				(cd "$TEMP_DIR/yay" && makepkg -si --noconfirm)
				rm -rf "$TEMP_DIR"
				AUR_HELPER="yay"
			fi

			echo "Installing from AUR: ${MISSING_AUR[*]}"
			"$AUR_HELPER" -S --needed --noconfirm "${MISSING_AUR[@]}"
		fi

	elif command -v apt-get &>/dev/null; then
		# Debian/Ubuntu
		echo "Installing tools via apt..."
		sudo apt-get update

		# Map tool names to package names
		APT_PACKAGES=()
		for tool in "${MISSING_TOOLS[@]}"; do
			case $tool in
			ctags) APT_PACKAGES+=("universal-ctags") ;;
			ugrep) APT_PACKAGES+=("ugrep") ;;
			*) APT_PACKAGES+=("$tool") ;;
			esac
		done

		if [ ${#APT_PACKAGES[@]} -gt 0 ]; then
			sudo apt-get install -y "${APT_PACKAGES[@]}"
		fi

		# Install tokei/scc via cargo or snap
		for aur_tool in "${MISSING_AUR[@]}"; do
			if command -v cargo &>/dev/null; then
				echo "Installing $aur_tool via cargo..."
				cargo install "$aur_tool"
			elif command -v snap &>/dev/null; then
				echo "Installing $aur_tool via snap..."
				sudo snap install "$aur_tool"
			else
				# Non-fatal: the script continues without this tool
				echo -e "${YELLOW}Warning: Cannot install $aur_tool. Install cargo or snap first.${NC}"
			fi
		done

	elif command -v dnf &>/dev/null; then
		# Fedora
		echo "Installing tools via dnf..."
		sudo dnf install -y "${MISSING_TOOLS[@]}" "${MISSING_AUR[@]}" 2>/dev/null || {
			# tokei/scc might need cargo
			for aur_tool in "${MISSING_AUR[@]}"; do
				if command -v cargo &>/dev/null; then
					cargo install "$aur_tool"
				fi
			done
		}

	elif command -v brew &>/dev/null; then
		# macOS with Homebrew
		echo "Installing tools via brew..."
		ALL_TOOLS=("${MISSING_TOOLS[@]}" "${MISSING_AUR[@]}")
		brew install "${ALL_TOOLS[@]}"

	else
		echo -e "${RED}Unknown package manager. Please install these tools manually:${NC}"
		echo "  Official: ${MISSING_TOOLS[*]}"
		echo "  Additional: ${MISSING_AUR[*]}"
		exit 1
	fi

	echo -e "${GREEN}Tool installation complete.${NC}"
}

# Run the tool check first: later steps invoke these tools directly.
print_header "STEP 0: Checking/Installing Required Tools"
install_missing_tools

# Create directories: the working area (clone destination for URL input) and
# the per-repository results directory that every later step writes into.
mkdir -p "$WORK_DIR" "$RESULTS_DIR"

#==============================================================================
# STEP 1: Clone or Use Local Repository
#==============================================================================
print_header "STEP 1: Repository Setup"

case "$IS_LOCAL" in
true)
	# Local input: nothing to fetch, just make sure the directory is still there
	echo "Using local repository: $REPO_DIR"
	[ -d "$REPO_DIR" ] || {
		echo "Error: Directory does not exist: $REPO_DIR"
		exit 1
	}
	;;
*)
	# Remote URL: clone on first use, otherwise try to refresh the existing clone
	if [ ! -d "$REPO_DIR" ]; then
		echo "Cloning $REPO_URL (shallow clone for speed)..."
		git clone --depth 1 "$REPO_URL" "$REPO_DIR"
	else
		echo "Repository already exists at $REPO_DIR"
		echo "Updating..."
		cd "$REPO_DIR"
		if ! git pull --depth 1 2>/dev/null; then
			echo "Update skipped (shallow clone)"
		fi
	fi
	;;
esac

# Every later step works with paths relative to the repository root
cd "$REPO_DIR"
echo "Repository: $REPO_NAME"
echo "Location: $REPO_DIR"
echo "Repository size: $(du -sh . | cut -f1)"

# Report the file count using the same exclusion rules as the analysis steps
if [ "$RESPECT_GITIGNORE" != true ]; then
	echo "Files: $(find . -type f | wc -l)"
elif is_git_repo; then
	# Tracked files plus untracked-but-not-ignored files, de-duplicated
	FILE_COUNT=$(
		{
			git ls-files 2>/dev/null
			git ls-files --others --exclude-standard 2>/dev/null
		} | sort -u | wc -l
	)
	echo "Files: $FILE_COUNT (respecting .gitignore)"
else
	echo "Files: $(find . -type f 2>/dev/null | grep -Ev "/($EXCLUDE_DIRS)/" | wc -l) (excluding common dirs)"
fi

#==============================================================================
# STEP 2: Basic Statistics with tokei
#==============================================================================
print_header "STEP 2: Code Statistics with tokei"

# The installer may legitimately continue without tokei/scc (it only warns
# when they cannot be installed), so skip cleanly instead of failing with
# "command not found" and leaving an empty report behind.
if command -v tokei &>/dev/null; then
	echo "Running tokei..."
	tokei . | tee "$RESULTS_DIR/tokei_stats.txt"
else
	echo -e "${YELLOW}tokei not found - skipping this step${NC}"
fi

#==============================================================================
# STEP 3: Code Statistics with scc
#==============================================================================
print_header "STEP 3: Code Statistics with scc (includes complexity)"

if command -v scc &>/dev/null; then
	echo "Running scc..."
	scc . | tee "$RESULTS_DIR/scc_stats.txt"

	print_subheader "Top 10 Most Complex Files"
	# Keep the first 20 lines of the per-file report sorted by complexity
	scc --by-file --sort complexity . 2>/dev/null | head -n 20 | tee "$RESULTS_DIR/scc_complexity.txt"
else
	echo -e "${YELLOW}scc not found - skipping this step${NC}"
fi
#==============================================================================
# STEP 4: Fast Keyword Analysis (Code vs Comments) - Multi-Language
#==============================================================================
print_header "STEP 4: Fast Keyword Analysis (Code vs Comments)"

# Count identical lines read from stdin and print the most frequent ones.
# Usage: ... | fast_count [top_n]      (top_n defaults to 50)
#
# Uses the 'counts' tool when it is available, otherwise a Python Counter that
# prints "<count> <line>" rows in descending order of frequency.
fast_count() {
	local top_n="${1:-50}"
	if command -v counts &>/dev/null; then
		# Drop the single summary line 'counts' prints first, then keep top_n
		# rows. Skipping by position (rather than head N+1 | tail N) keeps the
		# summary out of the result even when there are fewer than top_n rows.
		counts 2>/dev/null | tail -n +2 | head -n "$top_n"
	else
		# top_n is passed as an argument instead of being spliced into the
		# program text, so a malformed value cannot alter the Python code.
		python3 -c '
import sys
from collections import Counter
limit = int(sys.argv[1])
tally = Counter(line.rstrip() for line in sys.stdin)
for word, count in tally.most_common(limit):
    print(f"{count} {word}")
' "$top_n"
	fi
}

#------------------------------------------------------------------------------
# Language Detection and Configuration
#------------------------------------------------------------------------------
print_subheader "Detecting languages in repository..."

# Tell the user which exclusion mechanism is in effect and how to turn it off
if [ "$RESPECT_GITIGNORE" = true ]; then
	if is_git_repo; then
		FILTER_NOTE="Note: Respecting .gitignore (excludes node_modules, build outputs, etc.)"
	else
		FILTER_NOTE="Note: Excluding common directories (node_modules, .git, vendor, etc.)"
	fi
	echo -e "${YELLOW}${FILTER_NOTE}${NC}"
	echo "      Use --no-ignore to include everything."
	echo ""
fi

# Count files by extension to detect primary languages (using helper).
# The pattern lists mirror the ones used when the sources are collected for
# analysis; in particular *.jsx is counted as JavaScript so that a JSX-only
# project is still detected.
declare -A LANG_FILES
LANG_FILES[c]=$(count_files "*.c")
LANG_FILES[cpp]=$(count_files "*.cpp" "*.cc" "*.cxx")
LANG_FILES[h]=$(count_files "*.h" "*.hpp" "*.hxx")
LANG_FILES[python]=$(count_files "*.py")
LANG_FILES[javascript]=$(count_files "*.js" "*.jsx")
LANG_FILES[typescript]=$(count_files "*.ts" "*.tsx")
LANG_FILES[java]=$(count_files "*.java")
LANG_FILES[go]=$(count_files "*.go")
LANG_FILES[rust]=$(count_files "*.rs")
LANG_FILES[ruby]=$(count_files "*.rb")
LANG_FILES[shell]=$(count_files "*.sh" "*.bash")

# Fixed order for display (associative arrays have no defined iteration order)
echo "Files found by language:"
for lang in c cpp h python javascript typescript java go rust ruby shell; do
	count=${LANG_FILES[$lang]}
	if [ "$count" -gt 0 ]; then
		echo "  $lang: $count files"
	fi
done

# Derive one boolean flag per language family from the file counts.
# Each flag starts out false and flips to true when at least one matching
# file was counted. The values are the words true/false so they can be used
# directly as commands in `if $HAS_X; then`.
HAS_C_FAMILY=false
if ((${LANG_FILES[c]} + ${LANG_FILES[cpp]} + ${LANG_FILES[h]} > 0)); then
	HAS_C_FAMILY=true
fi

HAS_PYTHON=false
if ((${LANG_FILES[python]} > 0)); then
	HAS_PYTHON=true
fi

HAS_JS_FAMILY=false
if ((${LANG_FILES[javascript]} + ${LANG_FILES[typescript]} > 0)); then
	HAS_JS_FAMILY=true
fi

HAS_SHELL=false
if ((${LANG_FILES[shell]} > 0)); then
	HAS_SHELL=true
fi

HAS_RUBY=false
if ((${LANG_FILES[ruby]} > 0)); then
	HAS_RUBY=true
fi

HAS_GO=false
if ((${LANG_FILES[go]} > 0)); then
	HAS_GO=true
fi

HAS_RUST=false
if ((${LANG_FILES[rust]} > 0)); then
	HAS_RUST=true
fi

HAS_JAVA=false
if ((${LANG_FILES[java]} > 0)); then
	HAS_JAVA=true
fi

#------------------------------------------------------------------------------
# Language-specific keyword definitions
#------------------------------------------------------------------------------
# Each table is a "|"-separated alternation that is later wrapped as
# \b(...)\b and matched against comment-stripped source text.

# C/C++ keywords
KEYWORDS_C="auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto|if|int|long|register|return|short|signed|sizeof|static|struct|switch|typedef|union|unsigned|void|volatile|while|inline|restrict|_Bool|_Complex|_Imaginary"
KEYWORDS_CPP="$KEYWORDS_C|alignas|alignof|and|and_eq|asm|atomic_cancel|atomic_commit|atomic_noexcept|bitand|bitor|bool|catch|char16_t|char32_t|char8_t|class|co_await|co_return|co_yield|compl|concept|const_cast|consteval|constexpr|constinit|decltype|delete|dynamic_cast|explicit|export|false|friend|mutable|namespace|new|noexcept|not|not_eq|nullptr|operator|or|or_eq|override|private|protected|public|reflexpr|reinterpret_cast|requires|static_assert|static_cast|synchronized|template|this|thread_local|throw|true|try|typeid|typename|using|virtual|wchar_t|xor|xor_eq"

# Python keywords
KEYWORDS_PYTHON="False|None|True|and|as|assert|async|await|break|class|continue|def|del|elif|else|except|finally|for|from|global|if|import|in|is|lambda|nonlocal|not|or|pass|raise|return|try|while|with|yield"

# JavaScript/TypeScript keywords
KEYWORDS_JS="abstract|arguments|await|boolean|break|byte|case|catch|char|class|const|continue|debugger|default|delete|do|double|else|enum|eval|export|extends|false|final|finally|float|for|function|goto|if|implements|import|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|super|switch|synchronized|this|throw|throws|transient|true|try|typeof|undefined|var|void|volatile|while|with|yield"
KEYWORDS_TS="$KEYWORDS_JS|any|as|asserts|bigint|declare|get|infer|intrinsic|is|keyof|module|namespace|never|out|override|readonly|require|set|string|symbol|type|unique|unknown"

# Go keywords
KEYWORDS_GO="break|case|chan|const|continue|default|defer|else|fallthrough|for|func|go|goto|if|import|interface|map|package|range|return|select|struct|switch|type|var"

# Rust keywords
KEYWORDS_RUST="as|async|await|break|const|continue|crate|dyn|else|enum|extern|false|fn|for|if|impl|in|let|loop|match|mod|move|mut|pub|ref|return|self|Self|static|struct|super|trait|true|type|unsafe|use|where|while"

# Ruby keywords
KEYWORDS_RUBY="BEGIN|END|alias|and|begin|break|case|class|def|defined|do|else|elsif|end|ensure|false|for|if|in|module|next|nil|not|or|redo|rescue|retry|return|self|super|then|true|undef|unless|until|when|while|yield"

# Java keywords (reserved words plus the true/false/null literals).
# The per-language keyword analysis looks this variable up; without it the
# Java keyword report is silently skipped.
KEYWORDS_JAVA="abstract|assert|boolean|break|byte|case|catch|char|class|const|continue|default|do|double|else|enum|extends|false|final|finally|float|for|goto|if|implements|import|instanceof|int|interface|long|native|new|null|package|private|protected|public|record|return|short|static|strictfp|super|switch|synchronized|this|throw|throws|transient|true|try|var|void|volatile|while"

# Shell keywords: bash reserved words plus the builtins that steer control
# flow and variable scope. Also looked up by the keyword analysis.
KEYWORDS_SHELL="case|coproc|do|done|elif|else|esac|fi|for|function|if|in|select|then|time|until|while|break|continue|return|exit|local|export|readonly|declare|unset|shift|source|trap|eval|exec|set"
#------------------------------------------------------------------------------
# Multi-language comment processing - KEEP LANGUAGES SEPARATE
#------------------------------------------------------------------------------
print_subheader "Processing source files (separating code from comments)..."

# Create per-language output directory
mkdir -p "$RESULTS_DIR/per_language"
# COMMENTS_TEMP accumulates comment text from every language;
# LANG_CODE_FILES maps a language key to a temp file holding that language's
# concatenated, comment-stripped source.
COMMENTS_TEMP=$(mktemp)
trap 'rm -f "$COMMENTS_TEMP" /tmp/code_*.tmp 2>/dev/null' EXIT

declare -A LANG_CODE_FILES

# collect_sources KEY TAG LIMIT PATTERN...
# Concatenate at most LIMIT files matching the patterns into a fresh temp file
# named /tmp/code_<TAG>.<random>.tmp and record its path in LANG_CODE_FILES[KEY].
collect_sources() {
	local key="$1" tag="$2" limit="$3"
	shift 3
	LANG_CODE_FILES[$key]=$(mktemp "/tmp/code_${tag}.XXXXXX.tmp")
	# File names are handed to xargs NUL-separated so names containing spaces
	# or quotes are read correctly. Unreadable or vanished files (for example
	# tracked files deleted from the work tree) are skipped: "|| true" keeps
	# xargs' non-zero status from aborting the script under `set -e`.
	find_files "$@" | head -n "$limit" | tr '\n' '\0' |
		xargs -0 cat 2>/dev/null >"${LANG_CODE_FILES[$key]}" || true
}

# split_c_style_comments FILE
# Append the text of every /* ... */ and // comment in FILE to COMMENTS_TEMP,
# then rewrite FILE with those comments removed. This is regex based, so
# comment markers inside string literals are treated as comments too.
split_c_style_comments() {
	local src="$1"
	perl -0777 -ne 'while (/\/\*(.+?)\*\//gs) { print "$1\n"; } while (/\/\/([^\n]*)/g) { print "$1\n"; }' "$src" >>"$COMMENTS_TEMP"
	perl -0777 -pe 's|/\*.*?\*/||gs; s|//[^\n]*||g;' "$src" >"${src}.clean"
	mv "${src}.clean" "$src"
}

# Process C/C++ files (*.hxx included to match the header patterns counted earlier)
if $HAS_C_FAMILY; then
	echo "Processing C/C++ files..."
	collect_sources c_cpp c_cpp 15000 "*.c" "*.cpp" "*.cc" "*.cxx" "*.h" "*.hpp" "*.hxx"
	split_c_style_comments "${LANG_CODE_FILES[c_cpp]}"
fi

# Process JavaScript files (separate from TypeScript)
if $HAS_JS_FAMILY; then
	echo "Processing JavaScript files..."
	collect_sources javascript js 15000 "*.js" "*.jsx"

	echo "Processing TypeScript files..."
	collect_sources typescript ts 15000 "*.ts" "*.tsx"

	# Extract and strip comments from both; an empty file means that half of
	# the family is absent and there is nothing to process
	for lang_file in "${LANG_CODE_FILES[javascript]}" "${LANG_CODE_FILES[typescript]}"; do
		[ -s "$lang_file" ] || continue
		split_c_style_comments "$lang_file"
	done
fi

# Process Python files
if $HAS_PYTHON; then
	echo "Processing Python files..."
	collect_sources python python 15000 "*.py"

	# "#" comments, then triple-quoted strings (both quote styles) are treated
	# as comment text; the same three constructs are then removed from the code
	perl -ne 'if (/^\s*#(.*)/) { print "$1\n"; } elsif (/#(.*)$/) { print "$1\n"; }' "${LANG_CODE_FILES[python]}" >>"$COMMENTS_TEMP"
	perl -0777 -ne 'while (/"""(.+?)"""/gs) { print "$1\n"; } while (/'"'"''"'"''"'"'(.+?)'"'"''"'"''"'"'/gs) { print "$1\n"; }' "${LANG_CODE_FILES[python]}" >>"$COMMENTS_TEMP"
	perl -pe 's/#.*$//' "${LANG_CODE_FILES[python]}" | perl -0777 -pe 's/""".*?"""//gs; s/'"'"''"'"''"'"'.*?'"'"''"'"''"'"'//gs' >"${LANG_CODE_FILES[python]}.clean"
	mv "${LANG_CODE_FILES[python]}.clean" "${LANG_CODE_FILES[python]}"
fi

# Process Go files
if $HAS_GO; then
	echo "Processing Go files..."
	collect_sources go go 15000 "*.go"
	split_c_style_comments "${LANG_CODE_FILES[go]}"
fi

# Process Rust files
if $HAS_RUST; then
	echo "Processing Rust files..."
	collect_sources rust rust 15000 "*.rs"
	split_c_style_comments "${LANG_CODE_FILES[rust]}"
fi

# Process Ruby files
if $HAS_RUBY; then
	echo "Processing Ruby files..."
	collect_sources ruby ruby 5000 "*.rb"

	# "#" comments plus =begin ... =end blocks
	perl -ne 'if (/#(.*)$/) { print "$1\n"; }' "${LANG_CODE_FILES[ruby]}" >>"$COMMENTS_TEMP"
	perl -0777 -ne 'while (/=begin(.+?)=end/gs) { print "$1\n"; }' "${LANG_CODE_FILES[ruby]}" >>"$COMMENTS_TEMP"
	perl -pe 's/#.*$//' "${LANG_CODE_FILES[ruby]}" | perl -0777 -pe 's/=begin.*?=end//gs' >"${LANG_CODE_FILES[ruby]}.clean"
	mv "${LANG_CODE_FILES[ruby]}.clean" "${LANG_CODE_FILES[ruby]}"
fi

# Process Shell files
if $HAS_SHELL; then
	echo "Processing Shell files..."
	collect_sources shell shell 5000 "*.sh" "*.bash"

	# Shebang lines are interpreter directives, not comments: skip them before
	# looking for "#" so they never reach the comment word counts
	perl -ne 'next if /^#!/; if (/^\s*#(.*)/) { print "$1\n"; } elsif (/#(.*)$/) { print "$1\n"; }' "${LANG_CODE_FILES[shell]}" >>"$COMMENTS_TEMP"
	perl -pe 's/#.*$//' "${LANG_CODE_FILES[shell]}" >"${LANG_CODE_FILES[shell]}.clean"
	mv "${LANG_CODE_FILES[shell]}.clean" "${LANG_CODE_FILES[shell]}"
fi

# Process Java files
if $HAS_JAVA; then
	echo "Processing Java files..."
	collect_sources java java 15000 "*.java"
	split_c_style_comments "${LANG_CODE_FILES[java]}"
fi

COMMENT_LINES=$(wc -l <"$COMMENTS_TEMP")
echo ""
echo "Processed languages: ${!LANG_CODE_FILES[*]}"
echo "Total comment lines: $COMMENT_LINES"

#------------------------------------------------------------------------------
# Per-Language Keyword Analysis - Each language gets its own file
#------------------------------------------------------------------------------
print_subheader "Per-Language Keyword Analysis"

# Keyword alternation to use for each language key in LANG_CODE_FILES
declare -A LANG_KEYWORDS=(
	[c_cpp]="$KEYWORDS_CPP"
	[python]="$KEYWORDS_PYTHON"
	[javascript]="$KEYWORDS_JS"
	[typescript]="$KEYWORDS_TS"
	[go]="$KEYWORDS_GO"
	[rust]="$KEYWORDS_RUST"
	[ruby]="$KEYWORDS_RUBY"
	[shell]="$KEYWORDS_SHELL"
	[java]="$KEYWORDS_JAVA"
)

# One report per language; a language is skipped when it has no collected
# code or no keyword table
for lang in "${!LANG_CODE_FILES[@]}"; do
	code_file="${LANG_CODE_FILES[$lang]}"
	keywords="${LANG_KEYWORDS[$lang]}"
	output_file="$RESULTS_DIR/per_language/keywords_${lang}.txt"

	[[ -f "$code_file" && -s "$code_file" && -n "$keywords" ]] || continue

	echo ""
	echo -e "${YELLOW}=== $lang Keywords ===${NC}"
	# Whole-word keyword hits, tallied, shown on screen and saved
	ugrep -o "\b($keywords)\b" "$code_file" 2>/dev/null |
		fast_count 50 |
		tee "$output_file"
done

#------------------------------------------------------------------------------
# Per-Language Function Analysis
#------------------------------------------------------------------------------
print_subheader "Per-Language Function Calls"

# For every language with collected code, tally identifiers that are directly
# followed by "(" and keep the 30 most frequent
for lang in "${!LANG_CODE_FILES[@]}"; do
	code_file="${LANG_CODE_FILES[$lang]}"
	output_file="$RESULTS_DIR/per_language/functions_${lang}.txt"

	[[ -f "$code_file" && -s "$code_file" ]] || continue

	echo ""
	echo -e "${YELLOW}=== $lang Functions ===${NC}"
	# 1) grab "name(" tokens  2) strip the parenthesis and any space before it
	# 3) drop control-flow words that merely look like calls  4) tally
	ugrep -o '\b[a-zA-Z_][a-zA-Z0-9_]*\s*\(' "$code_file" 2>/dev/null |
		sed 's/\s*(//' |
		grep -vE '^(if|for|while|switch|catch|elif)$' |
		fast_count 30 |
		tee "$output_file"
done

#------------------------------------------------------------------------------
# Per-Language Import Analysis
#------------------------------------------------------------------------------
print_subheader "Per-Language Imports/Includes"

# report_imports KEY TITLE PATTERN GAP TRIM
# Tally the import-like statements of one language and save the 30 most
# frequent to per_language/imports_<KEY>.txt.
#   KEY      language key in LANG_CODE_FILES
#   TITLE    text shown between the "===" markers
#   PATTERN  ugrep expression selecting one import statement
#   GAP      "yes" to print a blank line before the title
#   TRIM     "yes" to strip leading whitespace from each match before tallying
# Does nothing when the language has no collected code.
report_imports() {
	local key="$1" title="$2" pattern="$3" gap="$4" trim="$5"
	local source_file="${LANG_CODE_FILES[$key]}"
	local report_file="$RESULTS_DIR/per_language/imports_${key}.txt"

	if [ -n "$source_file" ] && [ -s "$source_file" ]; then
		if [ "$gap" = yes ]; then
			echo ""
		fi
		echo -e "${YELLOW}=== ${title} ===${NC}"
		if [ "$trim" = yes ]; then
			ugrep -o "$pattern" "$source_file" 2>/dev/null |
				sed 's/^\s*//' |
				fast_count 30 |
				tee "$report_file"
		else
			ugrep -o "$pattern" "$source_file" 2>/dev/null |
				fast_count 30 |
				tee "$report_file"
		fi
	fi
}

# JavaScript and TypeScript share one expression: ES "import ... from '...'"
# or CommonJS require('...')
ES_IMPORT_PATTERN="(import\s+.*\s+from\s+['\"][^'\"]+['\"]|require\s*\(['\"][^'\"]+['\"]\))"

# C/C++ includes
report_imports c_cpp "C/C++ Includes" '#include\s*[<"][^>"]+[>"]' no no

# Python imports
report_imports python "Python Imports" '^\s*(from\s+\S+\s+import\s+\S+|import\s+\S+)' yes yes

# JavaScript imports
report_imports javascript "JavaScript Imports" "$ES_IMPORT_PATTERN" yes no

# TypeScript imports
report_imports typescript "TypeScript Imports" "$ES_IMPORT_PATTERN" yes no

# Go imports (any quoted string containing a slash)
report_imports go "Go Imports" '"[^"]+/[^"]+"' yes no

# Rust use statements
report_imports rust "Rust Use Statements" '^\s*use\s+[^;]+' yes yes

# Java imports
report_imports java "Java Imports" '^\s*import\s+[^;]+' yes yes

# Ruby requires
report_imports ruby "Ruby Requires" "(require\s+['\"][^'\"]+['\"]|require_relative\s+['\"][^'\"]+['\"])" yes no

# Shell sources
report_imports shell "Shell Sources" '(source\s+[^\s]+|\.\s+[^\s]+)' yes no

#------------------------------------------------------------------------------
# Combined Analysis (for overview/backward compatibility)
#------------------------------------------------------------------------------
print_subheader "Combined Code Identifiers (all languages)"

# Create combined CODE_TEMP: all comment-stripped sources in one file.
# The name follows the /tmp/code_*.tmp scheme of the per-language temp files
# so the EXIT trap that removes those also removes this one if the script
# stops early.
CODE_TEMP=$(mktemp /tmp/code_combined.XXXXXX.tmp)
for lang_file in "${LANG_CODE_FILES[@]}"; do
	if [ -f "$lang_file" ]; then
		cat "$lang_file" >>"$CODE_TEMP"
	fi
done

ugrep -o '\b[a-zA-Z_][a-zA-Z0-9_]*\b' "$CODE_TEMP" 2>/dev/null |
	fast_count $TOP_N |
	tee "$RESULTS_DIR/code_identifiers.txt"

# The combined copy can be large and is not needed again
rm -f "$CODE_TEMP"

print_subheader "Most Used Words in COMMENTS"
ugrep -o '\b[a-zA-Z_][a-zA-Z0-9_]*\b' "$COMMENTS_TEMP" 2>/dev/null |
	fast_count $TOP_N |
	tee "$RESULTS_DIR/comment_words.txt"

# Create combined files from per-language analysis (for backward compatibility).
# Each file starts with two "#" header lines followed by up to 100 rows merged
# from the per-language reports, sorted by count in descending order.
{
	echo "# Combined keywords from all languages"
	echo "# Format: count keyword (from per_language/keywords_*.txt)"
	cat "$RESULTS_DIR/per_language"/keywords_*.txt 2>/dev/null | grep -v '^$' | sort -t' ' -k1 -nr | head -n 100
} >"$RESULTS_DIR/grep_keywords.txt"

{
	echo "# Combined functions from all languages"
	echo "# See per_language/functions_*.txt for language-specific breakdown"
	cat "$RESULTS_DIR/per_language"/functions_*.txt 2>/dev/null | grep -v '^$' | sort -t' ' -k1 -nr | head -n 100
} >"$RESULTS_DIR/grep_function_calls.txt"

{
	echo "# Combined imports from all languages"
	echo "# See per_language/imports_*.txt for language-specific breakdown"
	cat "$RESULTS_DIR/per_language"/imports_*.txt 2>/dev/null | grep -v '^$' | sort -t' ' -k1 -nr | head -n 100
} >"$RESULTS_DIR/grep_imports.txt"

# List what per-language files were created
echo ""
echo "Per-language analysis files created:"
find "$RESULTS_DIR/per_language/" -maxdepth 1 -type f -printf '  %f\n' 2>/dev/null || true

print_header "STEP 5: ctags Symbol Analysis"

print_subheader "Generating tags (this may take a while)..."

# Generate tags for different kinds (C and C++ only; function prototypes are
# included via --c-kinds=+fp). Failure is tolerated: the analysis below only
# runs when a tags file was actually produced.
ctags -R --languages=C,C++ --c-kinds=+fp --fields=+lK -f "$RESULTS_DIR/tags" . 2>/dev/null || true

if [ -f "$RESULTS_DIR/tags" ]; then
	# Lines starting with "!" are tags-file metadata, not symbols.
	# grep -c already prints 0 when nothing matches (while exiting non-zero),
	# so only the exit status is neutralised; the default covers a grep that
	# produced no output at all.
	TOTAL_TAGS=$(grep -ac '^[^!]' "$RESULTS_DIR/tags" 2>/dev/null || true)
	TOTAL_TAGS=${TOTAL_TAGS:-0}
	echo "Total symbols found: $TOTAL_TAGS"

	print_subheader "Most Common Symbol Names"
	# Fast: use cut + counts instead of awk + sort | uniq
	# -a flag treats tags file as text (may contain binary-like patterns)
	grep -a '^[^!]' "$RESULTS_DIR/tags" | cut -f1 | fast_count $TOP_N |
		tee "$RESULTS_DIR/ctags_symbols.txt"

	print_subheader "Symbol Types Distribution"
	# Fast: extract single-letter kind code after ;" and count, then translate
	# each kind letter into a readable label
	grep -aoP ';"\t\K[a-z]' "$RESULTS_DIR/tags" 2>/dev/null | fast_count 20 | while read -r kind_count kind; do
		case $kind in
		f) echo "$kind_count functions" ;;
		v) echo "$kind_count variables" ;;
		s) echo "$kind_count structs" ;;
		t) echo "$kind_count typedefs" ;;
		e) echo "$kind_count enum values" ;;
		g) echo "$kind_count enums" ;;
		m) echo "$kind_count struct/union members" ;;
		d) echo "$kind_count macro definitions" ;;
		p) echo "$kind_count function prototypes" ;;
		u) echo "$kind_count unions" ;;
		c) echo "$kind_count classes" ;;
		n) echo "$kind_count namespaces" ;;
		*) echo "$kind_count kind=$kind" ;;
		esac
	done | tee "$RESULTS_DIR/ctags_kinds.txt"
fi

#==============================================================================
# STEP 6: cscope Analysis
#==============================================================================
print_header "STEP 6: cscope Database Analysis"

print_subheader "Building cscope database..."

# Find all C source files (respecting .gitignore if available) and write the
# list cscope will index
if [ "$RESPECT_GITIGNORE" = true ] && is_git_repo; then
	{
		git ls-files -- '*.c' '*.h' 2>/dev/null
		git ls-files --others --exclude-standard -- '*.c' '*.h' 2>/dev/null
	} | sort -u >"$RESULTS_DIR/cscope.files"
elif [ "$RESPECT_GITIGNORE" = true ]; then
	# grep exits non-zero when no line survives the filter (no C files at
	# all); that is not an error here and must not trip `set -e`
	find . \( -name "*.c" -o -name "*.h" \) -type f 2>/dev/null |
		grep -Ev "/($EXCLUDE_DIRS)/" >"$RESULTS_DIR/cscope.files" || true
else
	find . \( -name "*.c" -o -name "*.h" \) -type f >"$RESULTS_DIR/cscope.files" 2>/dev/null
fi
FILE_COUNT=$(wc -l <"$RESULTS_DIR/cscope.files")
echo "Found $FILE_COUNT source files"

# Build cscope database (can take a while for large repos)
echo "Building database (this may take several minutes for Linux kernel)..."
cscope -b -q -i "$RESULTS_DIR/cscope.files" -f "$RESULTS_DIR/cscope.out" 2>/dev/null || true

if [ -f "$RESULTS_DIR/cscope.out" ]; then
	echo "Database built successfully"
	echo "Database size: $(du -sh "$RESULTS_DIR/cscope.out" | cut -f1)"

	# The query output is captured first: a pipeline ending in `head` always
	# succeeds, so "no results" has to be detected from empty output rather
	# than from the exit status.
	print_subheader "Example: Finding callers of 'printk' function"
	CSCOPE_HITS=$(cscope -d -f "$RESULTS_DIR/cscope.out" -L -3 printk 2>/dev/null | head -n 20)
	if [ -n "$CSCOPE_HITS" ]; then
		printf '%s\n' "$CSCOPE_HITS"
	else
		echo "No results"
	fi

	print_subheader "Example: Finding definition of 'struct file'"
	CSCOPE_HITS=$(cscope -d -f "$RESULTS_DIR/cscope.out" -L -1 "struct file" 2>/dev/null | head -n 10)
	if [ -n "$CSCOPE_HITS" ]; then
		printf '%s\n' "$CSCOPE_HITS"
	else
		echo "No results"
	fi
fi

#==============================================================================
# STEP 7: clang AST Analysis (if available)
#==============================================================================
print_header "STEP 7: clang-based Analysis (AST-level)"

print_subheader "Analyzing a sample file with clang AST dump"

# Find a simple C file to analyze (respecting .gitignore): the first regular
# *.c file smaller than 50 KiB
if [ "$RESPECT_GITIGNORE" = true ] && is_git_repo; then
	# Only the first 20 tracked candidates are inspected; a file whose size
	# cannot be read is treated as too large
	SAMPLE_FILE=$(git ls-files -- '*.c' 2>/dev/null | head -n 20 | while read -r f; do
		[ -f "$f" ] && [ "$(stat -c%s "$f" 2>/dev/null || echo 999999)" -lt 51200 ] && echo "$f"
	done | head -n 1)
elif [ "$RESPECT_GITIGNORE" = true ]; then
	SAMPLE_FILE=$(find . -name "*.c" -size -50k -type f 2>/dev/null | grep -Ev "/($EXCLUDE_DIRS)/" | head -n 1)
else
	SAMPLE_FILE=$(find . -name "*.c" -size -50k -type f 2>/dev/null | head -n 1)
fi

if [ -n "$SAMPLE_FILE" ]; then
	echo "Sample file: $SAMPLE_FILE"
	echo ""
	echo "Function declarations in this file:"
	# Capture first: the pipeline's status is that of its last stage, so a
	# failed clang run can only be recognised by the absence of output.
	CLANG_DECLS=$(clang -Xclang -ast-dump -fsyntax-only "$SAMPLE_FILE" 2>/dev/null |
		grep -E "FunctionDecl.*<.*>" |
		head -n 20 |
		sed 's/.*FunctionDecl.*<[^>]*> /  /')
	if [ -n "$CLANG_DECLS" ]; then
		printf '%s\n' "$CLANG_DECLS" | tee "$RESULTS_DIR/clang_sample_functions.txt"
	else
		# Keep the (empty) report file so the results directory has the same
		# set of files either way
		: >"$RESULTS_DIR/clang_sample_functions.txt"
		echo "Analysis failed (missing headers)"
	fi
fi

print_subheader "Note: Full clang analysis requires compile_commands.json"
echo "For proper AST analysis of the Linux kernel, you need to:"
echo "  1. Configure the kernel: make defconfig"
echo "  2. Generate compile_commands.json: make compile_commands.json"
echo "  3. Use clang-query or clang-check with the database"

#==============================================================================
# STEP 8: Summary
#==============================================================================
print_header "ANALYSIS COMPLETE"

echo "Results saved to: $RESULTS_DIR/"
echo ""
ls -la "$RESULTS_DIR/"

echo ""
echo -e "${GREEN}Quick Summary:${NC}"
echo ""

# The combined grep_*.txt files begin with two "#" header lines, so the top
# entries start on line 3 (tail -n +3). comment_words.txt has no header.

if [ -f "$RESULTS_DIR/grep_keywords.txt" ]; then
	echo "Top 5 Language Keywords (in code):"
	tail -n +3 "$RESULTS_DIR/grep_keywords.txt" | head -n 5 | awk '{printf "  %s: %s times\n", $2, $1}'
fi

echo ""
if [ -f "$RESULTS_DIR/grep_function_calls.txt" ]; then
	echo "Top 5 Function/Method Calls (in code):"
	tail -n +3 "$RESULTS_DIR/grep_function_calls.txt" | head -n 5 | awk '{printf "  %s(): %s times\n", $2, $1}'
fi

echo ""
if [ -f "$RESULTS_DIR/comment_words.txt" ]; then
	echo "Top 5 Words in Comments:"
	head -n 5 "$RESULTS_DIR/comment_words.txt" | awk '{printf "  %s: %s times\n", $2, $1}'
fi

echo ""
if [ -f "$RESULTS_DIR/grep_imports.txt" ]; then
	echo "Top 5 Imports/Includes:"
	# An import statement can contain spaces: everything after the count
	# field is printed as the statement
	tail -n +3 "$RESULTS_DIR/grep_imports.txt" | head -n 5 | awk '{count=$1; $1=""; printf "  %s: %s times\n", substr($0,2), count}'
fi

echo ""
echo -e "${BLUE}To explore interactively with cscope (C/C++ only):${NC}"
echo "  cd $REPO_DIR && cscope -d -f $RESULTS_DIR/cscope.out"
echo ""
echo -e "${BLUE}To browse tags in vim:${NC}"
echo "  cd $REPO_DIR && vim -t main"