testsAndMisc/linux_configuration/scripts/utils/analyze_repo.sh

872 lines
33 KiB
Bash
Executable File

#!/bin/bash
# Analyze a git repository for most-used keywords, functions, etc.
# Usage: ./analyze_repo.sh [repo_url_or_local_path] [output_dir] [--no-ignore]
#
# Examples:
# ./analyze_repo.sh https://github.com/torvalds/linux # Clone from URL
# ./analyze_repo.sh /path/to/local/repo # Use local directory
# ./analyze_repo.sh . # Analyze current directory
# ./analyze_repo.sh . /tmp/out --no-ignore # Include node_modules, etc.
set -e

# ---- Command-line handling --------------------------------------------------
# The first non-flag argument becomes INPUT, the second becomes WORK_DIR; any
# further positional arguments are ignored. --no-ignore may appear anywhere.
INPUT=""
WORK_DIR=""
RESPECT_GITIGNORE=true
for arg; do
  if [[ $arg == "--no-ignore" ]]; then
    RESPECT_GITIGNORE=false
  elif [[ -z $INPUT ]]; then
    INPUT="$arg"
  elif [[ -z $WORK_DIR ]]; then
    WORK_DIR="$arg"
  fi
done

# Fall back to the defaults when a positional argument was not supplied
: "${INPUT:=https://github.com/torvalds/linux}"
: "${WORK_DIR:=/tmp/repo_analysis}"

# How many entries the "top N" reports show
TOP_N=50

# Alternation of directory names (ERE) filtered out of find results when the
# target is not a git repository and --no-ignore was not given
EXCLUDE_DIRS="node_modules|\.git|vendor|\.venv|venv|__pycache__|\.cache|build|dist|\.next|\.nuxt|target|\.tox|\.eggs"
# Succeed when $1 looks like a remote repository address (http://, https://,
# ssh:// or scp-style git@...); fail for anything else, e.g. a local path.
is_url() {
  case "$1" in
    http://* | https://* | git@* | ssh://*) return 0 ;;
    *) return 1 ;;
  esac
}
# Decide whether INPUT is a local directory or something to clone, and derive
# REPO_DIR / REPO_NAME accordingly.
IS_LOCAL=false
if ! is_url "$INPUT"; then
  # Local path - must be an existing directory; resolve it to an absolute path
  IS_LOCAL=true
  if [ ! -d "$INPUT" ]; then
    echo "Error: '$INPUT' is not a valid directory or URL"
    exit 1
  fi
  REPO_DIR=$(cd "$INPUT" && pwd)
  REPO_NAME=$(basename "$REPO_DIR")
else
  # Remote repository - it will be cloned below WORK_DIR
  REPO_URL="$INPUT"
  REPO_NAME=$(basename "$REPO_URL" .git)
  REPO_DIR="$WORK_DIR/$REPO_NAME"
fi

# All reports for this repository are written here
RESULTS_DIR="$WORK_DIR/results_${REPO_NAME}"

# ANSI colour sequences; they are stored with a literal backslash and are
# interpreted by `echo -e` at print time
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset / no colour
# Print a major section banner: blank line, blue rule, title ($1) in green,
# blue rule, blank line.
print_header() {
  local rule="════════════════════════════════════════════════════════════"
  echo
  echo -e "${BLUE}${rule}${NC}"
  echo -e "${GREEN} $1${NC}"
  echo -e "${BLUE}${rule}${NC}"
  echo
}
# Print a minor section title ($1) in yellow, framed by "---" markers and
# surrounded by blank lines.
print_subheader() {
  local title_line="${YELLOW}--- $1 ---${NC}"
  echo
  echo -e "$title_line"
  echo
}
# Succeed when the current working directory is inside a git work tree.
# All git output (stdout and stderr) is discarded; only the exit status matters.
is_git_repo() {
  git rev-parse --is-inside-work-tree >/dev/null 2>&1
}
# List files below the current directory whose names match any of the given
# glob patterns, one path per line.
# Usage: find_files "*.c" or find_files "*.py" "*.pyx"
# Globals:
#   RESPECT_GITIGNORE (read) - "true" enables filtering
#   EXCLUDE_DIRS      (read) - ERE alternation of directory names to drop
# Behaviour:
#   - filtering on, inside a git work tree: tracked files plus untracked files
#     that are not ignored (git ls-files), de-duplicated and sorted
#   - filtering on, not a git repo: find, minus paths under EXCLUDE_DIRS
#   - filtering off: plain find
find_files() {
  local patterns=("$@")
  local pattern
  local find_args=()

  if [ "$RESPECT_GITIGNORE" = true ] && is_git_repo; then
    # git applies .gitignore for us; the second call adds untracked files
    # that are not ignored
    {
      git ls-files -- "${patterns[@]}" 2>/dev/null
      git ls-files --others --exclude-standard -- "${patterns[@]}" 2>/dev/null
    } | sort -u
    return
  fi

  # Build: -name P1 -o -name P2 ...
  for pattern in "${patterns[@]}"; do
    if [ "${#find_args[@]}" -gt 0 ]; then
      find_args+=(-o)
    fi
    find_args+=(-name "$pattern")
  done

  if [ "$RESPECT_GITIGNORE" = true ]; then
    # Not a git repo - fall back to manual exclusion
    find . -type f \( "${find_args[@]}" \) 2>/dev/null | grep -Ev "/($EXCLUDE_DIRS)/"
  else
    # No filtering - find all files
    find . -type f \( "${find_args[@]}" \) 2>/dev/null
  fi
}
# Print how many files find_files reports for the given patterns
# (same arguments and same exclusion rules as find_files).
count_files() {
  wc -l < <(find_files "$@")
}
#==============================================================================
# STEP 0: Install Missing Tools
#==============================================================================
# Detect which analysis tools are absent and try to install them with the
# system package manager (pacman, apt-get, dnf or brew).
# Globals:  GREEN, YELLOW, RED, NC (read) - colour sequences for messages
# Returns:  0 when nothing is missing or installation was attempted
# Exits:    1 when no supported package manager is found; under `set -e` a
#           failing install of a required package also aborts the script
# Notes:    tokei/scc (and the cargo-installed `counts`) are treated as
#           optional: a failed cargo/snap install only prints a warning so
#           the remaining analysis steps can still run.
install_missing_tools() {
  local MISSING_TOOLS=()
  local MISSING_AUR=()
  local APT_PACKAGES=()
  local ALL_TOOLS=()
  local AUR_HELPER="" TEMP_DIR="" tool="" aur_tool=""

  # Check for required tools
  command -v git &>/dev/null || MISSING_TOOLS+=("git")
  command -v ctags &>/dev/null || MISSING_TOOLS+=("ctags")
  command -v cscope &>/dev/null || MISSING_TOOLS+=("cscope")
  command -v clang &>/dev/null || MISSING_TOOLS+=("clang")
  command -v ugrep &>/dev/null || MISSING_TOOLS+=("ugrep")

  # Check for AUR tools
  command -v tokei &>/dev/null || MISSING_AUR+=("tokei")
  command -v scc &>/dev/null || MISSING_AUR+=("scc")

  # Check for Rust 'counts' tool (install via cargo if missing)
  if ! command -v counts &>/dev/null && command -v cargo &>/dev/null; then
    echo "Installing 'counts' via cargo (fast word counter)..."
    cargo install counts 2>/dev/null || echo "Warning: counts install failed, will use Python fallback"
  fi

  # If nothing is missing, return
  if [ ${#MISSING_TOOLS[@]} -eq 0 ] && [ ${#MISSING_AUR[@]} -eq 0 ]; then
    echo -e "${GREEN}All required tools are installed.${NC}"
    return 0
  fi

  echo -e "${YELLOW}Missing tools detected. Installing...${NC}"

  # Detect package manager
  if command -v pacman &>/dev/null; then
    # Arch Linux
    if [ ${#MISSING_TOOLS[@]} -gt 0 ]; then
      echo "Installing from official repos: ${MISSING_TOOLS[*]}"
      sudo pacman -S --needed --noconfirm "${MISSING_TOOLS[@]}"
    fi
    if [ ${#MISSING_AUR[@]} -gt 0 ]; then
      # Find or install AUR helper
      if command -v yay &>/dev/null; then
        AUR_HELPER="yay"
      elif command -v paru &>/dev/null; then
        AUR_HELPER="paru"
      else
        echo "No AUR helper found. Installing yay..."
        sudo pacman -S --needed --noconfirm base-devel git
        TEMP_DIR=$(mktemp -d)
        # Build inside the if-condition so a failure still reaches the
        # cleanup below instead of tripping `set -e` with the dir left behind
        if git clone https://aur.archlinux.org/yay.git "$TEMP_DIR/yay" &&
          (cd "$TEMP_DIR/yay" && makepkg -si --noconfirm); then
          rm -rf -- "$TEMP_DIR"
        else
          rm -rf -- "$TEMP_DIR"
          echo -e "${RED}Failed to build the yay AUR helper.${NC}"
          return 1
        fi
        AUR_HELPER="yay"
      fi
      echo "Installing from AUR: ${MISSING_AUR[*]}"
      "$AUR_HELPER" -S --needed --noconfirm "${MISSING_AUR[@]}"
    fi
  elif command -v apt-get &>/dev/null; then
    # Debian/Ubuntu
    echo "Installing tools via apt..."
    sudo apt-get update
    # Map tool names to package names
    for tool in "${MISSING_TOOLS[@]}"; do
      case "$tool" in
        ctags) APT_PACKAGES+=("universal-ctags") ;;
        *) APT_PACKAGES+=("$tool") ;;
      esac
    done
    if [ ${#APT_PACKAGES[@]} -gt 0 ]; then
      sudo apt-get install -y "${APT_PACKAGES[@]}"
    fi
    # Install tokei/scc via cargo or snap (best effort)
    for aur_tool in "${MISSING_AUR[@]}"; do
      if command -v cargo &>/dev/null; then
        echo "Installing $aur_tool via cargo..."
        cargo install "$aur_tool" ||
          echo -e "${YELLOW}Warning: installing $aur_tool via cargo failed; continuing without it.${NC}"
      elif command -v snap &>/dev/null; then
        echo "Installing $aur_tool via snap..."
        sudo snap install "$aur_tool" ||
          echo -e "${YELLOW}Warning: installing $aur_tool via snap failed; continuing without it.${NC}"
      else
        echo -e "${YELLOW}Warning: Cannot install $aur_tool. Install cargo or snap first.${NC}"
      fi
    done
  elif command -v dnf &>/dev/null; then
    # Fedora
    echo "Installing tools via dnf..."
    sudo dnf install -y "${MISSING_TOOLS[@]}" "${MISSING_AUR[@]}" 2>/dev/null || {
      # tokei/scc might need cargo
      for aur_tool in "${MISSING_AUR[@]}"; do
        if command -v cargo &>/dev/null; then
          cargo install "$aur_tool" ||
            echo -e "${YELLOW}Warning: installing $aur_tool via cargo failed; continuing without it.${NC}"
        fi
      done
    }
  elif command -v brew &>/dev/null; then
    # macOS with Homebrew
    echo "Installing tools via brew..."
    ALL_TOOLS=("${MISSING_TOOLS[@]}" "${MISSING_AUR[@]}")
    brew install "${ALL_TOOLS[@]}"
  else
    echo -e "${RED}Unknown package manager. Please install these tools manually:${NC}"
    echo " Official: ${MISSING_TOOLS[*]}"
    echo " Additional: ${MISSING_AUR[*]}"
    exit 1
  fi

  echo -e "${GREEN}Tool installation complete.${NC}"
}
print_header "STEP 0: Checking/Installing Required Tools"
install_missing_tools

# The workspace and the results directory must exist before any step writes
mkdir -p "$WORK_DIR" "$RESULTS_DIR"

#==============================================================================
# STEP 1: Clone or Use Local Repository
#==============================================================================
print_header "STEP 1: Repository Setup"
if [ "$IS_LOCAL" != true ]; then
  # Remote URL - clone it, or refresh an earlier clone
  if [ ! -d "$REPO_DIR" ]; then
    echo "Cloning $REPO_URL (shallow clone for speed)..."
    git clone --depth 1 "$REPO_URL" "$REPO_DIR"
  else
    echo "Repository already exists at $REPO_DIR"
    echo "Updating..."
    cd "$REPO_DIR"
    git pull --depth 1 2>/dev/null || echo "Update skipped (shallow clone)"
  fi
else
  echo "Using local repository: $REPO_DIR"
  if [ ! -d "$REPO_DIR" ]; then
    echo "Error: Directory does not exist: $REPO_DIR"
    exit 1
  fi
fi

# Everything from here on runs with the repository as working directory
cd "$REPO_DIR"
echo "Repository: $REPO_NAME"
echo "Location: $REPO_DIR"
echo "Repository size: $(du -sh . | cut -f1)"

if [ "$RESPECT_GITIGNORE" != true ]; then
  echo "Files: $(find . -type f | wc -l)"
elif is_git_repo; then
  # Count files respecting .gitignore (tracked + untracked-but-not-ignored)
  FILE_COUNT=$({
    git ls-files 2>/dev/null
    git ls-files --others --exclude-standard 2>/dev/null
  } | sort -u | wc -l)
  echo "Files: $FILE_COUNT (respecting .gitignore)"
else
  echo "Files: $(find . -type f 2>/dev/null | grep -Ev "/($EXCLUDE_DIRS)/" | wc -l) (excluding common dirs)"
fi

#==============================================================================
# STEP 2: Basic Statistics with tokei
#==============================================================================
print_header "STEP 2: Code Statistics with tokei"
echo "Running tokei..."
tokei . | tee "$RESULTS_DIR/tokei_stats.txt"

#==============================================================================
# STEP 3: Code Statistics with scc
#==============================================================================
print_header "STEP 3: Code Statistics with scc (includes complexity)"
echo "Running scc..."
scc . | tee "$RESULTS_DIR/scc_stats.txt"

print_subheader "Top 10 Most Complex Files"
# head -20 keeps scc's table header lines plus the first file rows
scc --by-file --sort complexity . 2>/dev/null | head -20 | tee "$RESULTS_DIR/scc_complexity.txt"

#==============================================================================
# STEP 4: Fast Keyword Analysis (Code vs Comments) - Multi-Language
#==============================================================================
print_header "STEP 4: Fast Keyword Analysis (Code vs Comments)"
# Count identical lines on stdin and print the most frequent ones.
# Uses 'counts' (Rust) if available, falls back to Python Counter.
# Arguments: $1 - maximum number of entries to print (default 50)
# Outputs:   with the Python fallback, "COUNT LINE" per entry, most frequent
#            first; with `counts`, that tool's own line format
fast_count() {
  local top_n="${1:-50}"
  if command -v counts &>/dev/null; then
    # The first line of `counts` output is a summary header: always skip
    # exactly that line, even when there are fewer than top_n entries.
    counts 2>/dev/null | tail -n +2 | head -n "$top_n"
  else
    # The limit is passed as an argument rather than spliced into the code
    python3 -c '
import sys
from collections import Counter

limit = int(sys.argv[1])
tally = Counter(line.rstrip() for line in sys.stdin)
for word, count in tally.most_common(limit):
    print(f"{count} {word}")
' "$top_n"
  fi
}
#------------------------------------------------------------------------------
# Language Detection and Configuration
#------------------------------------------------------------------------------
print_subheader "Detecting languages in repository..."
if [ "$RESPECT_GITIGNORE" = true ]; then
  if ! is_git_repo; then
    echo -e "${YELLOW}Note: Excluding common directories (node_modules, .git, vendor, etc.)${NC}"
  else
    echo -e "${YELLOW}Note: Respecting .gitignore (excludes node_modules, build outputs, etc.)${NC}"
  fi
  echo " Use --no-ignore to include everything."
  echo ""
fi

# Per-language file counts, keyed by language name (via count_files, so the
# same exclusion rules apply as everywhere else)
declare -A LANG_FILES
LANG_FILES[c]=$(count_files "*.c")
LANG_FILES[cpp]=$(count_files "*.cpp" "*.cc" "*.cxx")
LANG_FILES[h]=$(count_files "*.h" "*.hpp" "*.hxx")
LANG_FILES[python]=$(count_files "*.py")
LANG_FILES[javascript]=$(count_files "*.js")
LANG_FILES[typescript]=$(count_files "*.ts" "*.tsx")
LANG_FILES[java]=$(count_files "*.java")
LANG_FILES[go]=$(count_files "*.go")
LANG_FILES[rust]=$(count_files "*.rs")
LANG_FILES[ruby]=$(count_files "*.rb")
LANG_FILES[shell]=$(count_files "*.sh" "*.bash")

echo "Files found by language:"
for lang in c cpp h python javascript typescript java go rust ruby shell; do
  count=${LANG_FILES[$lang]}
  if [ "$count" -gt 0 ]; then
    echo " $lang: $count files"
  fi
done

# One flag per language family; a flag is true when at least one matching
# file was counted above
HAS_C_FAMILY=false
if ((${LANG_FILES[c]} + ${LANG_FILES[cpp]} + ${LANG_FILES[h]} > 0)); then
  HAS_C_FAMILY=true
fi
HAS_PYTHON=false
if ((${LANG_FILES[python]} > 0)); then
  HAS_PYTHON=true
fi
HAS_JS_FAMILY=false
if ((${LANG_FILES[javascript]} + ${LANG_FILES[typescript]} > 0)); then
  HAS_JS_FAMILY=true
fi
HAS_SHELL=false
if ((${LANG_FILES[shell]} > 0)); then
  HAS_SHELL=true
fi
HAS_RUBY=false
if ((${LANG_FILES[ruby]} > 0)); then
  HAS_RUBY=true
fi
HAS_GO=false
if ((${LANG_FILES[go]} > 0)); then
  HAS_GO=true
fi
HAS_RUST=false
if ((${LANG_FILES[rust]} > 0)); then
  HAS_RUST=true
fi
HAS_JAVA=false
if ((${LANG_FILES[java]} > 0)); then
  HAS_JAVA=true
fi
#------------------------------------------------------------------------------
# Language-specific keyword definitions
#------------------------------------------------------------------------------
# Each value is a '|'-separated alternation that is later wrapped as
# \b(...)\b and handed to ugrep.
# C/C++ keywords
KEYWORDS_C="auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto|if|int|long|register|return|short|signed|sizeof|static|struct|switch|typedef|union|unsigned|void|volatile|while|inline|restrict|_Bool|_Complex|_Imaginary"
KEYWORDS_CPP="$KEYWORDS_C|alignas|alignof|and|and_eq|asm|atomic_cancel|atomic_commit|atomic_noexcept|bitand|bitor|bool|catch|char16_t|char32_t|char8_t|class|co_await|co_return|co_yield|compl|concept|const_cast|consteval|constexpr|constinit|decltype|delete|dynamic_cast|explicit|export|false|friend|mutable|namespace|new|noexcept|not|not_eq|nullptr|operator|or|or_eq|override|private|protected|public|reflexpr|reinterpret_cast|requires|static_assert|static_cast|synchronized|template|this|thread_local|throw|true|try|typeid|typename|using|virtual|wchar_t|xor|xor_eq"
# Python keywords
KEYWORDS_PYTHON="False|None|True|and|as|assert|async|await|break|class|continue|def|del|elif|else|except|finally|for|from|global|if|import|in|is|lambda|nonlocal|not|or|pass|raise|return|try|while|with|yield"
# JavaScript/TypeScript keywords
KEYWORDS_JS="abstract|arguments|await|boolean|break|byte|case|catch|char|class|const|continue|debugger|default|delete|do|double|else|enum|eval|export|extends|false|final|finally|float|for|function|goto|if|implements|import|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|super|switch|synchronized|this|throw|throws|transient|true|try|typeof|undefined|var|void|volatile|while|with|yield"
KEYWORDS_TS="$KEYWORDS_JS|any|as|asserts|bigint|declare|get|infer|intrinsic|is|keyof|module|namespace|never|out|override|readonly|require|set|string|symbol|type|unique|unknown"
# Go keywords
KEYWORDS_GO="break|case|chan|const|continue|default|defer|else|fallthrough|for|func|go|goto|if|import|interface|map|package|range|return|select|struct|switch|type|var"
# Rust keywords
KEYWORDS_RUST="as|async|await|break|const|continue|crate|dyn|else|enum|extern|false|fn|for|if|impl|in|let|loop|match|mod|move|mut|pub|ref|return|self|Self|static|struct|super|trait|true|type|unsafe|use|where|while"
# Ruby keywords
KEYWORDS_RUBY="BEGIN|END|alias|and|begin|break|case|class|def|defined|do|else|elsif|end|ensure|false|for|if|in|module|next|nil|not|or|redo|rescue|retry|return|self|super|then|true|undef|unless|until|when|while|yield"
# Java keywords (reserved words plus the true/false/null literals).
# The keyword-analysis step looks this variable up; without a definition the
# Java keyword report would be skipped silently.
KEYWORDS_JAVA="abstract|assert|boolean|break|byte|case|catch|char|class|const|continue|default|do|double|else|enum|extends|false|final|finally|float|for|goto|if|implements|import|instanceof|int|interface|long|native|new|null|package|private|protected|public|return|short|static|strictfp|super|switch|synchronized|this|throw|throws|transient|true|try|void|volatile|while"
# Shell reserved words plus the most common builtins (same reason as Java:
# the keyword-analysis step looks this variable up)
KEYWORDS_SHELL="if|then|else|elif|fi|case|esac|for|select|while|until|do|done|in|function|time|coproc|break|continue|return|exit|local|declare|readonly|export|unset|shift|source|eval|exec|set|trap|read|echo|printf|test|cd"
#------------------------------------------------------------------------------
# Multi-language comment processing - KEEP LANGUAGES SEPARATE
#------------------------------------------------------------------------------
print_subheader "Processing source files (separating code from comments)..."
# Create per-language output directory
mkdir -p "$RESULTS_DIR/per_language"

# COMMENTS_TEMP collects comment text from every language;
# LANG_CODE_FILES maps a language key to a temp file of comment-free code.
COMMENTS_TEMP=$(mktemp)
declare -A LANG_CODE_FILES

# Remove only the temp files created by this run. The command string is
# expanded when the trap fires, so the array is fully populated by then;
# CODE_TEMP is included only if it has been set by that point.
trap 'rm -f "$COMMENTS_TEMP" "${LANG_CODE_FILES[@]}" ${CODE_TEMP:+"$CODE_TEMP"} 2>/dev/null' EXIT

# gather_sources KEY TEMPLATE MAX_FILES PATTERN...
# Create a temp file from the mktemp TEMPLATE, record it in
# LANG_CODE_FILES[KEY] and fill it with the concatenated contents of at most
# MAX_FILES files matching the patterns.
gather_sources() {
  local key=$1 template=$2 max_files=$3
  shift 3
  LANG_CODE_FILES[$key]=$(mktemp "$template")
  # NUL-delimit the list so names containing spaces or quotes reach cat
  # intact. `|| true`: one unreadable or vanished file must not abort the
  # whole analysis under `set -e`.
  find_files "$@" | head -n "$max_files" | tr '\n' '\0' |
    xargs -0 cat 2>/dev/null >"${LANG_CODE_FILES[$key]}" || true
}

# split_c_style_comments FILE
# Append the text of /* ... */ and // comments found in FILE to
# COMMENTS_TEMP, then rewrite FILE in place with those comments removed.
split_c_style_comments() {
  local code_file=$1
  perl -0777 -ne 'while (/\/\*(.+?)\*\//gs) { print "$1\n"; } while (/\/\/([^\n]*)/g) { print "$1\n"; }' "$code_file" >>"$COMMENTS_TEMP"
  perl -0777 -pe 's|/\*.*?\*/||gs; s|//[^\n]*||g;' "$code_file" >"${code_file}.clean"
  mv "${code_file}.clean" "$code_file"
}

# Process C/C++ files
if [ "$HAS_C_FAMILY" = true ]; then
  echo "Processing C/C++ files..."
  # *.hxx is included because the language detection counts it as well
  gather_sources c_cpp /tmp/code_c_cpp.XXXXXX.tmp 15000 "*.c" "*.cpp" "*.cc" "*.cxx" "*.h" "*.hpp" "*.hxx"
  split_c_style_comments "${LANG_CODE_FILES[c_cpp]}"
fi

# Process JavaScript files (separate from TypeScript)
if [ "$HAS_JS_FAMILY" = true ]; then
  echo "Processing JavaScript files..."
  gather_sources javascript /tmp/code_js.XXXXXX.tmp 15000 "*.js" "*.jsx"
  echo "Processing TypeScript files..."
  gather_sources typescript /tmp/code_ts.XXXXXX.tmp 15000 "*.ts" "*.tsx"
  # Extract and strip comments from both
  for lang_file in "${LANG_CODE_FILES[javascript]}" "${LANG_CODE_FILES[typescript]}"; do
    [ -s "$lang_file" ] || continue
    split_c_style_comments "$lang_file"
  done
fi

# Process Python files
if [ "$HAS_PYTHON" = true ]; then
  echo "Processing Python files..."
  gather_sources python /tmp/code_python.XXXXXX.tmp 15000 "*.py"
  # '#' comments
  perl -ne 'if (/^\s*#(.*)/) { print "$1\n"; } elsif (/#(.*)$/) { print "$1\n"; }' "${LANG_CODE_FILES[python]}" >>"$COMMENTS_TEMP"
  # Triple-quoted strings (double- and single-quote flavours) count as comments
  perl -0777 -ne 'while (/"""(.+?)"""/gs) { print "$1\n"; } while (/'"'"''"'"''"'"'(.+?)'"'"''"'"''"'"'/gs) { print "$1\n"; }' "${LANG_CODE_FILES[python]}" >>"$COMMENTS_TEMP"
  perl -pe 's/#.*$//' "${LANG_CODE_FILES[python]}" | perl -0777 -pe 's/""".*?"""//gs; s/'"'"''"'"''"'"'.*?'"'"''"'"''"'"'//gs' >"${LANG_CODE_FILES[python]}.clean"
  mv "${LANG_CODE_FILES[python]}.clean" "${LANG_CODE_FILES[python]}"
fi

# Process Go files
if [ "$HAS_GO" = true ]; then
  echo "Processing Go files..."
  gather_sources go /tmp/code_go.XXXXXX.tmp 15000 "*.go"
  split_c_style_comments "${LANG_CODE_FILES[go]}"
fi

# Process Rust files
if [ "$HAS_RUST" = true ]; then
  echo "Processing Rust files..."
  gather_sources rust /tmp/code_rust.XXXXXX.tmp 15000 "*.rs"
  split_c_style_comments "${LANG_CODE_FILES[rust]}"
fi

# Process Ruby files
if [ "$HAS_RUBY" = true ]; then
  echo "Processing Ruby files..."
  gather_sources ruby /tmp/code_ruby.XXXXXX.tmp 5000 "*.rb"
  perl -ne 'if (/#(.*)$/) { print "$1\n"; }' "${LANG_CODE_FILES[ruby]}" >>"$COMMENTS_TEMP"
  # =begin ... =end block comments
  perl -0777 -ne 'while (/=begin(.+?)=end/gs) { print "$1\n"; }' "${LANG_CODE_FILES[ruby]}" >>"$COMMENTS_TEMP"
  perl -pe 's/#.*$//' "${LANG_CODE_FILES[ruby]}" | perl -0777 -pe 's/=begin.*?=end//gs' >"${LANG_CODE_FILES[ruby]}.clean"
  mv "${LANG_CODE_FILES[ruby]}.clean" "${LANG_CODE_FILES[ruby]}"
fi

# Process Shell files
if [ "$HAS_SHELL" = true ]; then
  echo "Processing Shell files..."
  gather_sources shell /tmp/code_shell.XXXXXX.tmp 5000 "*.sh" "*.bash"
  # Shebang lines are not comments: skip them before looking for '#'
  perl -ne 'next if /^#!/; if (/#(.*)$/) { print "$1\n"; }' "${LANG_CODE_FILES[shell]}" >>"$COMMENTS_TEMP"
  perl -pe 's/#.*$//' "${LANG_CODE_FILES[shell]}" >"${LANG_CODE_FILES[shell]}.clean"
  mv "${LANG_CODE_FILES[shell]}.clean" "${LANG_CODE_FILES[shell]}"
fi

# Process Java files
if [ "$HAS_JAVA" = true ]; then
  echo "Processing Java files..."
  gather_sources java /tmp/code_java.XXXXXX.tmp 15000 "*.java"
  split_c_style_comments "${LANG_CODE_FILES[java]}"
fi

COMMENT_LINES=$(wc -l <"$COMMENTS_TEMP")
echo ""
echo "Processed languages: ${!LANG_CODE_FILES[*]}"
echo "Total comment lines: $COMMENT_LINES"
#------------------------------------------------------------------------------
# Per-Language Keyword Analysis - Each language gets its own file
#------------------------------------------------------------------------------
print_subheader "Per-Language Keyword Analysis"

# Language key -> keyword alternation used for matching
declare -A LANG_KEYWORDS
LANG_KEYWORDS=(
  [c_cpp]="$KEYWORDS_CPP"
  [python]="$KEYWORDS_PYTHON"
  [javascript]="$KEYWORDS_JS"
  [typescript]="$KEYWORDS_TS"
  [go]="$KEYWORDS_GO"
  [rust]="$KEYWORDS_RUST"
  [ruby]="$KEYWORDS_RUBY"
  [shell]="$KEYWORDS_SHELL"
  [java]="$KEYWORDS_JAVA"
)

# Analyze each language separately; languages with no code file, an empty
# code file or no keyword list are skipped
for lang in "${!LANG_CODE_FILES[@]}"; do
  code_file="${LANG_CODE_FILES[$lang]}"
  keywords="${LANG_KEYWORDS[$lang]}"
  output_file="$RESULTS_DIR/per_language/keywords_${lang}.txt"
  [ -f "$code_file" ] || continue
  [ -s "$code_file" ] || continue
  [ -n "$keywords" ] || continue
  echo ""
  echo -e "${YELLOW}=== $lang Keywords ===${NC}"
  ugrep -o "\b($keywords)\b" "$code_file" 2>/dev/null | fast_count 50 | tee "$output_file"
done
#------------------------------------------------------------------------------
# Per-Language Function Analysis
#------------------------------------------------------------------------------
print_subheader "Per-Language Function Calls"
for lang in "${!LANG_CODE_FILES[@]}"; do
  code_file="${LANG_CODE_FILES[$lang]}"
  output_file="$RESULTS_DIR/per_language/functions_${lang}.txt"
  # Nothing to report for a missing or empty code file
  [ -f "$code_file" ] || continue
  [ -s "$code_file" ] || continue
  echo ""
  echo -e "${YELLOW}=== $lang Functions ===${NC}"
  # identifier followed by "(" -> strip the parenthesis -> drop control-flow
  # words that merely look like calls -> count
  ugrep -o '\b[a-zA-Z_][a-zA-Z0-9_]*\s*\(' "$code_file" 2>/dev/null |
    sed 's/\s*(//' |
    grep -vE '^(if|for|while|switch|catch|elif)$' |
    fast_count 30 |
    tee "$output_file"
done
#------------------------------------------------------------------------------
# Per-Language Import Analysis
#------------------------------------------------------------------------------
print_subheader "Per-Language Imports/Includes"

# show_import_stats KEY TITLE PATTERN [trim|keep] [blank|tight]
# Report the 30 most frequent matches of PATTERN in the code file recorded
# for language KEY and save them to per_language/imports_KEY.txt. Does
# nothing when that language has no (or an empty) code file.
#   trim  - strip leading whitespace from each match before counting
#   tight - do not print a blank line before the title
show_import_stats() {
  local key=$1 title=$2 pattern=$3 trim=${4:-keep} spacer=${5:-blank}
  local src="${LANG_CODE_FILES[$key]}"
  local dest="$RESULTS_DIR/per_language/imports_${key}.txt"
  if [ -z "$src" ] || [ ! -s "$src" ]; then
    return 0
  fi
  if [ "$spacer" = blank ]; then
    echo ""
  fi
  echo -e "${YELLOW}=== ${title} ===${NC}"
  if [ "$trim" = trim ]; then
    ugrep -o "$pattern" "$src" 2>/dev/null |
      sed 's/^\s*//' |
      fast_count 30 |
      tee "$dest"
  else
    ugrep -o "$pattern" "$src" 2>/dev/null |
      fast_count 30 |
      tee "$dest"
  fi
}

# C/C++ includes
show_import_stats c_cpp "C/C++ Includes" '#include\s*[<"][^>"]+[>"]' keep tight
# Python imports
show_import_stats python "Python Imports" '^\s*(from\s+\S+\s+import\s+\S+|import\s+\S+)' trim
# JavaScript imports
show_import_stats javascript "JavaScript Imports" "(import\s+.*\s+from\s+['\"][^'\"]+['\"]|require\s*\(['\"][^'\"]+['\"]\))"
# TypeScript imports
show_import_stats typescript "TypeScript Imports" "(import\s+.*\s+from\s+['\"][^'\"]+['\"]|require\s*\(['\"][^'\"]+['\"]\))"
# Go imports
show_import_stats go "Go Imports" '"[^"]+/[^"]+"'
# Rust use statements
show_import_stats rust "Rust Use Statements" '^\s*use\s+[^;]+' trim
# Java imports
show_import_stats java "Java Imports" '^\s*import\s+[^;]+' trim
# Ruby requires
show_import_stats ruby "Ruby Requires" "(require\s+['\"][^'\"]+['\"]|require_relative\s+['\"][^'\"]+['\"])"
# Shell sources
show_import_stats shell "Shell Sources" '(source\s+[^\s]+|\.\s+[^\s]+)'
#------------------------------------------------------------------------------
# Combined Analysis (for overview/backward compatibility)
#------------------------------------------------------------------------------
print_subheader "Combined Code Identifiers (all languages)"
# Create combined CODE_TEMP: every per-language code file concatenated
CODE_TEMP=$(mktemp)
for lang_file in "${LANG_CODE_FILES[@]}"; do
  if [ -f "$lang_file" ]; then
    cat "$lang_file" >>"$CODE_TEMP"
  fi
done
ugrep -o '\b[a-zA-Z_][a-zA-Z0-9_]*\b' "$CODE_TEMP" 2>/dev/null |
  fast_count "$TOP_N" |
  tee "$RESULTS_DIR/code_identifiers.txt"
# The combined file is only needed for the identifier count above; delete it
# now so it does not linger in the temp directory.
rm -f -- "$CODE_TEMP"

print_subheader "Most Used Words in COMMENTS"
ugrep -o '\b[a-zA-Z_][a-zA-Z0-9_]*\b' "$COMMENTS_TEMP" 2>/dev/null |
  fast_count "$TOP_N" |
  tee "$RESULTS_DIR/comment_words.txt"

# Create combined files from per-language analysis (for backward compatibility).
# Each file starts with two '#' header lines followed by up to 100 entries
# sorted by descending count.
{
  echo "# Combined keywords from all languages"
  echo "# Format: count keyword (from per_language/keywords_*.txt)"
  cat "$RESULTS_DIR/per_language"/keywords_*.txt 2>/dev/null | grep -v '^$' | sort -t' ' -k1 -nr | head -100
} >"$RESULTS_DIR/grep_keywords.txt"
{
  echo "# Combined functions from all languages"
  echo "# See per_language/functions_*.txt for language-specific breakdown"
  cat "$RESULTS_DIR/per_language"/functions_*.txt 2>/dev/null | grep -v '^$' | sort -t' ' -k1 -nr | head -100
} >"$RESULTS_DIR/grep_function_calls.txt"
{
  echo "# Combined imports from all languages"
  echo "# See per_language/imports_*.txt for language-specific breakdown"
  cat "$RESULTS_DIR/per_language"/imports_*.txt 2>/dev/null | grep -v '^$' | sort -t' ' -k1 -nr | head -100
} >"$RESULTS_DIR/grep_imports.txt"

# List what per-language files were created
echo ""
echo "Per-language analysis files created:"
# -printf is a GNU find extension; failure is tolerated on purpose
find "$RESULTS_DIR/per_language/" -maxdepth 1 -type f -printf ' %f\n' 2>/dev/null || true
print_subheader "Generating tags (this may take a while)..."
# Generate tags for different kinds (C/C++ only). A missing or failing ctags
# is tolerated; the reports below are skipped when no tags file was produced.
ctags -R --languages=C,C++ --c-kinds=+fp --fields=+lK -f "$RESULTS_DIR/tags" . 2>/dev/null || true
if [ -f "$RESULTS_DIR/tags" ]; then
  # Lines starting with '!' are ctags metadata, everything else is a symbol.
  # grep -c already prints 0 (with exit status 1) when nothing matches, so
  # only the exit status is neutralised here; the default covers a grep error.
  TOTAL_TAGS=$(grep -ac '^[^!]' "$RESULTS_DIR/tags" 2>/dev/null || true)
  TOTAL_TAGS=${TOTAL_TAGS:-0}
  echo "Total symbols found: $TOTAL_TAGS"

  print_subheader "Most Common Symbol Names"
  # Fast: use cut + counts instead of awk + sort | uniq
  # -a flag treats tags file as text (may contain binary-like patterns)
  grep -a '^[^!]' "$RESULTS_DIR/tags" | cut -f1 | fast_count "$TOP_N" |
    tee "$RESULTS_DIR/ctags_symbols.txt"

  print_subheader "Symbol Types Distribution"
  # Fast: extract single-letter kind code after ;" and count
  grep -aoP ';"\t\K[a-z]' "$RESULTS_DIR/tags" 2>/dev/null | fast_count 20 | while read -r count kind; do
    case "$kind" in
      f) echo "$count functions" ;;
      v) echo "$count variables" ;;
      s) echo "$count structs" ;;
      t) echo "$count typedefs" ;;
      e) echo "$count enum values" ;;
      g) echo "$count enums" ;;
      m) echo "$count struct/union members" ;;
      d) echo "$count macro definitions" ;;
      p) echo "$count function prototypes" ;;
      u) echo "$count unions" ;;
      c) echo "$count classes" ;;
      n) echo "$count namespaces" ;;
      *) echo "$count kind=$kind" ;;
    esac
  done | tee "$RESULTS_DIR/ctags_kinds.txt"
fi
#==============================================================================
# STEP 6: cscope Analysis
#==============================================================================
print_header "STEP 6: cscope Database Analysis"
print_subheader "Building cscope database..."
# Find all C source files (respecting .gitignore if available)
if [ "$RESPECT_GITIGNORE" = true ] && is_git_repo; then
  {
    git ls-files -- '*.c' '*.h' 2>/dev/null
    git ls-files --others --exclude-standard -- '*.c' '*.h' 2>/dev/null
  } | sort -u >"$RESULTS_DIR/cscope.files"
elif [ "$RESPECT_GITIGNORE" = true ]; then
  find . \( -name "*.c" -o -name "*.h" \) -type f 2>/dev/null | grep -Ev "/($EXCLUDE_DIRS)/" >"$RESULTS_DIR/cscope.files"
else
  find . \( -name "*.c" -o -name "*.h" \) -type f >"$RESULTS_DIR/cscope.files" 2>/dev/null
fi
FILE_COUNT=$(wc -l <"$RESULTS_DIR/cscope.files")
echo "Found $FILE_COUNT source files"

# Build cscope database (can take a while for large repos); a missing or
# failing cscope is tolerated and simply skips the examples below
echo "Building database (this may take several minutes for Linux kernel)..."
cscope -b -q -i "$RESULTS_DIR/cscope.files" -f "$RESULTS_DIR/cscope.out" 2>/dev/null || true
if [ -f "$RESULTS_DIR/cscope.out" ]; then
  echo "Database built successfully"
  echo "Database size: $(du -sh "$RESULTS_DIR/cscope.out" | cut -f1)"

  # The query output is captured first: the exit status of "cscope | head"
  # is head's, so an empty result has to be detected from the text itself.
  print_subheader "Example: Finding callers of 'printk' function"
  CSCOPE_HITS=$(cscope -d -f "$RESULTS_DIR/cscope.out" -L -3 printk 2>/dev/null | head -20)
  if [ -n "$CSCOPE_HITS" ]; then
    printf '%s\n' "$CSCOPE_HITS"
  else
    echo "No results"
  fi

  print_subheader "Example: Finding definition of 'struct file'"
  CSCOPE_HITS=$(cscope -d -f "$RESULTS_DIR/cscope.out" -L -1 "struct file" 2>/dev/null | head -10)
  if [ -n "$CSCOPE_HITS" ]; then
    printf '%s\n' "$CSCOPE_HITS"
  else
    echo "No results"
  fi
fi
#==============================================================================
# STEP 7: clang AST Analysis (if available)
#==============================================================================
print_header "STEP 7: clang-based Analysis (AST-level)"
print_subheader "Analyzing a sample file with clang AST dump"
# Find a simple C file to analyze (respecting .gitignore)
if [ "$RESPECT_GITIGNORE" = true ] && is_git_repo; then
  # First tracked *.c file (among the first 20) smaller than 50 KiB.
  # `wc -c` is used for the size because `stat -c%s` is GNU-specific.
  SAMPLE_FILE=$(git ls-files -- '*.c' 2>/dev/null | head -20 | while IFS= read -r f; do
    if [ -f "$f" ] && [ "$({ wc -c <"$f"; } 2>/dev/null || echo 999999)" -lt 51200 ]; then
      echo "$f"
    fi
  done | head -1)
elif [ "$RESPECT_GITIGNORE" = true ]; then
  SAMPLE_FILE=$(find . -name "*.c" -size -50k -type f 2>/dev/null | grep -Ev "/($EXCLUDE_DIRS)/" | head -1)
else
  SAMPLE_FILE=$(find . -name "*.c" -size -50k 2>/dev/null | head -1)
fi

if [ -n "$SAMPLE_FILE" ]; then
  echo "Sample file: $SAMPLE_FILE"
  echo ""
  echo "Function declarations in this file:"
  clang -Xclang -ast-dump -fsyntax-only "$SAMPLE_FILE" 2>/dev/null |
    grep -E "FunctionDecl.*<.*>" |
    head -20 |
    sed 's/.*FunctionDecl.*<[^>]*> / /' |
    tee "$RESULTS_DIR/clang_sample_functions.txt"
  # The pipeline's exit status is tee's, so failure is detected from the
  # (empty) result file instead.
  if [ ! -s "$RESULTS_DIR/clang_sample_functions.txt" ]; then
    echo "Analysis failed (missing headers)"
  fi
fi

print_subheader "Note: Full clang analysis requires compile_commands.json"
echo "For proper AST analysis of the Linux kernel, you need to:"
echo " 1. Configure the kernel: make defconfig"
echo " 2. Generate compile_commands.json: make compile_commands.json"
echo " 3. Use clang-query or clang-check with the database"
#==============================================================================
# STEP 8: Summary
#==============================================================================
print_header "ANALYSIS COMPLETE"
echo "Results saved to: $RESULTS_DIR/"
echo ""
ls -la "$RESULTS_DIR/"
echo ""
echo -e "${GREEN}Quick Summary:${NC}"
echo ""
# The grep_*.txt files begin with '#' header lines; those are filtered out
# so that the "Top 5" lists contain real entries only.
if [ -f "$RESULTS_DIR/grep_keywords.txt" ]; then
  echo "Top 5 Language Keywords (in code):"
  grep -v '^#' "$RESULTS_DIR/grep_keywords.txt" | head -5 | awk '{printf " %s: %s times\n", $2, $1}'
fi
echo ""
if [ -f "$RESULTS_DIR/grep_function_calls.txt" ]; then
  echo "Top 5 Function/Method Calls (in code):"
  grep -v '^#' "$RESULTS_DIR/grep_function_calls.txt" | head -5 | awk '{printf " %s(): %s times\n", $2, $1}'
fi
echo ""
if [ -f "$RESULTS_DIR/comment_words.txt" ]; then
  echo "Top 5 Words in Comments:"
  head -5 "$RESULTS_DIR/comment_words.txt" | awk '{printf " %s: %s times\n", $2, $1}'
fi
echo ""
if [ -f "$RESULTS_DIR/grep_imports.txt" ]; then
  echo "Top 5 Imports/Includes:"
  # An import entry may contain spaces: field 1 is the count, the rest of
  # the line is the import text
  grep -v '^#' "$RESULTS_DIR/grep_imports.txt" | head -5 | awk '{count=$1; $1=""; printf " %s: %s times\n", substr($0,2), count}'
fi
echo ""
echo -e "${BLUE}To explore interactively with cscope (C/C++ only):${NC}"
echo " cd $REPO_DIR && cscope -d -f $RESULTS_DIR/cscope.out"
echo ""
echo -e "${BLUE}To browse tags in vim:${NC}"
echo " cd $REPO_DIR && vim -t main"