testsAndMisc/linux_configuration/scripts/single_use/utils/install_plagiarism_tools.sh
Krzysztof kuhy Rudnicki 42a66a1419 refactor(linux_configuration/scripts): split all scripts into single_use/ and periodic_background/
- Move all linux_configuration scripts into two semantic categories:
  - single_use/: scripts run once manually (fresh install, fixes, setup)
  - periodic_background/: scripts run by systemd timers or daemons
- Preserve existing subdirectory structure within each category
- Fix lib/common.sh source paths for new directory depths
- Fix CONFIG_DIR depth in setup_periodic_system.sh and check_and_enable_services.sh
- Update all references in tests, fresh-install/main.sh, nix modules, and docs
- Fix check_polling_antipatterns.sh false positives (||, regex |, case patterns, jq strings)
- Fix pre-existing mypy exclusion path and type annotations for moved tools/ directory
- Rewrite check_polling_antipatterns.sh using awk (no bash regex loops); add require_serial: true
2026-05-15 00:32:35 +02:00

533 lines
17 KiB
Bash
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
#
# Installer for free and open-source plagiarism detection tools for text.
# Intended for academic work (theses, papers, etc.).
#
# Tools covered by this script:
#   1. Python NLP-based similarity detection (sklearn, NLTK, spaCy)
#   2. Sherlock text plagiarism detector
#   3. Ferret (Java-based, if Java available)
#   4. Optional: WCopyfind via Wine (Windows tool)

# Stop on command failure, on use of an unset variable, and on a failing
# stage anywhere in a pipeline.
set -euo pipefail

# Per-user locations for the generated tools and the Python virtualenv.
INSTALL_DIR="${HOME}/.local/share/plagiarism-tools"
VENV_DIR="${HOME}/.local/share/plagiarism-venv"

# Startup banner, followed by one blank line.
printf '%s\n' \
  "==============================================" \
  " Open Source Plagiarism Detection Installer" \
  " For Academic Text (Theses, Papers, etc.)" \
  "==============================================" \
  ""
# Colors for output.
# The values keep a literal backslash; printf's %b conversion below turns
# them into real ANSI escape sequences at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color (resets the terminal attributes)

# success MESSAGE
#   Print MESSAGE in green on stdout.
#   MESSAGE is emitted verbatim (%s), so backslashes in paths are not
#   reinterpreted as escape sequences.
success() { printf '%b%s%b\n' "$GREEN" "$1" "$NC"; }

# warn MESSAGE
#   Print MESSAGE in yellow on stdout.
warn() { printf '%b%s%b\n' "$YELLOW" "$1" "$NC"; }

# error MESSAGE
#   Print MESSAGE in red on stderr, so fatal diagnostics remain visible when
#   stdout is redirected or piped.
error() { printf '%b%s%b\n' "$RED" "$1" "$NC" >&2; }
# Create installation directory
mkdir -p "$INSTALL_DIR"

# ------------------------------------------------------------------------------
# 1. Python-based NLP Plagiarism Detection Environment
# ------------------------------------------------------------------------------
echo ""
echo "=== 1. Installing Python NLP-based Plagiarism Tools ==="

# Python 3 is a hard requirement for everything in this section.
if ! command -v python3 &> /dev/null; then
  error "Python 3 is required but not installed."
  exit 1
fi

# Create the virtual environment.
# Test for the activate script rather than just the directory: a directory
# left behind by an interrupted run would otherwise be treated as a valid
# environment and make the `source` below abort the script.
if [ ! -f "$VENV_DIR/bin/activate" ]; then
  echo "Creating Python virtual environment..."
  python3 -m venv "$VENV_DIR"
  success "Virtual environment created at $VENV_DIR"
else
  warn "Virtual environment already exists at $VENV_DIR"
fi

# Activate and install packages
# shellcheck disable=SC1091 # generated at install time, not lintable
source "$VENV_DIR/bin/activate"
echo "Installing Python packages for text similarity detection..."
pip install --upgrade pip
pip install --progress-bar on \
  scikit-learn \
  nltk \
  spacy \
  gensim \
  numpy \
  pandas \
  python-docx \
  PyPDF2 \
  beautifulsoup4 \
  lxml \
  textdistance \
  fuzzywuzzy \
  python-Levenshtein
success "Python NLP packages installed"

# Download NLTK data
echo "Downloading NLTK data (stopwords, punkt tokenizer)..."
python3 -c "
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
"
success "NLTK data downloaded"

# Download spaCy English model (small).
# The download is best-effort: a failure only produces a warning with the
# manual command. Success is reported only when the download really worked.
echo "Downloading spaCy English model..."
if python3 -m spacy download en_core_web_sm 2> /dev/null; then
  success "spaCy model installed"
else
  warn "spaCy model download may need manual install: python -m spacy download en_core_web_sm"
fi
# Create a simple plagiarism checker script.
# The heredoc delimiter is quoted, so the Python source below is written out
# verbatim with no shell expansion.
cat > "$INSTALL_DIR/check_plagiarism.py" << 'PYEOF'
#!/usr/bin/env python3
"""
Simple Text Plagiarism Checker
Compares documents using multiple similarity algorithms.
Usage:
    python check_plagiarism.py file1.txt file2.txt [file3.txt ...]
    python check_plagiarism.py --dir /path/to/documents/
"""
import argparse
import os
import sys
from pathlib import Path
from typing import List, Tuple

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Ensure NLTK data is available
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    # Also fetch 'punkt_tab', which the installer downloads alongside 'punkt'.
    nltk.download('punkt_tab', quiet=True)


def read_file(filepath: str) -> str:
    """Read text from various file formats.

    .pdf is read with PyPDF2 and .docx with python-docx; anything else is
    treated as UTF-8 plain text with undecodable bytes ignored. Returns an
    empty string when the optional reader library is not installed.
    """
    path = Path(filepath)
    suffix = path.suffix.lower()
    if suffix == '.pdf':
        try:
            from PyPDF2 import PdfReader
            reader = PdfReader(filepath)
            return ' '.join(page.extract_text() or '' for page in reader.pages)
        except ImportError:
            print("Warning: PyPDF2 not installed, cannot read PDF files")
            return ""
    elif suffix == '.docx':
        try:
            from docx import Document
            doc = Document(filepath)
            return ' '.join(para.text for para in doc.paragraphs)
        except ImportError:
            print("Warning: python-docx not installed, cannot read DOCX files")
            return ""
    else:
        # Assume plain text
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read()


def preprocess_text(text: str) -> str:
    """Clean and preprocess text for comparison.

    Lowercases the text, tokenizes it, and keeps only alphanumeric tokens
    that are not English stopwords. If tokenization fails for any reason,
    falls back to whitespace normalization of the lowercased text.
    """
    # Lowercase
    text = text.lower()
    # Tokenize and remove stopwords
    try:
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(text)
        words = [w for w in words if w.isalnum() and w not in stop_words]
        return ' '.join(words)
    except Exception:
        # Fallback: simple preprocessing
        return ' '.join(text.split())


def compute_similarity_matrix(documents: List[str]) -> np.ndarray:
    """Compute the pairwise TF-IDF cosine similarity matrix.

    Raises ValueError (from scikit-learn) if no vocabulary can be built.
    """
    # No max_df cut-off: a fractional max_df such as 0.95 drops every term
    # that occurs in ALL documents of a small corpus. With two documents
    # that removes exactly the shared vocabulary the comparison depends on.
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),  # Use unigrams, bigrams, trigrams
        min_df=1,
    )
    tfidf_matrix = vectorizer.fit_transform(documents)
    return cosine_similarity(tfidf_matrix)


def find_similar_passages(text1: str, text2: str, min_words: int = 5) -> List[Tuple[str, str, float]]:
    """Find similar sentence-level passages between two texts.

    Returns (sentence_from_text1, sentence_from_text2, similarity) tuples
    for sentence pairs whose TF-IDF cosine similarity exceeds 0.5, sorted
    from most to least similar. Sentences shorter than min_words words are
    ignored.
    """
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    if not sentences1 or not sentences2:
        return []
    # Filter short sentences
    sentences1 = [s for s in sentences1 if len(s.split()) >= min_words]
    sentences2 = [s for s in sentences2 if len(s.split()) >= min_words]
    if not sentences1 or not sentences2:
        return []
    all_sentences = sentences1 + sentences2
    preprocessed = [preprocess_text(s) for s in all_sentences]
    try:
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform(preprocessed)
    except ValueError:
        # Empty vocabulary (e.g. only stopwords) - nothing to compare.
        return []
    n1 = len(sentences1)
    similarities = []
    for i, s1 in enumerate(sentences1):
        for j, s2 in enumerate(sentences2):
            # Rows [0, n1) belong to text1, rows [n1, ...) to text2.
            sim = cosine_similarity(
                tfidf_matrix[i:i+1],
                tfidf_matrix[n1+j:n1+j+1]
            )[0][0]
            if sim > 0.5:  # Threshold for suspicious similarity
                similarities.append((s1, s2, sim))
    return sorted(similarities, key=lambda x: x[2], reverse=True)


def main():
    parser = argparse.ArgumentParser(
        description='Text Plagiarism Checker - Compare documents for similarity'
    )
    parser.add_argument('files', nargs='*', help='Files to compare')
    parser.add_argument('--dir', '-d', help='Directory containing documents to compare')
    parser.add_argument('--threshold', '-t', type=float, default=0.3,
                        help='Similarity threshold for flagging (0-1, default: 0.3)')
    parser.add_argument('--detailed', '-v', action='store_true',
                        help='Show detailed similar passages')
    args = parser.parse_args()

    # Collect files
    files = []
    if args.files:
        files.extend(args.files)
    if args.dir:
        dir_path = Path(args.dir)
        for ext in ['*.txt', '*.pdf', '*.docx', '*.md', '*.tex']:
            # Sorted so the report order is stable across runs.
            files.extend(str(f) for f in sorted(dir_path.glob(ext)))
    if len(files) < 2:
        print("Error: Need at least 2 files to compare")
        parser.print_help()
        sys.exit(1)

    print(f"\n{'='*60}")
    print(f" Plagiarism Check - Analyzing {len(files)} documents")
    print(f"{'='*60}\n")

    # Read and preprocess documents.
    # documents, filenames and paths are parallel lists: index k in each
    # refers to the same accepted input. Skipped inputs appear in none of
    # them, so these indices must not be used on `files`.
    documents = []
    filenames = []
    paths = []
    for f in files:
        if os.path.exists(f):
            text = read_file(f)
            if text.strip():
                documents.append(preprocess_text(text))
                filenames.append(os.path.basename(f))
                paths.append(f)
            else:
                print(f"Warning: {f} is empty or unreadable")
        else:
            print(f"Warning: {f} does not exist")
    if len(documents) < 2:
        print("Error: Not enough valid documents to compare")
        sys.exit(1)

    # Compute similarity
    print("Computing document similarities...\n")
    try:
        sim_matrix = compute_similarity_matrix(documents)
    except ValueError as exc:
        # Raised when no usable terms remain after preprocessing.
        print(f"Error: could not build a vocabulary from the documents ({exc})")
        sys.exit(1)

    # Report results
    print(f"{'Document Pair':<50} {'Similarity':>12}")
    print("-" * 62)
    suspicious_pairs = []
    for i in range(len(documents)):
        for j in range(i + 1, len(documents)):
            similarity = sim_matrix[i][j]
            pair_name = f"{filenames[i]} <-> {filenames[j]}"
            if similarity >= args.threshold:
                suspicious_pairs.append((i, j, similarity, pair_name))
                print(f"{pair_name:<50} {similarity:>10.1%} ⚠️")
            else:
                print(f"{pair_name:<50} {similarity:>10.1%}")
    print("-" * 62)

    # Summary
    if suspicious_pairs:
        print(f"\n⚠ {len(suspicious_pairs)} pair(s) exceed {args.threshold:.0%} similarity threshold\n")
        if args.detailed:
            print("\n" + "="*60)
            print(" Detailed Similar Passages")
            print("="*60)
            # Most similar pairs first, so the limit below is a real top 3.
            suspicious_pairs.sort(key=lambda pair: pair[2], reverse=True)
            for i, j, sim, pair_name in suspicious_pairs[:3]:  # Limit to top 3
                print(f"\n{pair_name} ({sim:.1%} similar):")
                print("-" * 40)
                # Re-read the raw text: sentence detection needs the
                # original punctuation that preprocessing removed.
                passages = find_similar_passages(read_file(paths[i]), read_file(paths[j]))
                for s1, s2, psim in passages[:5]:  # Top 5 passages
                    print(f"\n[{psim:.0%}] Document 1: \"{s1[:100]}...\"")
                    print(f" Document 2: \"{s2[:100]}...\"")
    else:
        print(f"\n✓ No document pairs exceed {args.threshold:.0%} similarity threshold")
    print("\n" + "="*60)
    print(" Analysis complete")
    print("="*60 + "\n")


if __name__ == '__main__':
    main()
PYEOF
chmod +x "$INSTALL_DIR/check_plagiarism.py"
success "Created plagiarism checker script at $INSTALL_DIR/check_plagiarism.py"
# Install the `plagcheck` launcher into the user's private bin directory.
# The venv and installer paths are baked into the launcher at install time;
# only "$@" is left for the launcher itself to expand when it runs.
mkdir -p "$HOME/.local/bin"
{
  printf '%s\n' '#!/usr/bin/env bash'
  printf '%s\n' '# Wrapper for plagiarism checker'
  printf 'source "%s/bin/activate"\n' "$VENV_DIR"
  printf 'python "%s/check_plagiarism.py" "$@"\n' "$INSTALL_DIR"
} > "$HOME/.local/bin/plagcheck"
chmod +x "$HOME/.local/bin/plagcheck"
success "Created 'plagcheck' command in ~/.local/bin/"

# Leave the virtual environment that was activated earlier in this script.
deactivate
# ------------------------------------------------------------------------------
# 2. Sherlock for Text (Clone from GitHub)
# ------------------------------------------------------------------------------
echo ""
echo "=== 2. Installing Sherlock Text Plagiarism Detector ==="
SHERLOCK_DIR="$INSTALL_DIR/sherlock"
if [ -d "$SHERLOCK_DIR" ]; then
  warn "Sherlock already installed at $SHERLOCK_DIR"
else
  # Prefer the upstream project when git can fetch it. In every other case
  # (git missing, clone failed) fall back to the bundled pure-Python
  # implementation, which needs neither git nor network access.
  sherlock_cloned=false
  if command -v git &> /dev/null; then
    # There are several Sherlock implementations; using a popular Python one.
    # GIT_TERMINAL_PROMPT=0 makes git fail instead of waiting for credentials
    # (stderr is hidden here, so a prompt would look like a hang).
    if GIT_TERMINAL_PROMPT=0 git clone --depth 1 \
      https://github.com/Zedeldi/sherlock-py.git "$SHERLOCK_DIR" 2> /dev/null; then
      sherlock_cloned=true
    else
      warn "Could not clone sherlock-py, trying alternative..."
    fi
  else
    warn "Git not available, installing bundled Sherlock fallback instead"
  fi

  if [ "$sherlock_cloned" = false ]; then
    # Alternative: Create a simple n-gram based sherlock
    mkdir -p "$SHERLOCK_DIR"
    cat > "$SHERLOCK_DIR/sherlock.py" << 'SHERLOCKEOF'
#!/usr/bin/env python3
"""
Sherlock - Simple text plagiarism detector using n-gram fingerprinting.
Based on the original Sherlock algorithm.
"""
import argparse
import hashlib
import os
import re
import sys


def tokenize(text: str) -> list:
    """Split text into lowercase word tokens.

    Words are extracted with a regex, so a word followed by punctuation
    ("end.", "however,") is kept as a token and not discarded.
    """
    return re.findall(r'\w+', text.lower())


def get_ngrams(tokens: list, n: int = 3) -> list:
    """Generate n-grams (as tuples) from a token list."""
    return [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]


def fingerprint(text: str, ngram_size: int = 3, sample_rate: int = 4) -> set:
    """Create a document fingerprint from sampled n-gram hashes."""
    tokens = tokenize(text)
    ngrams = get_ngrams(tokens, ngram_size)
    fingerprints = set()
    for i, ng in enumerate(ngrams):
        if i % sample_rate == 0:  # Sample every nth n-gram
            # Join with a separator so ("ab", "c") and ("a", "bc") differ.
            h = hashlib.md5(' '.join(ng).encode()).hexdigest()[:8]
            fingerprints.add(h)
    return fingerprints


def compare_documents(fp1: set, fp2: set) -> float:
    """Jaccard similarity between fingerprints (0.0 if either is empty)."""
    if not fp1 or not fp2:
        return 0.0
    intersection = len(fp1 & fp2)
    union = len(fp1 | fp2)
    return intersection / union if union > 0 else 0.0


def read_document(filepath: str) -> str:
    """Read document content as UTF-8, ignoring undecodable bytes."""
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()


def main():
    parser = argparse.ArgumentParser(description='Sherlock - Text Plagiarism Detector')
    parser.add_argument('files', nargs='+', help='Files to compare')
    parser.add_argument('--ngram', '-n', type=int, default=3, help='N-gram size (default: 3)')
    parser.add_argument('--threshold', '-t', type=float, default=0.1, help='Similarity threshold')
    args = parser.parse_args()
    if len(args.files) < 2:
        print("Need at least 2 files to compare")
        sys.exit(1)

    # Read and fingerprint documents
    docs = {}
    for f in args.files:
        if os.path.exists(f):
            text = read_document(f)
            docs[f] = fingerprint(text, args.ngram)
        else:
            print(f"Warning: {f} does not exist", file=sys.stderr)

    print("\nSherlock Plagiarism Analysis")
    print("=" * 50)

    # Compare all pairs
    files = list(docs.keys())
    for i in range(len(files)):
        for j in range(i + 1, len(files)):
            sim = compare_documents(docs[files[i]], docs[files[j]])
            name1 = os.path.basename(files[i])
            name2 = os.path.basename(files[j])
            flag = " ⚠️ SUSPICIOUS" if sim >= args.threshold else ""
            print(f"{name1} <-> {name2}: {sim:.1%}{flag}")
    print("=" * 50)


if __name__ == '__main__':
    main()
SHERLOCKEOF
    chmod +x "$SHERLOCK_DIR/sherlock.py"
  fi
  success "Sherlock installed at $SHERLOCK_DIR"
fi
# ------------------------------------------------------------------------------
# 3. Ferret (Java-based) - Optional
# ------------------------------------------------------------------------------
# Nothing is downloaded here. When Java is present, a placeholder directory
# is created on the first run and manual download instructions are printed.
printf '\n%s\n' "=== 3. Checking for Ferret (Java-based plagiarism tool) ==="
if ! command -v java > /dev/null 2>&1; then
  warn "Java not installed, skipping Ferret"
else
  FERRET_DIR="$INSTALL_DIR/ferret"
  if [[ ! -d "$FERRET_DIR" ]]; then
    mkdir -p "$FERRET_DIR"
    printf '%s\n' \
      "Ferret is a Java-based tool from University of Hertfordshire." \
      "Download manually from: https://homepages.herts.ac.uk/~comqcln/Ferret/" \
      "Place JAR file in: $FERRET_DIR"
    warn "Ferret requires manual download (academic license)"
  fi
fi
# ------------------------------------------------------------------------------
# 4. WCopyfind via Wine (Optional)
# ------------------------------------------------------------------------------
# Informational only: prints where to get WCopyfind, with wording that
# depends on whether a `wine` command is on PATH.
printf '\n%s\n' "=== 4. WCopyfind Information (Windows tool, needs Wine) ==="
if ! command -v wine > /dev/null 2>&1; then
  printf '%s\n' \
    "Wine not installed. To use WCopyfind:" \
    " 1. Install wine: sudo apt install wine (or equivalent)" \
    " 2. Download WCopyfind from: https://plagiarism.bloomfieldmedia.com/software/wcopyfind/"
  warn "WCopyfind skipped (Wine not available)"
else
  printf '%s\n' \
    "Wine is available. WCopyfind can be run via Wine." \
    "Download from: https://plagiarism.bloomfieldmedia.com/software/wcopyfind/" \
    "Run with: wine /path/to/WCopyfind.exe"
  warn "WCopyfind requires manual download"
fi
# ------------------------------------------------------------------------------
# Summary
# ------------------------------------------------------------------------------
# One heredoc prints the whole report. The install paths are expanded here;
# the dollar signs in the `export PATH` line are escaped so that line is
# shown literally for the user to copy.
cat << EOF

==============================================
 Installation Complete!
==============================================

Installed tools:

1. Python NLP Plagiarism Checker (TF-IDF, cosine similarity)
 Usage: plagcheck file1.txt file2.txt
 plagcheck --dir /path/to/documents/ --detailed
 Location: $INSTALL_DIR/check_plagiarism.py

2. Sherlock (n-gram fingerprinting)
 Location: $SHERLOCK_DIR/sherlock.py

3. Python virtual environment with NLP libraries:
 - scikit-learn (TF-IDF, cosine similarity)
 - NLTK (tokenization, stopwords)
 - spaCy (NLP processing)
 - gensim (document similarity)
 - textdistance, fuzzywuzzy (string matching)
 Activate with: source $VENV_DIR/bin/activate

Quick Start:
 plagcheck thesis_v1.pdf thesis_v2.pdf --detailed
 plagcheck --dir ./student_papers/ --threshold 0.4

Note: Ensure ~/.local/bin is in your PATH:
 export PATH="\$HOME/.local/bin:\$PATH"

==============================================
EOF

# Add to PATH reminder: shown only when ~/.local/bin is not already one of
# the colon-separated PATH entries.
case ":$PATH:" in
  *":$HOME/.local/bin:"*) ;;
  *)
    warn "Add ~/.local/bin to your PATH by adding this to ~/.bashrc or ~/.zshrc:"
    # shellcheck disable=SC2016 # printed literally on purpose
    echo ' export PATH="$HOME/.local/bin:$PATH"'
    ;;
esac