testsAndMisc/scripts/utils/install_plagiarism_tools.sh

533 lines
17 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
#
# Installer for free and open-source text plagiarism detection tools,
# aimed at academic writing (theses, papers, and similar documents).
#
# What this script sets up:
#   1. A Python environment for NLP-based similarity (sklearn, NLTK, spaCy)
#   2. The Sherlock text plagiarism detector
#   3. Ferret (Java-based; only considered when Java is available)
#   4. Optionally, notes on running WCopyfind (a Windows tool) through Wine

# Strict mode: abort on errors, on unset variables, and on failed pipelines.
set -o errexit -o nounset -o pipefail

# Installation targets, both under the user's home directory.
INSTALL_DIR="$HOME/.local/share/plagiarism-tools"
VENV_DIR="$HOME/.local/share/plagiarism-venv"

# Opening banner, one argument per output line (the last one is a blank line).
printf '%s\n' \
  "==============================================" \
  " Open Source Plagiarism Detection Installer" \
  " For Academic Text (Theses, Papers, etc.)" \
  "==============================================" \
  ""
# Colors for output. These hold backslash escape sequences as literal text;
# they are turned into real ANSI codes by printf's %b at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Status helpers. Each takes the message as $1 and prints it on one line,
# wrapped in a color and followed by a color reset.
#
# Only the color variables go through %b; the message itself is printed with
# %s so that backslashes in it (e.g. in a path) are shown verbatim instead of
# being interpreted as escape sequences.

# success MESSAGE - green message on stdout.
success() { printf '%b%s%b\n' "$GREEN" "${1-}" "$NC"; }

# warn MESSAGE - yellow message on stdout.
warn() { printf '%b%s%b\n' "$YELLOW" "${1-}" "$NC"; }

# error MESSAGE - red message on stderr, so it is not lost or mixed into
# captured/redirected normal output.
error() { printf '%b%s%b\n' "$RED" "${1-}" "$NC" >&2; }
# Create installation directory
mkdir -p "$INSTALL_DIR"

# ------------------------------------------------------------------------------
# 1. Python-based NLP Plagiarism Detection Environment
# ------------------------------------------------------------------------------
echo ""
echo "=== 1. Installing Python NLP-based Plagiarism Tools ==="

# Check for Python 3
if ! command -v python3 &>/dev/null; then
  error "Python 3 is required but not installed."
  exit 1
fi

# Create virtual environment.
# The check looks for the activate script rather than just the directory: a
# previous run that failed part-way may leave the directory behind without a
# usable environment, and the `source` below would then abort the script.
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
  echo "Creating Python virtual environment..."
  python3 -m venv "$VENV_DIR"
  success "Virtual environment created at $VENV_DIR"
else
  warn "Virtual environment already exists at $VENV_DIR"
fi

# Activate and install packages
# shellcheck source=/dev/null
source "$VENV_DIR/bin/activate"

echo "Installing Python packages for text similarity detection..."
pip install --upgrade pip
pip install --progress-bar on \
  scikit-learn \
  nltk \
  spacy \
  gensim \
  numpy \
  pandas \
  python-docx \
  PyPDF2 \
  beautifulsoup4 \
  lxml \
  textdistance \
  fuzzywuzzy \
  python-Levenshtein
success "Python NLP packages installed"

# Download NLTK data.
# nltk.download() reports failure through its return value instead of raising,
# so the results are collected and turned into an exit status; success is only
# announced when every package was fetched.
echo "Downloading NLTK data (stopwords, punkt tokenizer)..."
if python3 -c "
import sys
import nltk

packages = ['punkt', 'stopwords', 'punkt_tab', 'averaged_perceptron_tagger', 'wordnet']
# A list, not a generator, so every package is attempted even after a failure.
results = [nltk.download(name) for name in packages]
sys.exit(0 if all(results) else 1)
"; then
  success "NLTK data downloaded"
else
  warn "Some NLTK data could not be downloaded; re-run this installer or fetch it manually with nltk.download()"
fi

# Download spaCy English model (small).
# Best effort: a failed download only produces a warning, and the success
# message is printed only when the download actually worked. stderr is
# silenced on purpose; the warning tells the user how to retry by hand.
echo "Downloading spaCy English model..."
if python3 -m spacy download en_core_web_sm 2>/dev/null; then
  success "spaCy model installed"
else
  warn "spaCy model download may need manual install: python -m spacy download en_core_web_sm"
fi
# Create a simple plagiarism checker script.
# The heredoc delimiter is quoted, so the Python source below is written to
# disk verbatim (no shell expansion takes place inside it).
cat >"$INSTALL_DIR/check_plagiarism.py" <<'PYEOF'
#!/usr/bin/env python3
"""
Simple Text Plagiarism Checker
Compares documents using multiple similarity algorithms.

Usage:
    python check_plagiarism.py file1.txt file2.txt [file3.txt ...]
    python check_plagiarism.py --dir /path/to/documents/
"""

import argparse
import os
import sys
from pathlib import Path
from typing import List, Tuple

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Ensure NLTK data is available
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)


def read_file(filepath: str) -> str:
    """Read text from a .pdf, .docx, or plain-text file.

    Returns an empty string (after printing a warning) when the optional
    reader library for the format is not installed.
    """
    path = Path(filepath)
    suffix = path.suffix.lower()

    if suffix == '.pdf':
        try:
            from PyPDF2 import PdfReader
            reader = PdfReader(filepath)
            return ' '.join(page.extract_text() or '' for page in reader.pages)
        except ImportError:
            print("Warning: PyPDF2 not installed, cannot read PDF files")
            return ""
    elif suffix == '.docx':
        try:
            from docx import Document
            doc = Document(filepath)
            return ' '.join(para.text for para in doc.paragraphs)
        except ImportError:
            print("Warning: python-docx not installed, cannot read DOCX files")
            return ""
    else:
        # Assume plain text
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read()


def preprocess_text(text: str) -> str:
    """Lowercase the text and drop stopwords and non-alphanumeric tokens.

    Falls back to plain whitespace normalisation if tokenisation fails
    (for example because NLTK data is missing).
    """
    text = text.lower()

    try:
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(text)
        words = [w for w in words if w.isalnum() and w not in stop_words]
        return ' '.join(words)
    except Exception:
        # Fallback: simple preprocessing
        return ' '.join(text.split())


def compute_similarity_matrix(documents: List[str]) -> np.ndarray:
    """Compute the pairwise TF-IDF cosine similarity matrix of the documents."""
    # No max_df cut-off here: a fractional max_df discards every term whose
    # document frequency exceeds it, and with only a handful of documents that
    # removes exactly the terms the documents share (with two documents, any
    # shared term has frequency 1.0), which zeroes the similarity.
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),  # Use unigrams, bigrams, trigrams
        min_df=1,
    )
    tfidf_matrix = vectorizer.fit_transform(documents)
    return cosine_similarity(tfidf_matrix)


def find_similar_passages(text1: str, text2: str, min_words: int = 5) -> List[Tuple[str, str, float]]:
    """Find similar sentence-level passages between two texts.

    Returns (sentence_from_text1, sentence_from_text2, similarity) tuples for
    every sentence pair scoring above 0.5, most similar first. Sentences with
    fewer than min_words words are ignored.
    """
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)

    if not sentences1 or not sentences2:
        return []

    # Filter short sentences
    sentences1 = [s for s in sentences1 if len(s.split()) >= min_words]
    sentences2 = [s for s in sentences2 if len(s.split()) >= min_words]

    if not sentences1 or not sentences2:
        return []

    all_sentences = sentences1 + sentences2
    preprocessed = [preprocess_text(s) for s in all_sentences]

    try:
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform(preprocessed)
    except ValueError:
        return []

    # One call yields the full text1-by-text2 sentence similarity grid.
    n1 = len(sentences1)
    grid = cosine_similarity(tfidf_matrix[:n1], tfidf_matrix[n1:])

    similarities = []
    for i, s1 in enumerate(sentences1):
        for j, s2 in enumerate(sentences2):
            sim = grid[i][j]
            if sim > 0.5:  # Threshold for suspicious similarity
                similarities.append((s1, s2, sim))

    return sorted(similarities, key=lambda x: x[2], reverse=True)


def main():
    parser = argparse.ArgumentParser(
        description='Text Plagiarism Checker - Compare documents for similarity'
    )
    parser.add_argument('files', nargs='*', help='Files to compare')
    parser.add_argument('--dir', '-d', help='Directory containing documents to compare')
    parser.add_argument('--threshold', '-t', type=float, default=0.3,
                        help='Similarity threshold for flagging (0-1, default: 0.3)')
    parser.add_argument('--detailed', '-v', action='store_true',
                        help='Show detailed similar passages')

    args = parser.parse_args()

    # Collect files
    files = []
    if args.files:
        files.extend(args.files)

    if args.dir:
        dir_path = Path(args.dir)
        for ext in ['*.txt', '*.pdf', '*.docx', '*.md', '*.tex']:
            # Sorted so the report order does not depend on directory order.
            files.extend(str(f) for f in sorted(dir_path.glob(ext)))

    if len(files) < 2:
        print("Error: Need at least 2 files to compare")
        parser.print_help()
        sys.exit(1)

    print(f"\n{'='*60}")
    print(f" Plagiarism Check - Analyzing {len(files)} documents")
    print(f"{'='*60}\n")

    # Read and preprocess documents. raw_texts, documents and filenames are
    # parallel lists holding only the inputs that were actually usable, so an
    # index into one is valid for the others even when some inputs are skipped.
    raw_texts = []
    documents = []
    filenames = []
    for f in files:
        if os.path.exists(f):
            text = read_file(f)
            if text.strip():
                raw_texts.append(text)
                documents.append(preprocess_text(text))
                filenames.append(os.path.basename(f))
            else:
                print(f"Warning: {f} is empty or unreadable")
        else:
            print(f"Warning: {f} does not exist")

    if len(documents) < 2:
        print("Error: Not enough valid documents to compare")
        sys.exit(1)

    # Compute similarity
    print("Computing document similarities...\n")
    sim_matrix = compute_similarity_matrix(documents)

    # Report results
    print(f"{'Document Pair':<50} {'Similarity':>12}")
    print("-" * 62)

    suspicious_pairs = []
    for i in range(len(documents)):
        for j in range(i + 1, len(documents)):
            similarity = sim_matrix[i][j]
            pair_name = f"{filenames[i]} <-> {filenames[j]}"

            if similarity >= args.threshold:
                suspicious_pairs.append((i, j, similarity, pair_name))
                print(f"{pair_name:<50} {similarity:>10.1%} ⚠️")
            else:
                print(f"{pair_name:<50} {similarity:>10.1%}")

    print("-" * 62)

    # Summary
    if suspicious_pairs:
        print(f"\n⚠ {len(suspicious_pairs)} pair(s) exceed {args.threshold:.0%} similarity threshold\n")

        if args.detailed:
            print("\n" + "="*60)
            print(" Detailed Similar Passages")
            print("="*60)

            # Limit to the 3 most similar pairs.
            top_pairs = sorted(suspicious_pairs, key=lambda p: p[2], reverse=True)[:3]
            for i, j, sim, pair_name in top_pairs:
                print(f"\n{pair_name} ({sim:.1%} similar):")
                print("-" * 40)

                # i and j index the filtered lists, so use the cached raw text
                # rather than re-reading by position in the unfiltered inputs.
                passages = find_similar_passages(raw_texts[i], raw_texts[j])

                for s1, s2, psim in passages[:5]:  # Top 5 passages
                    print(f"\n[{psim:.0%}] Document 1: \"{s1[:100]}...\"")
                    print(f" Document 2: \"{s2[:100]}...\"")
    else:
        print(f"\n✓ No document pairs exceed {args.threshold:.0%} similarity threshold")

    print("\n" + "="*60)
    print(" Analysis complete")
    print("="*60 + "\n")


if __name__ == '__main__':
    main()
PYEOF
chmod +x "$INSTALL_DIR/check_plagiarism.py"
success "Created plagiarism checker script at $INSTALL_DIR/check_plagiarism.py"
# Generate the `plagcheck` launcher in the user's private bin directory.
# The venv and script locations are baked in at install time; only "$@" is
# left for the launcher itself to expand when it runs.
mkdir -p "${HOME}/.local/bin"
{
  printf '%s\n' '#!/usr/bin/env bash'
  printf '%s\n' '# Wrapper for plagiarism checker'
  printf 'source "%s/bin/activate"\n' "$VENV_DIR"
  printf 'python "%s/check_plagiarism.py" "$@"\n' "$INSTALL_DIR"
} >"${HOME}/.local/bin/plagcheck"
chmod +x "${HOME}/.local/bin/plagcheck"
success "Created 'plagcheck' command in ~/.local/bin/"

# Leave the virtual environment that was activated for the Python setup.
deactivate
# ------------------------------------------------------------------------------
# 2. Sherlock for Text (Clone from GitHub, with a bundled fallback)
# ------------------------------------------------------------------------------
echo ""
echo "=== 2. Installing Sherlock Text Plagiarism Detector ==="
SHERLOCK_DIR="$INSTALL_DIR/sherlock"

# write_sherlock_fallback - create $SHERLOCK_DIR/sherlock.py, a small
# self-contained n-gram fingerprinting checker that needs only Python.
# Globals: SHERLOCK_DIR (read).
write_sherlock_fallback() {
  mkdir -p "$SHERLOCK_DIR"
  # Quoted delimiter: the Python source is written verbatim.
  cat >"$SHERLOCK_DIR/sherlock.py" <<'SHERLOCKEOF'
#!/usr/bin/env python3
"""
Sherlock - Simple text plagiarism detector using n-gram fingerprinting.
Based on the original Sherlock algorithm.
"""

import argparse
import hashlib
import os
import re
import sys


def tokenize(text: str) -> list:
    """Split text into lowercase alphanumeric word tokens.

    Punctuation acts as a separator, so a word followed by a comma or full
    stop is kept instead of being discarded.
    """
    return re.findall(r"[^\W_]+", text.lower())


def get_ngrams(tokens: list, n: int = 3) -> list:
    """Generate n-grams from token list."""
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


def fingerprint(text: str, ngram_size: int = 3, sample_rate: int = 4) -> set:
    """Create a document fingerprint from sampled n-gram hashes.

    Sampling is decided by the hash value, not by the n-gram's position, so
    the same n-gram is selected (or not) in every document regardless of
    where it occurs. Roughly one in sample_rate hashes is kept.
    """
    tokens = tokenize(text)
    # Join with a separator so ("ab", "c") and ("a", "bc") hash differently.
    hashes = {
        hashlib.md5(' '.join(ng).encode()).hexdigest()[:8]
        for ng in get_ngrams(tokens, ngram_size)
    }
    if sample_rate <= 1:
        return hashes
    sampled = {h for h in hashes if int(h, 16) % sample_rate == 0}
    # Very short texts may have no hash that passes the filter; keep them all.
    return sampled or hashes


def compare_documents(fp1: set, fp2: set) -> float:
    """Jaccard similarity between fingerprints."""
    if not fp1 or not fp2:
        return 0.0
    intersection = len(fp1 & fp2)
    union = len(fp1 | fp2)
    return intersection / union if union > 0 else 0.0


def read_document(filepath: str) -> str:
    """Read document content."""
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()


def main():
    parser = argparse.ArgumentParser(description='Sherlock - Text Plagiarism Detector')
    parser.add_argument('files', nargs='+', help='Files to compare')
    parser.add_argument('--ngram', '-n', type=int, default=3, help='N-gram size (default: 3)')
    parser.add_argument('--threshold', '-t', type=float, default=0.1, help='Similarity threshold')

    args = parser.parse_args()

    if args.ngram < 1:
        parser.error("--ngram must be at least 1")

    if len(args.files) < 2:
        print("Need at least 2 files to compare")
        sys.exit(1)

    # Read and fingerprint documents
    docs = {}
    for f in args.files:
        if os.path.exists(f):
            text = read_document(f)
            docs[f] = fingerprint(text, args.ngram)
        else:
            print(f"Warning: {f} does not exist", file=sys.stderr)

    if len(docs) < 2:
        print("Need at least 2 files to compare")
        sys.exit(1)

    print("\nSherlock Plagiarism Analysis")
    print("=" * 50)

    # Compare all pairs
    files = list(docs.keys())
    for i in range(len(files)):
        for j in range(i + 1, len(files)):
            sim = compare_documents(docs[files[i]], docs[files[j]])
            name1 = os.path.basename(files[i])
            name2 = os.path.basename(files[j])
            flag = " ⚠️ SUSPICIOUS" if sim >= args.threshold else ""
            print(f"{name1} <-> {name2}: {sim:.1%}{flag}")

    print("=" * 50)


if __name__ == '__main__':
    main()
SHERLOCKEOF
  chmod +x "$SHERLOCK_DIR/sherlock.py"
}

if [[ -d "$SHERLOCK_DIR" ]]; then
  warn "Sherlock already installed at $SHERLOCK_DIR"
elif command -v git &>/dev/null; then
  # There are several Sherlock implementations; try a Python one first.
  # Clone errors are silenced on purpose: any failure is handled by
  # installing the bundled fallback instead.
  if ! git clone --depth 1 https://github.com/Zedeldi/sherlock-py.git "$SHERLOCK_DIR" 2>/dev/null; then
    warn "Could not clone sherlock-py, trying alternative..."
    write_sherlock_fallback
  fi
  success "Sherlock installed at $SHERLOCK_DIR"
else
  # The bundled checker does not need git, so install it rather than
  # skipping Sherlock altogether.
  warn "Git not available, installing bundled Sherlock fallback instead"
  write_sherlock_fallback
  success "Sherlock installed at $SHERLOCK_DIR"
fi
# ------------------------------------------------------------------------------
# 3. Ferret (Java-based) - Optional
# ------------------------------------------------------------------------------
# Nothing is downloaded here: when Java is present and the Ferret directory
# does not exist yet, the directory is created and manual download
# instructions are printed. If the directory already exists, nothing is shown.
printf '%s\n' "" "=== 3. Checking for Ferret (Java-based plagiarism tool) ==="

if ! command -v java &>/dev/null; then
  warn "Java not installed, skipping Ferret"
else
  FERRET_DIR="$INSTALL_DIR/ferret"
  if [[ ! -d "$FERRET_DIR" ]]; then
    mkdir -p "$FERRET_DIR"
    printf '%s\n' \
      "Ferret is a Java-based tool from University of Hertfordshire." \
      "Download manually from: https://homepages.herts.ac.uk/~comqcln/Ferret/" \
      "Place JAR file in: $FERRET_DIR"
    warn "Ferret requires manual download (academic license)"
  fi
fi
# ------------------------------------------------------------------------------
# 4. WCopyfind via Wine (Optional)
# ------------------------------------------------------------------------------
# Purely informational: prints where to get WCopyfind and how to run it,
# with the wording depending on whether Wine is on the PATH.
printf '%s\n' "" "=== 4. WCopyfind Information (Windows tool, needs Wine) ==="

if ! command -v wine &>/dev/null; then
  printf '%s\n' \
    "Wine not installed. To use WCopyfind:" \
    " 1. Install wine: sudo apt install wine (or equivalent)" \
    " 2. Download WCopyfind from: https://plagiarism.bloomfieldmedia.com/software/wcopyfind/"
  warn "WCopyfind skipped (Wine not available)"
else
  printf '%s\n' \
    "Wine is available. WCopyfind can be run via Wine." \
    "Download from: https://plagiarism.bloomfieldmedia.com/software/wcopyfind/" \
    "Run with: wine /path/to/WCopyfind.exe"
  warn "WCopyfind requires manual download"
fi
# ------------------------------------------------------------------------------
# Summary
# ------------------------------------------------------------------------------
echo ""
echo "=============================================="
echo " Installation Complete!"
echo "=============================================="
echo ""
echo "Installed tools:"
echo ""
echo "1. Python NLP Plagiarism Checker (TF-IDF, cosine similarity)"
echo " Usage: plagcheck file1.txt file2.txt"
echo " plagcheck --dir /path/to/documents/ --detailed"
echo " Location: $INSTALL_DIR/check_plagiarism.py"
echo ""
echo "2. Sherlock (n-gram fingerprinting)"
# Report what is actually on disk instead of assuming sherlock.py exists:
# the Sherlock step may have been skipped, or the directory may hold a
# checkout with a different layout.
if [[ -f "${SHERLOCK_DIR:-}/sherlock.py" ]]; then
  echo " Location: $SHERLOCK_DIR/sherlock.py"
elif [[ -d "${SHERLOCK_DIR:-}" ]]; then
  echo " Location: $SHERLOCK_DIR"
else
  echo " Not installed (see messages above)"
fi
echo ""
echo "3. Python virtual environment with NLP libraries:"
echo " - scikit-learn (TF-IDF, cosine similarity)"
echo " - NLTK (tokenization, stopwords)"
echo " - spaCy (NLP processing)"
echo " - gensim (document similarity)"
echo " - textdistance, fuzzywuzzy (string matching)"
echo " Activate with: source $VENV_DIR/bin/activate"
echo ""
echo "Quick Start:"
echo " plagcheck thesis_v1.pdf thesis_v2.pdf --detailed"
echo " plagcheck --dir ./student_papers/ --threshold 0.4"
echo ""
echo "=============================================="

# PATH reminder, shown once and only when ~/.local/bin is not already on the
# PATH of the shell running this installer. The export line is single-quoted
# so it is printed literally for the user to copy.
if [[ ":$PATH:" != *":$HOME/.local/bin:"* ]]; then
  warn "Add ~/.local/bin to your PATH by adding this to ~/.bashrc or ~/.zshrc:"
  echo ' export PATH="$HOME/.local/bin:$PATH"'
fi