mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 18:03:07 +02:00
git-subtree-dir: linux_configuration git-subtree-mainline: 11427631cd git-subtree-split: 0762e3d07b
533 lines
17 KiB
Bash
Executable File
533 lines
17 KiB
Bash
Executable File
#!/usr/bin/env bash
|
||
# Install Free & Open Source Plagiarism Detection Tools for Text
|
||
# Suitable for academic work (theses, papers, etc.)
|
||
#
|
||
# Tools installed:
|
||
# 1. Python NLP-based similarity detection (sklearn, NLTK, spaCy)
|
||
# 2. Sherlock text plagiarism detector
|
||
# 3. Ferret (Java-based, if Java available)
|
||
# 4. Optional: WCopyfind via Wine (Windows tool)
|
||
|
||
# Strict mode: stop on command failure, on unset variables, and on a failure
# anywhere inside a pipeline.
set -euo pipefail

# Target locations for the generated scripts and the dedicated Python venv.
INSTALL_DIR="${HOME}/.local/share/plagiarism-tools"
VENV_DIR="${HOME}/.local/share/plagiarism-venv"

# Installer banner (one argument per output line, followed by a blank line).
printf '%s\n' \
  "==============================================" \
  " Open Source Plagiarism Detection Installer" \
  " For Academic Text (Theses, Papers, etc.)" \
  "==============================================" \
  ""

# Colors for output. The escape sequences are stored unexpanded and are only
# interpreted when the message helpers print them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
|
||
|
||
# Message helpers. Each takes the message text as $1.
# printf is used with %b for the color codes only and %s for the message, so
# backslashes inside the message (e.g. in a path) are printed literally
# instead of being interpreted as escape sequences.

# Print a green "✓" line to stdout.
success() { printf '%b✓ %s%b\n' "$GREEN" "$1" "$NC"; }

# Print a yellow "⚠" line to stdout.
warn() { printf '%b⚠ %s%b\n' "$YELLOW" "$1" "$NC"; }

# Print a red "✗" line to stderr so it is not lost when stdout is redirected.
error() { printf '%b✗ %s%b\n' "$RED" "$1" "$NC" >&2; }
|
||
|
||
# Create installation directory
mkdir -p "$INSTALL_DIR"

# ------------------------------------------------------------------------------
# 1. Python-based NLP Plagiarism Detection Environment
# ------------------------------------------------------------------------------
echo ""
echo "=== 1. Installing Python NLP-based Plagiarism Tools ==="

# Check for Python 3
if ! command -v python3 &> /dev/null; then
  error "Python 3 is required but not installed."
  exit 1
fi

# Create virtual environment.
# The activate script is tested rather than the directory, so a directory left
# behind by an interrupted or failed `python3 -m venv` is rebuilt instead of
# being mistaken for a usable environment.
if [ ! -f "$VENV_DIR/bin/activate" ]; then
  echo "Creating Python virtual environment..."
  python3 -m venv "$VENV_DIR"
  success "Virtual environment created at $VENV_DIR"
else
  warn "Virtual environment already exists at $VENV_DIR"
fi

# Activate and install packages
# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"

echo "Installing Python packages for text similarity detection..."
pip install --upgrade pip

pip install --progress-bar on \
  scikit-learn \
  nltk \
  spacy \
  gensim \
  numpy \
  pandas \
  python-docx \
  PyPDF2 \
  beautifulsoup4 \
  lxml \
  textdistance \
  fuzzywuzzy \
  python-Levenshtein

success "Python NLP packages installed"

# Download NLTK data.
# nltk.download() signals failure through its return value, so the results are
# turned into the exit status; every package is attempted before deciding.
# A failure only produces a warning because the data can be fetched later.
echo "Downloading NLTK data (stopwords, punkt tokenizer)..."
if python3 -c "
import sys
import nltk
packages = ('punkt', 'stopwords', 'punkt_tab', 'averaged_perceptron_tagger', 'wordnet')
results = [nltk.download(name) for name in packages]
sys.exit(0 if all(results) else 1)
"; then
  success "NLTK data downloaded"
else
  warn "Some NLTK data could not be downloaded; retry later from inside the virtual environment"
fi

# Download spaCy English model (small).
# Success is reported only when the download command actually succeeded.
echo "Downloading spaCy English model..."
if python3 -m spacy download en_core_web_sm 2> /dev/null; then
  success "spaCy model installed"
else
  warn "spaCy model download may need manual install: python -m spacy download en_core_web_sm"
fi
|
||
|
||
# Create a simple plagiarism checker script
|
||
cat > "$INSTALL_DIR/check_plagiarism.py" << 'PYEOF'
|
||
#!/usr/bin/env python3
|
||
"""
|
||
Simple Text Plagiarism Checker
|
||
Compares documents using multiple similarity algorithms.
|
||
|
||
Usage:
|
||
python check_plagiarism.py file1.txt file2.txt [file3.txt ...]
|
||
python check_plagiarism.py --dir /path/to/documents/
|
||
"""
|
||
|
||
import argparse
|
||
import os
|
||
import sys
|
||
from pathlib import Path
|
||
from typing import List, Tuple
|
||
|
||
import numpy as np
|
||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
from sklearn.metrics.pairwise import cosine_similarity
|
||
import nltk
|
||
from nltk.corpus import stopwords
|
||
from nltk.tokenize import word_tokenize, sent_tokenize
|
||
|
||
# Ensure NLTK data is available.
# Each resource used by this script is probed on its own: stopwords for
# preprocess_text, and the punkt tokenizer data (both the classic 'punkt' and
# the 'punkt_tab' package that the installer also downloads) for
# word_tokenize / sent_tokenize. Probing only stopwords would leave a missing
# tokenizer model undetected until the first tokenize call.
for _resource_path, _package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name, quiet=True)
|
||
|
||
|
||
def read_file(filepath: str) -> str:
    """Return the text content of *filepath*.

    The reader is chosen from the lower-cased file extension:

    * ``.pdf``  - text of every page, joined with spaces (needs PyPDF2).
    * ``.docx`` - text of every paragraph, joined with spaces (needs
      python-docx).
    * anything else - read as UTF-8 text, dropping undecodable bytes.

    When the optional reader library cannot be imported, a warning is printed
    and an empty string is returned.
    """
    extension = Path(filepath).suffix.lower()

    if extension == '.pdf':
        try:
            from PyPDF2 import PdfReader
            pdf_pages = PdfReader(filepath).pages
            # extract_text() may yield nothing for a page; treat that as ''.
            return ' '.join(page.extract_text() or '' for page in pdf_pages)
        except ImportError:
            print("Warning: PyPDF2 not installed, cannot read PDF files")
            return ""

    if extension == '.docx':
        try:
            from docx import Document
            paragraphs = Document(filepath).paragraphs
            return ' '.join(para.text for para in paragraphs)
        except ImportError:
            print("Warning: python-docx not installed, cannot read DOCX files")
            return ""

    # Assume plain text
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
        content = handle.read()
    return content
|
||
|
||
|
||
def preprocess_text(text: str) -> str:
    """Normalise *text* for similarity comparison.

    The text is lower-cased, tokenised with NLTK, and reduced to alphanumeric
    tokens that are not English stopwords; the survivors are joined with
    single spaces. If anything in that pipeline raises, the function falls
    back to the lower-cased text with whitespace collapsed.
    """
    lowered = text.lower()
    try:
        ignored = set(stopwords.words('english'))
        kept = []
        for token in word_tokenize(lowered):
            if token.isalnum() and token not in ignored:
                kept.append(token)
        return ' '.join(kept)
    except Exception:
        # Fallback: simple preprocessing (whitespace normalisation only)
        pieces = lowered.split()
        return ' '.join(pieces)
|
||
|
||
|
||
def compute_similarity_matrix(documents: List[str]) -> np.ndarray:
    """Compute the pairwise TF-IDF cosine similarity of *documents*.

    Args:
        documents: Preprocessed document texts.

    Returns:
        A square matrix where entry ``[i][j]`` is the cosine similarity of
        documents ``i`` and ``j``. If the vectorizer cannot build a vocabulary
        (for example, every document is empty after preprocessing), an
        all-zero matrix is returned so callers report "no similarity" rather
        than crashing.
    """
    # No max_df cut-off: with a fractional max_df, scikit-learn drops every
    # term whose document count exceeds max_df * n_documents. For small
    # comparisons (two files in particular) that removes exactly the terms
    # shared by all documents, which is the signal a plagiarism check needs.
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),  # Use unigrams, bigrams, trigrams
        min_df=1,
    )
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError:
        # Raised by the vectorizer when no terms are available to compare.
        size = len(documents)
        return np.zeros((size, size))
    return cosine_similarity(tfidf_matrix)
|
||
|
||
|
||
def find_similar_passages(text1: str, text2: str, min_words: int = 5,
                          threshold: float = 0.5) -> List[Tuple[str, str, float]]:
    """Find similar sentence-level passages between two texts.

    Args:
        text1: Raw text of the first document.
        text2: Raw text of the second document.
        min_words: Sentences with fewer whitespace-separated words are ignored.
        threshold: A sentence pair is reported only when its cosine
            similarity is strictly greater than this value.

    Returns:
        ``(sentence_from_text1, sentence_from_text2, similarity)`` tuples,
        sorted from most to least similar. Empty when either text has no
        sufficiently long sentence or no vocabulary can be built.
    """
    # Filter short sentences
    sentences1 = [s for s in sent_tokenize(text1) if len(s.split()) >= min_words]
    sentences2 = [s for s in sent_tokenize(text2) if len(s.split()) >= min_words]

    if not sentences1 or not sentences2:
        return []

    # Fit one vectorizer over both sentence sets so they share a vocabulary.
    preprocessed = [preprocess_text(s) for s in sentences1 + sentences2]

    try:
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform(preprocessed)
    except ValueError:
        return []

    n1 = len(sentences1)
    # One matrix call yields every text1-vs-text2 sentence similarity at once,
    # instead of one library call per sentence pair.
    pair_scores = cosine_similarity(tfidf_matrix[:n1], tfidf_matrix[n1:])

    similarities = []
    for i, s1 in enumerate(sentences1):
        for j, s2 in enumerate(sentences2):
            sim = float(pair_scores[i, j])
            if sim > threshold:  # Threshold for suspicious similarity
                similarities.append((s1, s2, sim))

    return sorted(similarities, key=lambda x: x[2], reverse=True)
|
||
|
||
|
||
def main():
    """Command-line entry point: compare documents and print a report.

    Files come from positional arguments and/or ``--dir`` (which collects
    ``*.txt``, ``*.pdf``, ``*.docx``, ``*.md`` and ``*.tex``). Every pair of
    readable, non-empty documents is scored; pairs at or above ``--threshold``
    are flagged, and with ``--detailed`` the most similar sentences of the
    three highest-scoring flagged pairs are shown.

    Exits with status 1 when fewer than two files are given or fewer than two
    of them are usable.
    """
    parser = argparse.ArgumentParser(
        description='Text Plagiarism Checker - Compare documents for similarity'
    )
    parser.add_argument('files', nargs='*', help='Files to compare')
    parser.add_argument('--dir', '-d', help='Directory containing documents to compare')
    parser.add_argument('--threshold', '-t', type=float, default=0.3,
                        help='Similarity threshold for flagging (0-1, default: 0.3)')
    parser.add_argument('--detailed', '-v', action='store_true',
                        help='Show detailed similar passages')

    args = parser.parse_args()

    # Collect files
    candidates = list(args.files)
    if args.dir:
        dir_path = Path(args.dir)
        for ext in ['*.txt', '*.pdf', '*.docx', '*.md', '*.tex']:
            # Sorted so the report order does not depend on directory order.
            candidates.extend(str(f) for f in sorted(dir_path.glob(ext)))

    # Drop repeated paths (e.g. a file given both positionally and via --dir),
    # which would otherwise be compared against themselves.
    files = []
    seen_paths = set()
    for candidate in candidates:
        key = os.path.abspath(candidate)
        if key not in seen_paths:
            seen_paths.add(key)
            files.append(candidate)

    if len(files) < 2:
        print("Error: Need at least 2 files to compare")
        parser.print_help()
        sys.exit(1)

    print(f"\n{'='*60}")
    print(f" Plagiarism Check - Analyzing {len(files)} documents")
    print(f"{'='*60}\n")

    # Read and preprocess documents.
    # documents, raw_texts and filenames are index-aligned; skipped files
    # appear in none of them, so indices from the similarity matrix always
    # refer to the right document.
    documents = []
    raw_texts = []
    filenames = []
    for f in files:
        if not os.path.exists(f):
            print(f"Warning: {f} does not exist")
            continue
        text = read_file(f)
        if not text.strip():
            print(f"Warning: {f} is empty or unreadable")
            continue
        documents.append(preprocess_text(text))
        raw_texts.append(text)
        filenames.append(os.path.basename(f))

    if len(documents) < 2:
        print("Error: Not enough valid documents to compare")
        sys.exit(1)

    # Compute similarity
    print("Computing document similarities...\n")
    sim_matrix = compute_similarity_matrix(documents)

    # Report results
    print(f"{'Document Pair':<50} {'Similarity':>12}")
    print("-" * 62)

    suspicious_pairs = []
    for i in range(len(documents)):
        for j in range(i + 1, len(documents)):
            similarity = sim_matrix[i][j]
            pair_name = f"{filenames[i]} <-> {filenames[j]}"

            if similarity >= args.threshold:
                suspicious_pairs.append((i, j, similarity, pair_name))
                print(f"{pair_name:<50} {similarity:>10.1%} ⚠️")
            else:
                print(f"{pair_name:<50} {similarity:>10.1%}")

    print("-" * 62)

    # Summary
    if suspicious_pairs:
        print(f"\n⚠️ {len(suspicious_pairs)} pair(s) exceed {args.threshold:.0%} similarity threshold\n")

        if args.detailed:
            print("\n" + "="*60)
            print(" Detailed Similar Passages")
            print("="*60)

            # Highest similarity first, so the limit below really is a top 3.
            ranked_pairs = sorted(suspicious_pairs, key=lambda pair: pair[2], reverse=True)
            for i, j, sim, pair_name in ranked_pairs[:3]:  # Limit to top 3
                print(f"\n{pair_name} ({sim:.1%} similar):")
                print("-" * 40)

                # Reuse the text read above instead of re-reading from disk.
                passages = find_similar_passages(raw_texts[i], raw_texts[j])

                for s1, s2, psim in passages[:5]:  # Top 5 passages
                    print(f"\n[{psim:.0%}] Document 1: \"{s1[:100]}...\"")
                    print(f" Document 2: \"{s2[:100]}...\"")
    else:
        print(f"\n✓ No document pairs exceed {args.threshold:.0%} similarity threshold")

    print("\n" + "="*60)
    print(" Analysis complete")
    print("="*60 + "\n")
|
||
|
||
|
||
if __name__ == '__main__':
|
||
main()
|
||
PYEOF
|
||
|
||
# Make the generated checker directly executable.
chmod +x "$INSTALL_DIR/check_plagiarism.py"
success "Created plagiarism checker script at $INSTALL_DIR/check_plagiarism.py"

# Create convenience wrapper.
# The heredoc delimiter is deliberately unquoted: $VENV_DIR and $INSTALL_DIR
# are expanded now and written into the wrapper as fixed paths, while the
# escaped \$@ stays literal so the wrapper forwards its own arguments.
WRAPPER_BIN_DIR="$HOME/.local/bin"
mkdir -p "$WRAPPER_BIN_DIR"
cat > "$WRAPPER_BIN_DIR/plagcheck" << WRAPEOF
#!/usr/bin/env bash
# Wrapper for plagiarism checker
source "$VENV_DIR/bin/activate"
python "$INSTALL_DIR/check_plagiarism.py" "\$@"
WRAPEOF
chmod +x "$WRAPPER_BIN_DIR/plagcheck"
success "Created 'plagcheck' command in ~/.local/bin/"

# Leave the virtual environment that was activated for the installation steps.
deactivate
|
||
|
||
# ------------------------------------------------------------------------------
|
||
# 2. Sherlock for Text (Clone from GitHub)
|
||
# ------------------------------------------------------------------------------
|
||
echo ""
|
||
echo "=== 2. Installing Sherlock Text Plagiarism Detector ==="
|
||
|
||
SHERLOCK_DIR="$INSTALL_DIR/sherlock"
|
||
if [ ! -d "$SHERLOCK_DIR" ]; then
|
||
# There are several Sherlock implementations; using a popular Python one
|
||
if command -v git &> /dev/null; then
|
||
# Clone a text-based similarity tool
|
||
git clone --depth 1 https://github.com/Zedeldi/sherlock-py.git "$SHERLOCK_DIR" 2> /dev/null || {
|
||
warn "Could not clone sherlock-py, trying alternative..."
|
||
# Alternative: Create a simple n-gram based sherlock
|
||
mkdir -p "$SHERLOCK_DIR"
|
||
cat > "$SHERLOCK_DIR/sherlock.py" << 'SHERLOCKEOF'
|
||
#!/usr/bin/env python3
|
||
"""
|
||
Sherlock - Simple text plagiarism detector using n-gram fingerprinting.
|
||
Based on the original Sherlock algorithm.
|
||
"""
|
||
|
||
import argparse
|
||
import hashlib
|
||
import os
|
||
import sys
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
|
||
|
||
def tokenize(text: str) -> list:
    """Split *text* into lower-cased alphanumeric word tokens.

    Every non-alphanumeric character acts as a separator, so words with
    attached punctuation ("word," or "end.") are kept as "word" / "end"
    rather than being discarded as a whole.
    """
    separated = ''.join(ch if ch.isalnum() else ' ' for ch in text)
    return [word.lower() for word in separated.split()]
|
||
|
||
|
||
def get_ngrams(tokens: list, n: int = 3) -> list:
    """Return every run of *n* consecutive tokens as a tuple, in order.

    The result is empty when *tokens* holds fewer than *n* items.
    """
    window_count = len(tokens) - n + 1
    grams = []
    for start in range(window_count):
        window = tokens[start:start + n]
        grams.append(tuple(window))
    return grams
|
||
|
||
|
||
def fingerprint(text: str, ngram_size: int = 3, sample_rate: int = 4) -> set:
    """Create a document fingerprint from sampled n-gram hashes.

    Args:
        text: Document text.
        ngram_size: Number of consecutive tokens per n-gram.
        sample_rate: Keep an n-gram when its hash value is divisible by this
            number (roughly one in ``sample_rate``); values below 1 keep all.

    Returns:
        A set of 8-character hexadecimal hash prefixes. Empty only when the
        text yields no n-grams at all.
    """
    tokens = tokenize(text)
    ngrams = get_ngrams(tokens, ngram_size)
    modulus = max(1, sample_rate)

    # Tokens are joined with a space so that different n-grams cannot
    # concatenate to the same string (("ab", "c") vs ("a", "bc")).
    hashes = {
        hashlib.md5(' '.join(ng).encode()).hexdigest()[:8]
        for ng in ngrams
    }

    # Sample by hash value, not by position: the same n-gram is then selected
    # in every document that contains it, even when inserted or deleted words
    # shift its position.
    fingerprints = {h for h in hashes if int(h, 16) % modulus == 0}

    # Short texts may have no hash that passes the filter; keep the smallest
    # one so the fingerprint is still content-derived and non-empty.
    if hashes and not fingerprints:
        fingerprints.add(min(hashes))

    return fingerprints
|
||
|
||
|
||
def compare_documents(fp1: set, fp2: set) -> float:
    """Return the Jaccard similarity of two fingerprints.

    This is the size of the intersection divided by the size of the union;
    0.0 is returned when either fingerprint is empty.
    """
    if not (fp1 and fp2):
        return 0.0
    shared = fp1.intersection(fp2)
    combined = fp1.union(fp2)
    if not combined:
        return 0.0
    return len(shared) / len(combined)
|
||
|
||
|
||
def read_document(filepath: str) -> str:
    """Return the content of *filepath* decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped instead of raising.
    """
    source = Path(filepath)
    return source.read_text(encoding='utf-8', errors='ignore')
|
||
|
||
|
||
def main():
    """Command-line entry point: fingerprint files and report pair similarity.

    Each existing regular file is fingerprinted and every pair is compared;
    pairs whose similarity reaches ``--threshold`` are marked as suspicious.
    Paths that are not regular files are reported on stderr and skipped.

    Exits with status 1 when fewer than two files are given or fewer than two
    of them can be used; an ``--ngram`` value below 1 is rejected as a usage
    error.
    """
    parser = argparse.ArgumentParser(description='Sherlock - Text Plagiarism Detector')
    parser.add_argument('files', nargs='+', help='Files to compare')
    parser.add_argument('--ngram', '-n', type=int, default=3, help='N-gram size (default: 3)')
    parser.add_argument('--threshold', '-t', type=float, default=0.1, help='Similarity threshold')

    args = parser.parse_args()

    if args.ngram < 1:
        parser.error("--ngram must be at least 1")

    if len(args.files) < 2:
        print("Need at least 2 files to compare")
        sys.exit(1)

    # Read and fingerprint documents
    docs = {}
    for f in args.files:
        # isfile (not exists) so that a directory is skipped with a message
        # instead of failing inside open().
        if not os.path.isfile(f):
            print(f"Warning: {f} is not a readable file, skipping", file=sys.stderr)
            continue
        text = read_document(f)
        docs[f] = fingerprint(text, args.ngram)

    # Without this check a run where files were skipped would print an empty
    # report that looks like "nothing suspicious found".
    if len(docs) < 2:
        print("Need at least 2 readable files to compare", file=sys.stderr)
        sys.exit(1)

    print("\nSherlock Plagiarism Analysis")
    print("=" * 50)

    # Compare all pairs
    paths = list(docs)
    for i, first in enumerate(paths):
        for second in paths[i + 1:]:
            sim = compare_documents(docs[first], docs[second])
            name1 = os.path.basename(first)
            name2 = os.path.basename(second)
            flag = " ⚠️ SUSPICIOUS" if sim >= args.threshold else ""
            print(f"{name1} <-> {name2}: {sim:.1%}{flag}")

    print("=" * 50)
|
||
|
||
|
||
if __name__ == '__main__':
|
||
main()
|
||
SHERLOCKEOF
|
||
chmod +x "$SHERLOCK_DIR/sherlock.py"
|
||
}
|
||
success "Sherlock installed at $SHERLOCK_DIR"
|
||
else
|
||
warn "Git not available, skipping Sherlock installation"
|
||
fi
|
||
else
|
||
warn "Sherlock already installed at $SHERLOCK_DIR"
|
||
fi
|
||
|
||
# ------------------------------------------------------------------------------
|
||
# 3. Ferret (Java-based) - Optional
|
||
# ------------------------------------------------------------------------------
|
||
echo ""
echo "=== 3. Checking for Ferret (Java-based plagiarism tool) ==="

if command -v java &> /dev/null; then
  FERRET_DIR="$INSTALL_DIR/ferret"
  mkdir -p "$FERRET_DIR"

  # Look for a JAR rather than for the directory: the directory is created by
  # this script itself, so its existence says nothing about whether Ferret
  # has actually been downloaded.
  ferret_jar_found=false
  for ferret_jar in "$FERRET_DIR"/*.jar; do
    # An unmatched glob stays literal, hence the existence test.
    if [ -e "$ferret_jar" ]; then
      ferret_jar_found=true
      break
    fi
  done

  if [ "$ferret_jar_found" = true ]; then
    success "Ferret JAR found in $FERRET_DIR"
  else
    echo "Ferret is a Java-based tool from University of Hertfordshire."
    echo "Download manually from: https://homepages.herts.ac.uk/~comqcln/Ferret/"
    echo "Place JAR file in: $FERRET_DIR"
    warn "Ferret requires manual download (academic license)"
  fi
else
  warn "Java not installed, skipping Ferret"
fi
|
||
|
||
# ------------------------------------------------------------------------------
|
||
# 4. WCopyfind via Wine (Optional)
|
||
# ------------------------------------------------------------------------------
|
||
# Nothing is installed here; only instructions are printed, depending on
# whether a `wine` command is available.
printf '%s\n' \
  "" \
  "=== 4. WCopyfind Information (Windows tool, needs Wine) ==="

if ! command -v wine &> /dev/null; then
  printf '%s\n' \
    "Wine not installed. To use WCopyfind:" \
    " 1. Install wine: sudo apt install wine (or equivalent)" \
    " 2. Download WCopyfind from: https://plagiarism.bloomfieldmedia.com/software/wcopyfind/"
  warn "WCopyfind skipped (Wine not available)"
else
  printf '%s\n' \
    "Wine is available. WCopyfind can be run via Wine." \
    "Download from: https://plagiarism.bloomfieldmedia.com/software/wcopyfind/" \
    "Run with: wine /path/to/WCopyfind.exe"
  warn "WCopyfind requires manual download"
fi
|
||
|
||
# ------------------------------------------------------------------------------
|
||
# Summary
|
||
# ------------------------------------------------------------------------------
|
||
echo ""
echo "=============================================="
echo " Installation Complete!"
echo "=============================================="
echo ""
echo "Installed tools:"
echo ""
echo "1. Python NLP Plagiarism Checker (TF-IDF, cosine similarity)"
echo " Usage: plagcheck file1.txt file2.txt"
echo " plagcheck --dir /path/to/documents/ --detailed"
echo " Location: $INSTALL_DIR/check_plagiarism.py"
echo ""
echo "2. Sherlock (n-gram fingerprinting)"
# Report what is really on disk: the Sherlock step may have been skipped, and
# a cloned repository is not guaranteed to contain a top-level sherlock.py.
if [ -f "$SHERLOCK_DIR/sherlock.py" ]; then
  echo " Location: $SHERLOCK_DIR/sherlock.py"
elif [ -d "$SHERLOCK_DIR" ]; then
  echo " Location: $SHERLOCK_DIR"
else
  echo " Not installed (see messages above)"
fi
echo ""
echo "3. Python virtual environment with NLP libraries:"
echo " - scikit-learn (TF-IDF, cosine similarity)"
echo " - NLTK (tokenization, stopwords)"
echo " - spaCy (NLP processing)"
echo " - gensim (document similarity)"
echo " - textdistance, fuzzywuzzy (string matching)"
echo " Activate with: source $VENV_DIR/bin/activate"
echo ""
echo "Quick Start:"
echo " plagcheck thesis_v1.pdf thesis_v2.pdf --detailed"
echo " plagcheck --dir ./student_papers/ --threshold 0.4"
echo ""
echo "=============================================="

# PATH reminder, shown only when ~/.local/bin is not already on PATH.
# The export line is single-quoted so it is printed literally for the user to
# copy, not expanded here.
if [[ ":$PATH:" != *":$HOME/.local/bin:"* ]]; then
  warn "Add ~/.local/bin to your PATH by adding this to ~/.bashrc or ~/.zshrc:"
  echo ' export PATH="$HOME/.local/bin:$PATH"'
fi
|