WUT_Computer_Science/compare_chunks.py



def reformat_sentence(sentence):
    """
    Reformats a sentence by replacing slashes '/' with ' ]'.
    """
    return sentence.replace('/', ' ]')

def insert_commas(sentence, reference_sentence):
    """
    Inserts commas into 'sentence' at the same positions as they appear in 'reference_sentence'.
    """
    # Splitting the sentences into chunks
    sentence_chunks = sentence.split(' ] [ ')
    reference_chunks = reference_sentence.split(' ] [ ')

    # Insert commas into the original sentence based on the reference sentence
    for i, chunk in enumerate(reference_chunks):
        if ',' in chunk and i < len(sentence_chunks):
            sentence_chunks[i] += ','

    # Reconstruct the sentence with inserted commas
    return ' ] [ '.join(sentence_chunks)


def process_sentences(sentences1, sentences2):
    """
    Processes two lists of sentences according to the specified rules.
    """
    processed_sentences = []

    for sentence1, sentence2 in zip(sentences1, sentences2):
         # Reformat the first sentence
        reformatted_sentence1 = reformat_sentence(sentence1)
        reformatted_sentence1 = insert_commas(reformatted_sentence1, sentence2)
        # Splitting the sentences into words, ignoring the square brackets
        words1 = set(' '.join(reformatted_sentence1.strip().split('] [')).replace('[', '').replace(']', '').lower().split())
        words2 = set(' '.join(sentence2.strip().split('] [')).replace('[', '').replace(']', '').lower().split())

        # Finding differences in words
        diff = words2 - words1

        # Preparing the output sentence
        output_sentence = reformatted_sentence1.strip()
        if '[ . ]' in sentence2:
            output_sentence += ' [ . ]'

        # Adding the differences and the output sentence to the processed_sentences list
        processed_sentences.append({
            "sentence1": reformatted_sentence1.strip(),
            "sentence2": sentence2.strip(),
            "differences": diff,
            "merged_sentence": output_sentence
        })
        if diff and diff != {'.'}:
            print("Difference found!")
            print("Sentence 1:", reformatted_sentence1.strip())
            print("Sentence 2:", sentence2.strip())
            print("Differences:", diff)
            print("\n")

    return processed_sentences

# First, let's open and read the contents of both files to understand their structure and content.
file_path_1 = 'test_goldStandard/images/images-chunks-gpt-one.txt'
file_path_2 = 'test_goldStandard/images/STSint.testinput.images.sent1.chunk.txt'

# Reading the files
with open(file_path_1, 'r') as file1, open(file_path_2, 'r') as file2:
    sentences_file_1 = file1.readlines()
    sentences_file_2 = file2.readlines()

# Process the sentences
processed_sentences = process_sentences(sentences_file_1, sentences_file_2)
feat: script for comparing chunks to each other 2024-01-09 21:00:42 +01:00

			`def reformat_sentence(sentence):`
			`"""`
			`Reformats a sentence by replacing slashes '/' with ' ]'.`
			`"""`
			`return sentence.replace('/', ' ]')`

			`def insert_commas(sentence, reference_sentence):`
			`"""`
			`Inserts commas into 'sentence' at the same positions as they appear in 'reference_sentence'.`
			`"""`
			`# Splitting the sentences into chunks`
			`sentence_chunks = sentence.split(' ] [ ')`
			`reference_chunks = reference_sentence.split(' ] [ ')`

			`# Insert commas into the original sentence based on the reference sentence`
			`for i, chunk in enumerate(reference_chunks):`
			`if ',' in chunk and i < len(sentence_chunks):`
			`sentence_chunks[i] += ','`

			`# Reconstruct the sentence with inserted commas`
			`return ' ] [ '.join(sentence_chunks)`


			`def process_sentences(sentences1, sentences2):`
			`"""`
			`Processes two lists of sentences according to the specified rules.`
			`"""`
			`processed_sentences = []`

			`for sentence1, sentence2 in zip(sentences1, sentences2):`
			`# Reformat the first sentence`
			`reformatted_sentence1 = reformat_sentence(sentence1)`
			`reformatted_sentence1 = insert_commas(reformatted_sentence1, sentence2)`
			`# Splitting the sentences into words, ignoring the square brackets`
			`words1 = set(' '.join(reformatted_sentence1.strip().split('] [')).replace('[', '').replace(']', '').lower().split())`
			`words2 = set(' '.join(sentence2.strip().split('] [')).replace('[', '').replace(']', '').lower().split())`

			`# Finding differences in words`
			`diff = words2 - words1`

			`# Preparing the output sentence`
			`output_sentence = reformatted_sentence1.strip()`
			`if '[ . ]' in sentence2:`
			`output_sentence += ' [ . ]'`

			`# Adding the differences and the output sentence to the processed_sentences list`
			`processed_sentences.append({`
			`"sentence1": reformatted_sentence1.strip(),`
			`"sentence2": sentence2.strip(),`
			`"differences": diff,`
			`"merged_sentence": output_sentence`
			`})`
			`if diff and diff != {'.'}:`
			`print("Difference found!")`
			`print("Sentence 1:", reformatted_sentence1.strip())`
			`print("Sentence 2:", sentence2.strip())`
			`print("Differences:", diff)`
			`print("\n")`

			`return processed_sentences`

			`# First, let's open and read the contents of both files to understand their structure and content.`
			`file_path_1 = 'test_goldStandard/images/images-chunks-gpt-one.txt'`
			`file_path_2 = 'test_goldStandard/images/STSint.testinput.images.sent1.chunk.txt'`

			`# Reading the files`
			`with open(file_path_1, 'r') as file1, open(file_path_2, 'r') as file2:`
			`sentences_file_1 = file1.readlines()`
			`sentences_file_2 = file2.readlines()`

			`# Process the sentences`
			`processed_sentences = process_sentences(sentences_file_1, sentences_file_2)`