mirror of
https://github.com/kuhyx/WUT_Computer_Science.git
synced 2026-07-04 21:43:08 +02:00
74 lines
2.8 KiB
Python
74 lines
2.8 KiB
Python
|
|
|
||
|
|
|
||
|
|
def reformat_sentence(sentence):
    """
    Return 'sentence' with every slash '/' swapped for the marker ' ]'.

    All other characters are passed through unchanged.
    """
    closing_marker = ' ]'
    # Cutting at each slash and re-joining with the marker substitutes
    # every occurrence, including consecutive ones.
    pieces = sentence.split('/')
    return closing_marker.join(pieces)
|
||
|
|
|
||
|
|
def _append_comma(chunk):
    """
    Return 'chunk' with a comma added at the end of its content.

    When the chunk ends with a closing bracket (optionally followed by
    whitespace such as a line break), the comma is placed before the
    ' ]' so it stays inside the chunk. Otherwise the comma goes at the
    very end, stepping over a trailing line break only.
    """
    text = chunk.rstrip()
    if text.endswith(']'):
        # Cut just before the bracket, and before the single space that
        # normally precedes it, so the result reads '..., ]'.
        cut = len(text) - 1
        if cut > 0 and text[cut - 1] == ' ':
            cut -= 1
    else:
        # No closing bracket: behave like a plain append, but never put
        # the comma after a line terminator.
        cut = len(chunk.rstrip('\r\n'))
    return chunk[:cut] + ',' + chunk[cut:]


def insert_commas(sentence, reference_sentence):
    """
    Inserts commas into 'sentence' at the same chunk positions as they
    appear in 'reference_sentence'.

    Both strings are split on the chunk separator ' ] [ '. For every
    reference chunk that contains a comma, a comma is added to the chunk
    of 'sentence' with the same index. Reference chunks beyond the end of
    'sentence' are ignored.

    Parameters:
        sentence: chunked sentence that receives the commas.
        reference_sentence: chunked sentence whose commas are mirrored.

    Returns:
        The chunks of 'sentence' re-joined with ' ] [ '.
    """
    separator = ' ] [ '
    sentence_chunks = sentence.split(separator)
    reference_chunks = reference_sentence.split(separator)

    # Only indices present in both chunk lists can receive a comma.
    shared = min(len(sentence_chunks), len(reference_chunks))
    for i in range(shared):
        if ',' in reference_chunks[i]:
            # The last chunk still carries the closing ' ]' (and possibly
            # a newline); a blind '+= ","' would land the comma after it.
            sentence_chunks[i] = _append_comma(sentence_chunks[i])

    # Reconstruct the sentence with inserted commas
    return separator.join(sentence_chunks)
|
||
|
|
|
||
|
|
|
||
|
|
def process_sentences(sentences1, sentences2):
    """
    Build one comparison record for each pair drawn from the two lists.

    Each item of 'sentences1' is passed through reformat_sentence and then
    insert_commas (using the paired item of 'sentences2' as the reference).
    The words of both sentences are compared case-insensitively with the
    square brackets ignored. Pairing uses zip, so surplus items in the
    longer list are not processed.

    Returns a list of dicts with the keys "sentence1", "sentence2",
    "differences" (words found only in the second sentence) and
    "merged_sentence". Pairs whose differences contain anything other
    than '.' are also reported on standard output.
    """
    def bare_words(text):
        # Lower-cased whitespace-separated tokens with brackets dropped.
        unbracketed = text.replace('[', '').replace(']', '')
        return set(unbracketed.lower().split())

    records = []
    for raw_first, raw_second in zip(sentences1, sentences2):
        first = insert_commas(reformat_sentence(raw_first), raw_second).strip()
        second = raw_second.strip()

        # Words present in the second sentence but absent from the first.
        missing = bare_words(raw_second) - bare_words(first)

        # Carry over the final full-stop chunk when the second sentence has one.
        merged = first
        if '[ . ]' in raw_second:
            merged = merged + ' [ . ]'

        records.append({
            "sentence1": first,
            "sentence2": second,
            "differences": missing,
            "merged_sentence": merged,
        })

        # A lone '.' is not worth reporting.
        if not (missing - {'.'}):
            continue
        print("Difference found!")
        print("Sentence 1:", first)
        print("Sentence 2:", second)
        print("Differences:", missing)
        print("\n")

    return records
|
||
|
|
|
||
|
|
# Input files: the first is handed to process_sentences as the sentences to
# reformat, the second as the reference sentences they are compared against.
file_path_1 = 'test_goldStandard/images/images-chunks-gpt-one.txt'
file_path_2 = 'test_goldStandard/images/STSint.testinput.images.sent1.chunk.txt'

# Read both files line by line (line endings are kept). The encoding is given
# explicitly so decoding does not depend on the platform's locale default.
with open(file_path_1, 'r', encoding='utf-8') as file1, \
        open(file_path_2, 'r', encoding='utf-8') as file2:
    sentences_file_1 = file1.readlines()
    sentences_file_2 = file2.readlines()

# Process the sentences
processed_sentences = process_sentences(sentences_file_1, sentences_file_2)
|