feat: made gpt_chunks work on all files

This commit is contained in:
Krzysztof R. 2024-01-06 21:33:32 +01:00
parent fc355386b9
commit 97bf546c75
5 changed files with 22 additions and 129 deletions

View File

@ -1 +0,0 @@
both bulbs a and c / still have / a closed path

View File

@ -790,9 +790,28 @@ def chunk_sentences(input_file, output):
# Usage
# Single-file paths consumed by the one-off chunk_sentences call further down.
FILE_PATH = "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt"
OUTPUT_PATH = "output.txt"
# Input files and the output file each one is chunked into. The two lists are
# paired by position (see the zip below), so they must keep the same length
# and order; zip silently stops at the shorter list.
input_files = [
    "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt",
    "test_goldStandard/student/STSint.testinput.answers-students.sent2.txt",
    "test_goldStandard/images/STSint.testinput.images.sent1.txt",
    "test_goldStandard/images/STSint.testinput.images.sent2.txt",
    "test_goldStandard/headlines/STSint.testinput.headlines.sent1.txt",
    "test_goldStandard/headlines/STSint.testinput.headlines.sent2.txt"
]
output_files = [
    "test_goldStandard/student/students-chunks-gpt-one.txt",
    "test_goldStandard/student/students-chunks-gpt-two.txt",
    "test_goldStandard/images/images-chunks-gpt-one.txt",
    "test_goldStandard/images/images-chunks-gpt-two.txt",
    "test_goldStandard/headlines/headlines-chunks-gpt-one.txt",
    "test_goldStandard/headlines/headlines-chunks-gpt-two.txt"
]
# The API key is read from the API_KEY environment variable; an unset
# variable raises KeyError on this line.
client = OpenAI(api_key=os.environ['API_KEY'])
# NOTE(review): FILE_PATH is identical to input_files[0], so that file is
# chunked twice: once here into OUTPUT_PATH and again in the loop below.
# Confirm whether this single-file call is still wanted.
chunk_sentences(FILE_PATH, OUTPUT_PATH)
for input_path, output_path in zip(input_files, output_files):
    chunk_sentences(input_path, output_path)
    print("FINISHED GPT CHUNKS FOR FILE: ", input_path)

View File

@ -1,98 +0,0 @@
import re
def read_file(file_path):
    """Return the lines of a text file as a list of strings.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's lines in order, each still carrying its line ending as
        produced by ``readlines()``; an empty list for an empty file.
    """
    with open(file_path, 'r') as file:
        return file.readlines()
def brackets(lines):
    """Wrap every non-bracketed run of text in square brackets.

    Each line is split into ``[...]`` groups and the text between them.
    Existing groups are kept verbatim; any other segment that is not empty
    or pure whitespace is stripped and enclosed in brackets. The pieces are
    concatenated without a separator.

    Args:
        lines: Iterable of strings to reformat.

    Returns:
        A list with one reformatted string per input line (no trailing
        newline is added).
    """
    # The capturing group makes split() return the bracketed groups as well
    # as the text between them.
    bracketed_group = re.compile(r'(\[.*?\])')
    reformatted_lines = []
    for line in lines:
        pieces = []
        for segment in bracketed_group.split(line):
            if segment.startswith('[') and segment.endswith(']'):
                # Already bracketed: keep as is.
                pieces.append(segment)
            elif segment.strip():
                pieces.append(f"[{segment.strip()}]")
        reformatted_lines.append(''.join(pieces))
    return reformatted_lines
def reformat_chunk_number(lines):
    """Merge runs of consecutively numbered ``[chunk N] text`` lines.

    A line counts as a chunk line when it starts with ``[chunk N]``
    (case-insensitive). Its text becomes ``[text]``. Chunk lines whose
    numbers increase by exactly one are joined with single spaces into one
    output line; a number that breaks the sequence starts a new output
    line. Any other line ends the current run and is emitted stripped.

    Args:
        lines: Iterable of strings to reformat.

    Returns:
        A list of reformatted strings.
    """
    chunk_header = re.compile(r'\[chunk (\d+)\]', re.IGNORECASE)
    reformatted_lines = []
    current_chunk = []
    last_chunk_number = 0
    for line in lines:
        header = chunk_header.match(line)
        if header is None:
            # A non-chunk line closes the run in progress and resets numbering.
            if current_chunk:
                reformatted_lines.append(' '.join(current_chunk))
                current_chunk = []
            last_chunk_number = 0
            reformatted_lines.append(line.strip())
            continue

        chunk_number = int(header.group(1))
        # A number that does not continue the sequence closes the run first.
        if chunk_number != last_chunk_number + 1 and current_chunk:
            reformatted_lines.append(' '.join(current_chunk))
            current_chunk = []
        # Everything after the first ']' is the chunk's text.
        sentence = line.split(']', 1)[1].strip()
        current_chunk.append(f"[{sentence}]")
        last_chunk_number = chunk_number

    # Emit the run still open at end of input.
    if current_chunk:
        reformatted_lines.append(' '.join(current_chunk))
    return reformatted_lines
def reformat_slash_separated_sentences(lines):
    """Turn ``a / b / c`` lines into ``[a] [b] [c]``.

    A line containing ``/`` is split at every slash; each segment is
    stripped, wrapped in brackets, and the results are joined with single
    spaces. Lines without a slash are returned stripped.

    Args:
        lines: Iterable of strings to reformat.

    Returns:
        A list with one reformatted string per input line.
    """
    reformatted_lines = []
    for line in lines:
        if '/' in line:
            bracketed = (f"[{segment.strip()}]" for segment in line.split('/'))
            reformatted_lines.append(' '.join(bracketed))
        else:
            reformatted_lines.append(line.strip())
    return reformatted_lines
# File path
file_path = 'test_goldStandard/student/chunks_gpt_student_one.txt'
# Reformat the file content: merge numbered chunks, bracket slash-separated
# segments, then bracket whatever text is still bare.
lines = read_file(file_path)
reformated_text = reformat_chunk_number(lines)
reformated_text = reformat_slash_separated_sentences(reformated_text)
reformated_text = brackets(reformated_text)
print(reformated_text)
# NOTE(review): the input is "student_one" but the output is named
# "student_two" — confirm the intended file name.
output_path = 'chunks_gpt_student_two_reformated.txt'
with open(output_path, 'w') as output_file:
    # One reformatted sentence per line.
    for sentence in reformated_text:
        output_file.write(sentence + '\n')

View File

@ -1,26 +0,0 @@
import re
def remove_empty_brackets(input_string):
    """Delete empty bracket pairs and tidy the whitespace left behind.

    Args:
        input_string: Text that may contain ``[]`` or whitespace-only
            pairs such as ``[  ]``.

    Returns:
        The text with those pairs removed, every run of two or more
        whitespace characters replaced by one space, and leading and
        trailing whitespace stripped.
    """
    without_empty = re.sub(r'\[\s*\]', '', input_string)
    return re.sub(r'\s{2,}', ' ', without_empty).strip()
def reformat_brackets(input_string):
    """Normalise bracket padding to exactly one space on each side.

    Args:
        input_string: Text containing ``[...]`` groups.

    Returns:
        The text with every ``[...]`` group rewritten as ``[ content ]``,
        where the whitespace directly inside the brackets is replaced by a
        single space on each side. Text outside brackets is untouched.
    """
    return re.sub(r'\[\s*(.*?)\s*\]', r'[ \1 ]', input_string)
def process_file(input_file_path, output_file_path):
    """Clean the bracket formatting of every line in a file.

    Each input line has its empty bracket pairs removed and the padding of
    the remaining brackets normalised, then is written followed by a
    newline.

    Args:
        input_file_path: Path of the file to read.
        output_file_path: Path of the file to write; it is opened in ``'w'``
            mode, so existing content is replaced.
    """
    # The input is fully read and closed before the output is opened.
    with open(input_file_path, 'r') as file:
        lines = file.readlines()
    # Apply both remove_empty_brackets and reformat_brackets to each line.
    processed_lines = [
        reformat_brackets(remove_empty_brackets(line)) + '\n' for line in lines
    ]
    with open(output_file_path, 'w') as file:
        file.writelines(processed_lines)
# Example usage: clean the brackets in "output.txt" and write the result to
# "reformated.txt".
input_file_path, output_file_path = "output.txt", "reformated.txt"
process_file(input_file_path, output_file_path)

View File

@ -1 +0,0 @@
both bulbs a and c still have a closed path