From 97bf546c75874ed2ef2fdc9e20b3c4fb43e3897f Mon Sep 17 00:00:00 2001 From: "Krzysztof R." Date: Sat, 6 Jan 2024 21:33:32 +0100 Subject: [PATCH] feat: made gpt_chunks work on all files --- chunks_gpt_student_two.txt | 1 - gpt_chunks.py | 25 ++++++++-- reformat_response.py | 98 -------------------------------------- remove_empty_brackets.py | 26 ---------- work_with_me.txt | 1 - 5 files changed, 22 insertions(+), 129 deletions(-) delete mode 100644 chunks_gpt_student_two.txt delete mode 100644 reformat_response.py delete mode 100644 remove_empty_brackets.py delete mode 100644 work_with_me.txt diff --git a/chunks_gpt_student_two.txt b/chunks_gpt_student_two.txt deleted file mode 100644 index 77e6127e..00000000 --- a/chunks_gpt_student_two.txt +++ /dev/null @@ -1 +0,0 @@ -both bulbs a and c / still have / a closed path diff --git a/gpt_chunks.py b/gpt_chunks.py index bb4ede98..b0fec3a1 100644 --- a/gpt_chunks.py +++ b/gpt_chunks.py @@ -790,9 +790,28 @@ def chunk_sentences(input_file, output): # Usage -FILE_PATH = "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt" -OUTPUT_PATH = "output.txt" +# Define your input and output file paths +input_files = [ + "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt", + "test_goldStandard/student/STSint.testinput.answers-students.sent2.txt", + "test_goldStandard/images/STSint.testinput.images.sent1.txt", + "test_goldStandard/images/STSint.testinput.images.sent2.txt", + "test_goldStandard/headlines/STSint.testinput.headlines.sent1.txt", + "test_goldStandard/headlines/STSint.testinput.headlines.sent2.txt" +] + +output_files = [ + "test_goldStandard/student/students-chunks-gpt-one.txt", + "test_goldStandard/student/students-chunks-gpt-two.txt", + "test_goldStandard/images/images-chunks-gpt-one.txt", + "test_goldStandard/images/images-chunks-gpt-two.txt", + "test_goldStandard/headlines/headlines-chunks-gpt-one.txt", + "test_goldStandard/headlines/headlines-chunks-gpt-two.txt" +] + # Change me to os.environ['API_KEY'] client = OpenAI(api_key=os.environ['API_KEY']) -chunk_sentences(FILE_PATH, OUTPUT_PATH) +for input_path, output_path in zip(input_files, output_files): + chunk_sentences(input_path, output_path) + print("FINISHED GPT CHUNKS FOR FILE: ", input_path) diff --git a/reformat_response.py b/reformat_response.py deleted file mode 100644 index 3d64c594..00000000 --- a/reformat_response.py +++ /dev/null @@ -1,98 +0,0 @@ -import re - -def read_file(file_path): - lines = [] - with open(file_path, 'r') as file: - lines = file.readlines() - return lines - -def brackets(lines): - reformatted_lines = [] - for line in lines: - # Split the line into segments of bracketed and non-bracketed parts - segments = re.split(r'(\[.*?\])', line) - reformatted_line = "" - - for segment in segments: - # If the segment is already in brackets, keep as is - if segment.startswith('[') and segment.endswith(']'): - reformatted_line += segment - # Else, enclose the segment in brackets if it's not empty or just whitespace - elif segment.strip(): - reformatted_line += f"[{segment.strip()}]" - - reformatted_lines.append(reformatted_line) - - # Join the reformatted lines into a single string - return reformatted_lines - -def reformat_chunk_number(lines): - reformatted_lines = [] - current_chunk = [] - last_chunk_number = 0 - - for line in lines: - # Check if the line starts with a chunk pattern (case-insensitive) - if re.match(r'\[chunk \d+\]', line, re.IGNORECASE): - # Extract the chunk number - chunk_number = int(re.search(r'\d+', line).group()) - - # If the chunk number is sequential, add the sentence to the current_chunk - if chunk_number == last_chunk_number + 1: - sentence = line.split(']', 1)[1].strip() - current_chunk.append(f"[{sentence}]") - last_chunk_number = chunk_number - else: - # Append the current_chunk to reformatted_lines and start a new chunk - if current_chunk: - reformatted_lines.append(' '.join(current_chunk)) - current_chunk = [] - - # Start the new chunk - sentence = line.split(']', 1)[1].strip() - current_chunk = [f"[{sentence}]"] - last_chunk_number = chunk_number - else: - # If the line is not a chunk, add current_chunk to reformatted_lines and reset - if current_chunk: - reformatted_lines.append(' '.join(current_chunk)) - current_chunk = [] - last_chunk_number = 0 - - # Add the non-chunk line to reformatted_lines - reformatted_lines.append(line.strip()) - - # Add any remaining chunks - if current_chunk: - reformatted_lines.append(' '.join(current_chunk)) - - return reformatted_lines - -def reformat_slash_separated_sentences(lines): - reformatted_lines = [] - - for line in lines: - # Check if the line contains a slash "/", indicating a split sentence - if '/' in line: - # Split the sentence at each slash and enclose each segment in brackets - segments = [f"[{segment.strip()}]" for segment in line.split('/')] - reformatted_line = ' '.join(segments) - reformatted_lines.append(reformatted_line) - else: - # For lines without slashes, add them as they are - reformatted_lines.append(line.strip()) - - return reformatted_lines - -# File path -file_path = 'test_goldStandard/student/chunks_gpt_student_one.txt' -# Reformat the file content -lines = read_file(file_path) -reformated_text = reformat_chunk_number(lines) -reformated_text = reformat_slash_separated_sentences(reformated_text) -reformated_text = brackets(reformated_text) -print(reformated_text) -output_path = 'chunks_gpt_student_two_reformated.txt' -with open(output_path, 'w') as output_file: - for sentence in reformated_text: - output_file.write(sentence + '\n') \ No newline at end of file diff --git a/remove_empty_brackets.py b/remove_empty_brackets.py deleted file mode 100644 index 78c7bc33..00000000 --- a/remove_empty_brackets.py +++ /dev/null @@ -1,26 +0,0 @@ -import re - -def remove_empty_brackets(input_string): - pattern = r'\[\s*\]' - cleaned_string = re.sub(pattern, '', input_string) - cleaned_string = re.sub(r'\s{2,}', ' ', cleaned_string).strip() - return cleaned_string - -def reformat_brackets(input_string): - formatted_string = re.sub(r'\[\s*(.*?)\s*\]', r'[ \1 ]', input_string) - return formatted_string - -def process_file(input_file_path, output_file_path): - with open(input_file_path, 'r') as file: - lines = file.readlines() - - # Apply both remove_empty_brackets and reformat_brackets to each line - processed_lines = [reformat_brackets(remove_empty_brackets(line)) + '\n' for line in lines] - - with open(output_file_path, 'w') as file: - file.writelines(processed_lines) - -# Example usage -input_file_path = "output.txt" -output_file_path = "reformated.txt" -process_file(input_file_path, output_file_path) diff --git a/work_with_me.txt b/work_with_me.txt deleted file mode 100644 index e1905595..00000000 --- a/work_with_me.txt +++ /dev/null @@ -1 +0,0 @@ -both bulbs a and c still have a closed path \ No newline at end of file