feat: made gpt_chunks work on all files

2026-07-04 14:43:08 +02:00 · 2024-01-06 21:33:32 +01:00 · 2024-01-06 21:33:32 +01:00 · 97bf546c75
commit 97bf546c75
parent fc355386b9
5 changed files with 22 additions and 129 deletions
--- a/chunks_gpt_student_two.txt
+++ b/chunks_gpt_student_two.txt
@ -1 +0,0 @@
-both bulbs a and c / still have / a closed path
--- a/gpt_chunks.py
+++ b/gpt_chunks.py
@ -790,9 +790,28 @@ def chunk_sentences(input_file, output):


 # Usage
-FILE_PATH = "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt"
-OUTPUT_PATH = "output.txt"
+# Define your input and output file paths
+input_files = [
+    "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt",
+    "test_goldStandard/student/STSint.testinput.answers-students.sent2.txt",
+    "test_goldStandard/images/STSint.testinput.images.sent1.txt",
+    "test_goldStandard/images/STSint.testinput.images.sent2.txt",
+    "test_goldStandard/headlines/STSint.testinput.headlines.sent1.txt",
+    "test_goldStandard/headlines/STSint.testinput.headlines.sent2.txt"
+]
+
+output_files = [
+    "test_goldStandard/student/students-chunks-gpt-one.txt",
+    "test_goldStandard/student/students-chunks-gpt-two.txt",
+    "test_goldStandard/images/images-chunks-gpt-one.txt",
+    "test_goldStandard/images/images-chunks-gpt-two.txt",
+    "test_goldStandard/headlines/headlines-chunks-gpt-one.txt",
+    "test_goldStandard/headlines/headlines-chunks-gpt-two.txt"
+]
+
 # Change me to os.environ['API_KEY']
 client = OpenAI(api_key=os.environ['API_KEY'])

-chunk_sentences(FILE_PATH, OUTPUT_PATH)
+for input_path, output_path in zip(input_files, output_files):
+    chunk_sentences(input_path, output_path)
+    print("FINISHED GPT CHUNKS FOR FILE: ", input_path)
--- a/reformat_response.py
+++ b/reformat_response.py
@ -1,98 +0,0 @@
-import re
-
-def read_file(file_path): 
-    lines = []
-    with open(file_path, 'r') as file:
-        lines = file.readlines()
-    return lines
-
-def brackets(lines):
-    reformatted_lines = []
-    for line in lines:
-        # Split the line into segments of bracketed and non-bracketed parts
-        segments = re.split(r'(\[.*?\])', line)
-        reformatted_line = ""
-
-        for segment in segments:
-            # If the segment is already in brackets, keep as is
-            if segment.startswith('[') and segment.endswith(']'):
-                reformatted_line += segment
-            # Else, enclose the segment in brackets if it's not empty or just whitespace
-            elif segment.strip():
-                reformatted_line += f"[{segment.strip()}]"
-
-        reformatted_lines.append(reformatted_line)
-
-    # Join the reformatted lines into a single string
-    return reformatted_lines
-
-def reformat_chunk_number(lines):
-    reformatted_lines = []
-    current_chunk = []
-    last_chunk_number = 0
-
-    for line in lines:
-        # Check if the line starts with a chunk pattern (case-insensitive)
-        if re.match(r'\[chunk \d+\]', line, re.IGNORECASE):
-            # Extract the chunk number
-            chunk_number = int(re.search(r'\d+', line).group())
-
-            # If the chunk number is sequential, add the sentence to the current_chunk
-            if chunk_number == last_chunk_number + 1:
-                sentence = line.split(']', 1)[1].strip()
-                current_chunk.append(f"[{sentence}]")
-                last_chunk_number = chunk_number
-            else:
-                # Append the current_chunk to reformatted_lines and start a new chunk
-                if current_chunk:
-                    reformatted_lines.append(' '.join(current_chunk))
-                    current_chunk = []
-
-                # Start the new chunk
-                sentence = line.split(']', 1)[1].strip()
-                current_chunk = [f"[{sentence}]"]
-                last_chunk_number = chunk_number
-        else:
-            # If the line is not a chunk, add current_chunk to reformatted_lines and reset
-            if current_chunk:
-                reformatted_lines.append(' '.join(current_chunk))
-                current_chunk = []
-                last_chunk_number = 0
-
-            # Add the non-chunk line to reformatted_lines
-            reformatted_lines.append(line.strip())
-
-    # Add any remaining chunks
-    if current_chunk:
-        reformatted_lines.append(' '.join(current_chunk))
-
-    return reformatted_lines
-
-def reformat_slash_separated_sentences(lines):
-    reformatted_lines = []
-
-    for line in lines:
-        # Check if the line contains a slash "/", indicating a split sentence
-        if '/' in line:
-            # Split the sentence at each slash and enclose each segment in brackets
-            segments = [f"[{segment.strip()}]" for segment in line.split('/')]
-            reformatted_line = ' '.join(segments)
-            reformatted_lines.append(reformatted_line)
-        else:
-            # For lines without slashes, add them as they are
-            reformatted_lines.append(line.strip())
-
-    return reformatted_lines
-
-# File path
-file_path = 'test_goldStandard/student/chunks_gpt_student_one.txt'
-# Reformat the file content
-lines = read_file(file_path)
-reformated_text = reformat_chunk_number(lines)
-reformated_text = reformat_slash_separated_sentences(reformated_text)
-reformated_text = brackets(reformated_text)
-print(reformated_text)
-output_path = 'chunks_gpt_student_two_reformated.txt'
-with open(output_path, 'w') as output_file:
-    for sentence in reformated_text:
-        output_file.write(sentence + '\n')
--- a/remove_empty_brackets.py
+++ b/remove_empty_brackets.py
@ -1,26 +0,0 @@
-import re
-
-def remove_empty_brackets(input_string):
-    pattern = r'\[\s*\]'
-    cleaned_string = re.sub(pattern, '', input_string)
-    cleaned_string = re.sub(r'\s{2,}', ' ', cleaned_string).strip()
-    return cleaned_string
-
-def reformat_brackets(input_string):
-    formatted_string = re.sub(r'\[\s*(.*?)\s*\]', r'[ \1 ]', input_string)
-    return formatted_string
-
-def process_file(input_file_path, output_file_path):
-    with open(input_file_path, 'r') as file:
-        lines = file.readlines()
-
-    # Apply both remove_empty_brackets and reformat_brackets to each line
-    processed_lines = [reformat_brackets(remove_empty_brackets(line)) + '\n' for line in lines]
-
-    with open(output_file_path, 'w') as file:
-        file.writelines(processed_lines)
-
-# Example usage
-input_file_path = "output.txt"
-output_file_path = "reformated.txt"
-process_file(input_file_path, output_file_path)
--- a/work_with_me.txt
+++ b/work_with_me.txt
@ -1 +0,0 @@
-both bulbs a and c still have a closed path
				`@ -1 +0,0 @@`
				`both bulbs a and c / still have / a closed path`