From 97bf546c75874ed2ef2fdc9e20b3c4fb43e3897f Mon Sep 17 00:00:00 2001
From: "Krzysztof R." <krzysztofrudnicki0@gmail.com>
Date: Sat, 6 Jan 2024 21:33:32 +0100
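Subject: [PATCH] feat: run gpt_chunks over all gold-standard input files

Process the sent1/sent2 inputs for the student, images and headlines sets
in one run, writing each result to its own output file. Also delete
reformat_response.py, remove_empty_brackets.py, chunks_gpt_student_two.txt
and work_with_me.txt.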

---
 chunks_gpt_student_two.txt |  1 -
 gpt_chunks.py              | 25 ++++++++--
 reformat_response.py       | 98 --------------------------------------
 remove_empty_brackets.py   | 26 ----------
 work_with_me.txt           |  1 -
 5 files changed, 22 insertions(+), 129 deletions(-)
 delete mode 100644 chunks_gpt_student_two.txt
 delete mode 100644 reformat_response.py
 delete mode 100644 remove_empty_brackets.py
 delete mode 100644 work_with_me.txt

diff --git a/chunks_gpt_student_two.txt b/chunks_gpt_student_two.txt
deleted file mode 100644
index 77e6127e..00000000
--- a/chunks_gpt_student_two.txt
+++ /dev/null
@@ -1 +0,0 @@
-both bulbs a and c / still have / a closed path
diff --git a/gpt_chunks.py b/gpt_chunks.py
index bb4ede98..b0fec3a1 100644
--- a/gpt_chunks.py
+++ b/gpt_chunks.py
@@ -790,9 +790,28 @@ def chunk_sentences(input_file, output):
 
 
 # Usage
-FILE_PATH = "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt"
-OUTPUT_PATH = "output.txt"
+# Define your input and output file paths
+input_files = [
+    "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt",
+    "test_goldStandard/student/STSint.testinput.answers-students.sent2.txt",
+    "test_goldStandard/images/STSint.testinput.images.sent1.txt",
+    "test_goldStandard/images/STSint.testinput.images.sent2.txt",
+    "test_goldStandard/headlines/STSint.testinput.headlines.sent1.txt",
+    "test_goldStandard/headlines/STSint.testinput.headlines.sent2.txt"
+]
+
+output_files = [
+    "test_goldStandard/student/students-chunks-gpt-one.txt",
+    "test_goldStandard/student/students-chunks-gpt-two.txt",
+    "test_goldStandard/images/images-chunks-gpt-one.txt",
+    "test_goldStandard/images/images-chunks-gpt-two.txt",
+    "test_goldStandard/headlines/headlines-chunks-gpt-one.txt",
+    "test_goldStandard/headlines/headlines-chunks-gpt-two.txt"
+]
+
 # Change me to os.environ['API_KEY']
 client = OpenAI(api_key=os.environ['API_KEY'])
 
-chunk_sentences(FILE_PATH, OUTPUT_PATH)
+for input_path, output_path in zip(input_files, output_files):
+    chunk_sentences(input_path, output_path)
+    print("FINISHED GPT CHUNKS FOR FILE: ", input_path)
diff --git a/reformat_response.py b/reformat_response.py
deleted file mode 100644
index 3d64c594..00000000
--- a/reformat_response.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import re
-
-def read_file(file_path): 
-    lines = []
-    with open(file_path, 'r') as file:
-        lines = file.readlines()
-    return lines
-
-def brackets(lines):
-    reformatted_lines = []
-    for line in lines:
-        # Split the line into segments of bracketed and non-bracketed parts
-        segments = re.split(r'(\[.*?\])', line)
-        reformatted_line = ""
-
-        for segment in segments:
-            # If the segment is already in brackets, keep as is
-            if segment.startswith('[') and segment.endswith(']'):
-                reformatted_line += segment
-            # Else, enclose the segment in brackets if it's not empty or just whitespace
-            elif segment.strip():
-                reformatted_line += f"[{segment.strip()}]"
-
-        reformatted_lines.append(reformatted_line)
-
-    # Join the reformatted lines into a single string
-    return reformatted_lines
-
-def reformat_chunk_number(lines):
-    reformatted_lines = []
-    current_chunk = []
-    last_chunk_number = 0
-
-    for line in lines:
-        # Check if the line starts with a chunk pattern (case-insensitive)
-        if re.match(r'\[chunk \d+\]', line, re.IGNORECASE):
-            # Extract the chunk number
-            chunk_number = int(re.search(r'\d+', line).group())
-
-            # If the chunk number is sequential, add the sentence to the current_chunk
-            if chunk_number == last_chunk_number + 1:
-                sentence = line.split(']', 1)[1].strip()
-                current_chunk.append(f"[{sentence}]")
-                last_chunk_number = chunk_number
-            else:
-                # Append the current_chunk to reformatted_lines and start a new chunk
-                if current_chunk:
-                    reformatted_lines.append(' '.join(current_chunk))
-                    current_chunk = []
-
-                # Start the new chunk
-                sentence = line.split(']', 1)[1].strip()
-                current_chunk = [f"[{sentence}]"]
-                last_chunk_number = chunk_number
-        else:
-            # If the line is not a chunk, add current_chunk to reformatted_lines and reset
-            if current_chunk:
-                reformatted_lines.append(' '.join(current_chunk))
-                current_chunk = []
-                last_chunk_number = 0
-
-            # Add the non-chunk line to reformatted_lines
-            reformatted_lines.append(line.strip())
-
-    # Add any remaining chunks
-    if current_chunk:
-        reformatted_lines.append(' '.join(current_chunk))
-
-    return reformatted_lines
-
-def reformat_slash_separated_sentences(lines):
-    reformatted_lines = []
-
-    for line in lines:
-        # Check if the line contains a slash "/", indicating a split sentence
-        if '/' in line:
-            # Split the sentence at each slash and enclose each segment in brackets
-            segments = [f"[{segment.strip()}]" for segment in line.split('/')]
-            reformatted_line = ' '.join(segments)
-            reformatted_lines.append(reformatted_line)
-        else:
-            # For lines without slashes, add them as they are
-            reformatted_lines.append(line.strip())
-
-    return reformatted_lines
-
-# File path
-file_path = 'test_goldStandard/student/chunks_gpt_student_one.txt'
-# Reformat the file content
-lines = read_file(file_path)
-reformated_text = reformat_chunk_number(lines)
-reformated_text = reformat_slash_separated_sentences(reformated_text)
-reformated_text = brackets(reformated_text)
-print(reformated_text)
-output_path = 'chunks_gpt_student_two_reformated.txt'
-with open(output_path, 'w') as output_file:
-    for sentence in reformated_text:
-        output_file.write(sentence + '\n')
\ No newline at end of file
diff --git a/remove_empty_brackets.py b/remove_empty_brackets.py
deleted file mode 100644
index 78c7bc33..00000000
--- a/remove_empty_brackets.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import re
-
-def remove_empty_brackets(input_string):
-    pattern = r'\[\s*\]'
-    cleaned_string = re.sub(pattern, '', input_string)
-    cleaned_string = re.sub(r'\s{2,}', ' ', cleaned_string).strip()
-    return cleaned_string
-
-def reformat_brackets(input_string):
-    formatted_string = re.sub(r'\[\s*(.*?)\s*\]', r'[ \1 ]', input_string)
-    return formatted_string
-
-def process_file(input_file_path, output_file_path):
-    with open(input_file_path, 'r') as file:
-        lines = file.readlines()
-
-    # Apply both remove_empty_brackets and reformat_brackets to each line
-    processed_lines = [reformat_brackets(remove_empty_brackets(line)) + '\n' for line in lines]
-
-    with open(output_file_path, 'w') as file:
-        file.writelines(processed_lines)
-
-# Example usage
-input_file_path = "output.txt"
-output_file_path = "reformated.txt"
-process_file(input_file_path, output_file_path)
diff --git a/work_with_me.txt b/work_with_me.txt
deleted file mode 100644
index e1905595..00000000
--- a/work_with_me.txt
+++ /dev/null
@@ -1 +0,0 @@
-both bulbs a and c still have a closed path
\ No newline at end of file