mirror of
https://github.com/kuhyx/WUT_Computer_Science.git
synced 2026-07-04 17:43:12 +02:00
818 lines
28 KiB
Python
818 lines
28 KiB
Python
"""
|
|
Orders gpt to chunkify files from task
|
|
"""
|
|
import re
|
|
import os
|
|
from openai import OpenAI
|
|
|
|
|
|
def sentence_loop(sentences):
|
|
"""
|
|
Processes a list of sentences, applying chunking and formatting to each.
|
|
|
|
This function iterates through a list of sentences, processing each one
|
|
using the `process_sentence` function. It collects the results into a list.
|
|
If processing fails for any sentence, the function halts and returns None.
|
|
|
|
Parameters:
|
|
sentences (list of str): A list of sentences to be processed.
|
|
|
|
Returns:
|
|
list of str: A list of processed and formatted sentences, or None if processing
|
|
of any sentence fails.
|
|
"""
|
|
# Process each sentence
|
|
chunked_sentences = []
|
|
for sentence in sentences:
|
|
chunked_sentences = process_sentence(sentence, chunked_sentences)
|
|
if not chunked_sentences:
|
|
return []
|
|
return chunked_sentences
|
|
|
|
|
|
def remove_leading_numbering(input_string):
|
|
"""
|
|
Removes any leading numbering from the string.
|
|
|
|
This function is designed to strip off numbering
|
|
that typically appears at the beginning of a string,
|
|
such as '1. ', '2. ', etc. It's useful for processing strings where such numbering is irrelevant
|
|
or should be omitted for formatting purposes.
|
|
|
|
Parameters:
|
|
string (str): The string from which leading numbering should be removed.
|
|
|
|
Returns:
|
|
str: The string with leading numbering removed.
|
|
"""
|
|
return re.sub(r"^\d+\.\s*", "", input_string)
|
|
|
|
|
|
def process_brackets(input_string):
|
|
"""
|
|
Processes a string to extract and adjust parts enclosed in square brackets.
|
|
|
|
This function identifies and extracts all segments within square brackets from the input string.
|
|
It also adjusts the last part by handling any trailing punctuation,
|
|
using the 'handle_trailing_punctuation' function.
|
|
This is particularly useful for strings
|
|
that contain multiple bracketed segments that need individual processing.
|
|
|
|
Parameters:
|
|
input_string (str): The string to be processed for bracketed segments.
|
|
|
|
Returns:
|
|
list of str: A list containing the extracted and adjusted bracketed segments from the string.
|
|
"""
|
|
parts = re.findall(r"\[[^\]]*\]", input_string)
|
|
if parts:
|
|
parts[-1] = handle_trailing_punctuation(input_string, parts[-1])
|
|
return parts
|
|
|
|
|
|
def handle_trailing_punctuation(input_string, last_part):
|
|
"""
|
|
Handles and appends trailing punctuation to the last part of a bracketed segment.
|
|
|
|
This function checks the given string for any punctuation (like '.', '!', or '?')
|
|
immediately following the last bracketed segment. If found, it appends this punctuation
|
|
to the end of the last part inside the brackets. This is used to maintain sentence
|
|
punctuation integrity after processing bracketed segments.
|
|
|
|
Parameters:
|
|
input_string (str): The original string containing the bracketed segments.
|
|
last_part (str): The last bracketed segment extracted from the string.
|
|
|
|
Returns:
|
|
str: The last part with any trailing punctuation correctly appended.
|
|
"""
|
|
trailing_punctuation = re.search(r"\]\s*([.!?])\s*$", input_string)
|
|
if trailing_punctuation:
|
|
return last_part[:-1] + trailing_punctuation.group(1) + "]"
|
|
return last_part
|
|
|
|
|
|
def process_non_bracketed_parts(input_string):
|
|
"""
|
|
Split the input string by hyphens surrounded by optional whitespace and return
|
|
a list of non-empty parts enclosed in square brackets.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be processed.
|
|
|
|
Returns:
|
|
list of str: A list of non-empty parts from the input string, enclosed in square brackets.
|
|
"""
|
|
parts = re.split(r"\s*-\s*", input_string)
|
|
return [f"[ {part.strip()} ]" for part in parts if part.strip()]
|
|
|
|
|
|
def reformat_list(strings):
|
|
"""
|
|
Reformat a list of strings by processing each string element. For strings containing
|
|
square brackets, it calls 'process_brackets' to handle the content within the brackets,
|
|
and for other strings, it calls 'process_non_bracketed_parts' to handle them. The
|
|
resulting parts are joined into a single string with spaces.
|
|
|
|
Parameters:
|
|
strings (list of str): A list of strings to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string containing the processed parts of input strings.
|
|
"""
|
|
result = []
|
|
for string in strings:
|
|
string = remove_leading_numbering(string)
|
|
if "[" in string and "]" in string:
|
|
result.extend(process_brackets(string))
|
|
else:
|
|
result.extend(process_non_bracketed_parts(string))
|
|
return " ".join(result)
|
|
|
|
|
|
def reformat_square_not_full(input_string):
|
|
"""
|
|
Reformat the input string by splitting it into parts that are either enclosed in
|
|
square brackets or not. For parts not in brackets, it removes trailing periods,
|
|
trims spaces, and encloses them in square brackets. For parts already in brackets,
|
|
it adds spaces inside the brackets. The formatted parts are then joined into a
|
|
single string with spaces.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string containing the processed parts of the input string.
|
|
"""
|
|
# Split the string into parts that are either in brackets or not
|
|
parts = re.split(r"(\[[^\]]*\])", input_string)
|
|
|
|
formatted_parts = []
|
|
for part in parts:
|
|
if part: # Ignore empty strings
|
|
# For parts not in brackets,
|
|
# remove trailing period, trim spaces, and enclose in brackets
|
|
if not part.startswith("["):
|
|
part = re.sub(r"\.\s*$", "", part).strip()
|
|
formatted_parts.append(f"[ {part} ]")
|
|
else:
|
|
# For parts already in brackets, add spaces inside
|
|
formatted_parts.append(re.sub(r"\[([^]]+)\]", r"[ \1 ]", part))
|
|
|
|
return " ".join(formatted_parts)
|
|
|
|
|
|
def reformat_slash(input_string):
|
|
"""
|
|
Reformat the input string by splitting it on slashes, trimming spaces from each
|
|
part, and enclosing each part in square brackets. The formatted parts are then
|
|
joined into a single string with spaces.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string containing the processed parts of the input string.
|
|
"""
|
|
# Split the string on slashes and strip spaces
|
|
parts = [part.strip() for part in input_string.split("/")]
|
|
# Enclose each part in square brackets
|
|
formatted_parts = [f"[ {part} ]" for part in parts]
|
|
return " ".join(formatted_parts)
|
|
|
|
|
|
def reformat_pipe(input_string):
|
|
"""
|
|
Reformat the input string by splitting it on ' | ' (pipe symbol with spaces),
|
|
trimming spaces from each part, and enclosing each part in square brackets. The
|
|
formatted parts are then joined into a single string with spaces.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string containing the processed parts of the input string.
|
|
"""
|
|
# Split the string on ' | ' and strip spaces
|
|
parts = [part.strip() for part in input_string.split("|")]
|
|
# Enclose each part in square brackets
|
|
formatted_parts = [f"[ {part} ]" for part in parts]
|
|
return " ".join(formatted_parts)
|
|
|
|
|
|
def is_pipe_format(input_string):
|
|
"""
|
|
Check if the input string follows the ' | ' (pipe symbol with spaces) pattern.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be checked.
|
|
|
|
Returns:
|
|
bool: True if the input string matches the pattern, otherwise False.
|
|
"""
|
|
# Regular expression to check for ' | ' pattern
|
|
pattern = r"^(?:[^|]+\|)+[^|]+$"
|
|
return bool(re.match(pattern, input_string))
|
|
|
|
|
|
def is_numbered_line_format(input_string):
|
|
"""
|
|
Check if the input string(input_string) follows the pattern of lines starting with
|
|
'number. word/phrase \n'.
|
|
|
|
Parameters:
|
|
input_string (str or list of str): The input string(input_string) to be checked.
|
|
It can be a single string or a list of strings.
|
|
|
|
Returns:
|
|
bool or list of bool:
|
|
If the input is a single string, it returns True
|
|
if it matches the pattern, otherwise False.
|
|
If the input is a list of strings
|
|
it returns a list of booleans indicating if each string matches the pattern.
|
|
"""
|
|
# Regular expression to match lines starting with 'number. word/phrase \n'
|
|
pattern = r"^(?:\d+\.\s+.+\n?)+$"
|
|
|
|
# Check if the input is a list of strings
|
|
if isinstance(input_string, list):
|
|
return [bool(re.match(pattern, string)) for string in string]
|
|
|
|
# If the input is a single string, process it directly
|
|
return bool(re.match(pattern, input_string))
|
|
|
|
|
|
def reformat_ists(input_string):
|
|
"""
|
|
Reformat the input string by finding all occurrences of the (iSTS n) pattern
|
|
(where 'n' is a number), extracting the text inside the parentheses, and
|
|
enclosing each extracted part in square brackets. The formatted parts are
|
|
then joined into a single string with spaces.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string containing the processed parts of the input string.
|
|
"""
|
|
# Use regular expression to find all occurrences of the iSTS pattern and
|
|
# extract the text
|
|
parts = re.findall(r"\(iSTS \d+\)\s*([^)]+)", input_string)
|
|
# Enclose each extracted part in square brackets
|
|
formatted_parts = [f"[ {part.strip()} ]" for part in parts]
|
|
return " ".join(formatted_parts)
|
|
|
|
|
|
def is_ists_format(input_string):
|
|
"""
|
|
Check if the input string follows the pattern of (iSTS n) followed by text, with
|
|
multiple occurrences allowed, where 'n' is a number.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be checked.
|
|
|
|
Returns:
|
|
bool: True if the input string matches the pattern, otherwise False.
|
|
"""
|
|
# Regular expression to check for the iSTS pattern
|
|
pattern = r"^(?:\(iSTS \d+\)\s*[^)]+\s*)+$"
|
|
return bool(re.match(pattern, input_string))
|
|
|
|
|
|
def reformat_with_sections(input_string):
|
|
"""
|
|
Reformat the input string by using regular expressions to split it into parts
|
|
following section labels (e.g., [S1], [S2], etc.), removing empty strings,
|
|
and enclosing each part in square brackets. The formatted parts are then
|
|
joined into a single string with spaces.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string containing the processed parts of the input string.
|
|
"""
|
|
# Use regular expression to extract text following the section labels
|
|
parts = re.split(r"\s*\[\s*S\d+\s*\]\s*", input_string)
|
|
# Remove empty strings and enclose each part in square brackets
|
|
formatted_parts = [f"[ {part.strip()} ]" for part in parts if part.strip()]
|
|
return " ".join(formatted_parts)
|
|
|
|
|
|
def is_section_format(input_string):
|
|
"""
|
|
Check if the input string follows the pattern of section labels (e.g., [S1], [S2], etc.)
|
|
followed by text, with multiple occurrences allowed.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be checked.
|
|
|
|
Returns:
|
|
bool: True if the input string matches the pattern, otherwise False.
|
|
"""
|
|
# Regular expression to check for the section label pattern
|
|
pattern = r"^(?:\s*\[\s*S\d+\s*\]\s*.+)+$"
|
|
return bool(re.match(pattern, input_string))
|
|
|
|
|
|
def is_chunk_format(input_string):
|
|
"""
|
|
Check if the input string follows the pattern of 'Chunk X: text' or '[Chunk X] text'
|
|
patterns, where 'X' is a number, with multiple occurrences allowed.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be checked.
|
|
|
|
Returns:
|
|
bool: True if the input string matches the pattern, otherwise False.
|
|
"""
|
|
# Further optimized regular expression
|
|
pattern = r"^(Chunk \d+:|\[\s*Chunk \d+\s*\]).*?(?:\s+(?:Chunk \d+:|\[\s*Chunk \d+\s*\]).*?)*$"
|
|
return bool(re.match(pattern, input_string))
|
|
|
|
|
|
|
|
def reformat_chunks(input_string):
|
|
"""
|
|
Reformat the input string by finding all occurrences of both
|
|
'Chunk X: text' and '[Chunk X] text'
|
|
patterns, extracting the text within them, and enclosing each extracted part in square brackets.
|
|
The formatted parts are then joined into a single string with spaces.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string containing the processed parts of the input string.
|
|
"""
|
|
# Find all occurrences of both chunk patterns and extract the text
|
|
parts = re.findall(
|
|
r"(?:Chunk \d+:|\[\s*Chunk \d+\s*\])\s*(.*?)(?=\s*(?:Chunk \d+:|\[\s*Chunk \d+\s*\]|$))",
|
|
input_string,
|
|
)
|
|
|
|
# Flatten the list of tuples and remove empty strings
|
|
flattened_parts = [item for sublist in parts for item in sublist if item]
|
|
|
|
# Enclose each extracted part in square brackets
|
|
formatted_parts = [f"[ {part.strip()} ]" for part in flattened_parts]
|
|
return " ".join(formatted_parts)
|
|
|
|
|
|
def reformat_ists_markers(input_string):
|
|
"""
|
|
Reformat the input string by splitting it at '[ iSTS ]' markers, removing empty strings,
|
|
and enclosing each part in square brackets. The formatted parts are then joined into
|
|
a single string with spaces.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string containing the processed parts of the input string.
|
|
"""
|
|
# Split the string at '[ iSTS ]', remove empty strings, and enclose each
|
|
# part in square brackets
|
|
parts = [
|
|
part.strip()
|
|
for part in re.split(r"\[\s*iSTS\s*\]", input_string)
|
|
if part.strip()
|
|
]
|
|
formatted_parts = [f"[ {part} ]" for part in parts]
|
|
return " ".join(formatted_parts)
|
|
|
|
|
|
def is_ists_marker_format(input_string):
|
|
"""
|
|
Check if the input string follows the pattern with '[ iSTS ]' markers followed by text,
|
|
with multiple occurrences allowed.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be checked.
|
|
|
|
Returns:
|
|
bool: True if the input string matches the pattern, otherwise False.
|
|
"""
|
|
# Regular expression to check for the pattern with '[ iSTS ]'
|
|
pattern = r"^(?:\[\s*iSTS\s*\].+)+$"
|
|
return bool(re.match(pattern, input_string))
|
|
|
|
|
|
def reformat_with_dashes(input_string):
|
|
"""
|
|
Reformat the input string by splitting it on ' - ' (dash with spaces),
|
|
removing empty strings, and enclosing each part in square brackets.
|
|
The formatted parts are then joined into a single string with spaces.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string containing the processed parts of the input string.
|
|
"""
|
|
# Split the string on ' - ', remove empty strings, and enclose each part
|
|
# in square brackets
|
|
parts = [part.strip() for part in input_string.split("-") if part.strip()]
|
|
formatted_parts = [f"[ {part} ]" for part in parts]
|
|
return " ".join(formatted_parts)
|
|
|
|
|
|
def is_dash_format(input_string):
|
|
"""
|
|
Check if the input string follows the pattern with ' - ' (dash with spaces)
|
|
separating non-empty parts.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be checked.
|
|
|
|
Returns:
|
|
bool: True if the input string matches the pattern, otherwise False.
|
|
"""
|
|
# Regular expression to check for the pattern with ' - '
|
|
pattern = r"^[^-]+(?:\s*-\s*[^-]+)+$"
|
|
return bool(re.match(pattern, input_string))
|
|
|
|
|
|
def preliminary_reformat(input_string):
|
|
"""
|
|
Remove common prefixes like
|
|
'iSTS chunks:'
|
|
and
|
|
'Here are the iSTS chunks for the given sentence:'
|
|
from the input string.
|
|
This function is intended as a preliminary step before further reformatting.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: The input string with common prefixes stripped.
|
|
"""
|
|
input_string = input_string.strip("iSTS chunks:")
|
|
input_string = input_string.strip(
|
|
"Here are the iSTS chunks for the given sentence:"
|
|
)
|
|
return input_string
|
|
|
|
|
|
def reformat_brackets_and_text(input_string):
|
|
"""
|
|
Reformat the input string by removing trailing dots, splitting it into parts that are either
|
|
enclosed in square brackets or not, removing commas and trimming spaces for non-bracket parts,
|
|
and enclosing non-bracket parts in square brackets. The formatted parts are then joined into
|
|
a single string with spaces.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string containing the processed parts of the input string.
|
|
"""
|
|
# Remove the trailing dot if present
|
|
input_string = re.sub(r"\.$", "", input_string)
|
|
|
|
# Split the string into parts that are either in brackets or not
|
|
parts = re.split(r"(\[[^\]]*\])", input_string)
|
|
|
|
formatted_parts = []
|
|
for part in parts:
|
|
if part: # Ignore empty strings
|
|
# Remove commas and trim spaces for parts not in brackets
|
|
if not part.startswith("["):
|
|
part = re.sub(r",\s*", " ", part).strip()
|
|
# Enclose the non-bracket part in brackets
|
|
formatted_parts.append(f"[ {part} ]")
|
|
else:
|
|
# Directly append the part in brackets
|
|
formatted_parts.append(part)
|
|
|
|
return " ".join(formatted_parts)
|
|
|
|
|
|
def is_brackets_and_text_format(input_string):
|
|
"""
|
|
Check if the input string follows the pattern of parts
|
|
that are either enclosed in square brackets
|
|
or not, with optional commas and spaces between the parts.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be checked.
|
|
|
|
Returns:
|
|
bool: True if the input string matches the pattern, otherwise False.
|
|
"""
|
|
# Regular expression to check the pattern
|
|
pattern = r"^(\[[^\]]*\]|\s*[^[\]]+\s*)(,\s*|\s+|$)+$"
|
|
return bool(re.match(pattern, input_string))
|
|
|
|
|
|
def is_correct_format(input_string):
|
|
"""
|
|
Checks if a given string matches a specific format of sequences enclosed in square brackets.
|
|
|
|
This function uses a regular expression to determine if the input string adheres to a format
|
|
where sequences of text are enclosed within square brackets. Each sequence must be separated
|
|
by spaces, and the entire string must only consist of these bracketed sequences.
|
|
|
|
Parameters:
|
|
input_string (str): The string to be checked against the format.
|
|
|
|
Returns:
|
|
bool: True if the string matches the format, False otherwise.
|
|
"""
|
|
# Regular expression to match sequences of '[ some words ]'
|
|
pattern = r"^(\[\s*[^]]+\s*\]\s*)+$"
|
|
|
|
# Check if the string matches the pattern
|
|
return bool(re.match(pattern, input_string))
|
|
|
|
|
|
def check_format(input_data):
|
|
"""
|
|
Checks if the input data (input_data or list of strings) matches a specific format.
|
|
|
|
This function determines if the input data,
|
|
which can be either a single string or a list of strings,
|
|
adheres to a predefined format where sequences of text are enclosed within square brackets.
|
|
The format check is performed by the 'is_correct_format' function.
|
|
For a list, the check is applied to each string in the list.
|
|
|
|
Parameters:
|
|
input_data (str or list of str):
|
|
The input data to be checked.
|
|
Can be a single string or a list of strings.
|
|
|
|
Returns:
|
|
bool or list of bool: If input_data is a string,
|
|
returns True if it matches the format, False otherwise.
|
|
If input_data is a list, returns a list of booleans corresponding to each string.
|
|
Returns None if input_data is neither a string nor a list.
|
|
"""
|
|
# Check if the input is a string, and if so, process it directly
|
|
if isinstance(input_data, str):
|
|
return is_correct_format(input_data)
|
|
|
|
# If the input is a list, process each element in the list
|
|
if isinstance(input_data, list):
|
|
return [is_correct_format(string) for string in input_data]
|
|
|
|
# Return None or raise an error if the input is neither a string nor a list
|
|
return None
|
|
|
|
def check_and_reformat(input_string, check_func, format_func):
|
|
"""
|
|
Checks what type of string got inputed and formats it accordingly
|
|
"""
|
|
if check_func(input_string):
|
|
print(f"formatted by {format_func.__name__}", input_string)
|
|
return format_func(input_string)
|
|
return None
|
|
|
|
def reformat(input_string):
|
|
"""
|
|
Reformat the input string by applying a series of format checks and corresponding
|
|
reformatting functions. This function attempts to identify the format of the input
|
|
string and apply the appropriate reformatting.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str or bool: The reformatted string if a suitable format is recognized, or False if the format
|
|
is not recognized and cannot be reformatted.
|
|
"""
|
|
input_string = preliminary_reformat(input_string)
|
|
if check_format(input_string):
|
|
return input_string
|
|
|
|
format_checks = [
|
|
(is_numbered_line_format, reformat_list),
|
|
(is_brackets_and_text_format, reformat_brackets_and_text),
|
|
(lambda s: "[" in s and "]" in s, reformat_square_not_full),
|
|
(lambda s: "/" in s, reformat_slash),
|
|
(is_pipe_format, reformat_pipe),
|
|
(is_ists_format, reformat_ists),
|
|
(is_section_format, reformat_with_sections),
|
|
(is_chunk_format, reformat_chunks),
|
|
(is_ists_marker_format, reformat_ists_markers),
|
|
(is_dash_format, reformat_with_dashes),
|
|
]
|
|
|
|
for check_func, format_func in format_checks:
|
|
result = check_and_reformat(input_string, check_func, format_func)
|
|
if result is not None:
|
|
return result
|
|
|
|
print("ERROR! reformat did not recognize the format of string! ", input_string)
|
|
return False
|
|
|
|
|
|
def remove_empty_brackets(input_string):
|
|
"""
|
|
Remove empty brackets (with or without spaces) from the input string and
|
|
eliminate any extra spaces that might be left after removing brackets.
|
|
|
|
Parameters:
|
|
input_string (str): The input string from which empty brackets should be removed.
|
|
|
|
Returns:
|
|
str: The input string with empty brackets removed and extra spaces eliminated.
|
|
"""
|
|
# Regular expression to match empty brackets (with any number of spaces)
|
|
pattern = r"\[\s*\]"
|
|
|
|
# Replace empty brackets with an empty string
|
|
cleaned_string = re.sub(pattern, "", input_string)
|
|
|
|
# Remove any extra spaces that might be left after removing brackets
|
|
cleaned_string = re.sub(r"\s{2,}", " ", cleaned_string).strip()
|
|
|
|
return cleaned_string
|
|
|
|
|
|
def reformat_brackets(input_string):
|
|
"""
|
|
Reformat the input string by enclosing text inside square brackets.
|
|
This function searches for text enclosed in square brackets (with or without spaces)
|
|
and ensures that the text inside the brackets is surrounded by spaces.
|
|
|
|
Parameters:
|
|
input_string (str): The input string to be reformatted.
|
|
|
|
Returns:
|
|
str: A reformatted string with text inside square brackets and spaces around it.
|
|
"""
|
|
formatted_string = re.sub(r"\[\s*(.*?)\s*\]", r"[ \1 ]", input_string)
|
|
return formatted_string
|
|
|
|
|
|
def generate_instruction(sentence):
|
|
"""
|
|
Generate an instruction for dividing a given sentence into iSTS chunks.
|
|
|
|
Parameters:
|
|
sentence (str): The sentence to be divided into chunks.
|
|
|
|
Returns:
|
|
str: A formatted instruction with the provided sentence.
|
|
"""
|
|
return (
|
|
"Please divide the following sentence into iSTS chunks."
|
|
"Try to return chunks as a string in format "
|
|
"[chunk 1] [chunk 2] [chunk 3] and so on... "
|
|
"Here is the sentence: " + sentence
|
|
)
|
|
|
|
|
|
def call_api(instruction):
|
|
"""
|
|
Call the API using the provided instruction to generate a response.
|
|
|
|
Parameters:
|
|
client (OpenAI.Client): An OpenAI API client object.
|
|
instruction (str): The instruction to provide to the API.
|
|
|
|
Returns:
|
|
dict: The response generated by the API.
|
|
"""
|
|
return client.chat.completions.create(
|
|
messages=[
|
|
{
|
|
"role": "user",
|
|
"content": instruction,
|
|
}
|
|
],
|
|
model="gpt-3.5-turbo",
|
|
)
|
|
|
|
|
|
def process_response(response):
|
|
"""
|
|
Process the response received from the API and extract the chunked sentence.
|
|
|
|
Parameters:
|
|
response (dict): The response generated by the API.
|
|
|
|
Returns:
|
|
str: The chunked sentence extracted from the response.
|
|
"""
|
|
chunked_sentence = response.choices[0].message.content.strip()
|
|
return chunked_sentence
|
|
|
|
|
|
def check_and_format_sentence(chunked_sentence):
|
|
"""
|
|
Check and format a chunked sentence by applying a series of checks and reformatting.
|
|
|
|
Parameters:
|
|
chunked_sentence (str): The chunked sentence to be checked and formatted.
|
|
|
|
Returns:
|
|
str or bool: The formatted chunked sentence if it passes all checks,
|
|
or False if there are errors.
|
|
"""
|
|
reformated = reformat(chunked_sentence)
|
|
if not reformated:
|
|
print("ERROR! failed to reformat! ", chunked_sentence, reformated)
|
|
return False
|
|
reformated = reformated.strip(",")
|
|
if not is_correct_format(reformated):
|
|
print("ERROR! wrong format ", chunked_sentence, reformated)
|
|
return False
|
|
return reformated
|
|
|
|
|
|
def finalize_formatting(reformatted):
|
|
"""
|
|
Finalize the formatting of a reformatted string by removing empty brackets and ensuring
|
|
that text inside brackets is surrounded by spaces.
|
|
|
|
Parameters:
|
|
reformatted (str): The reformatted string to be finalized.
|
|
|
|
Returns:
|
|
str: The finalized and formatted string.
|
|
"""
|
|
reformatted = remove_empty_brackets(reformatted)
|
|
reformatted = reformat_brackets(reformatted)
|
|
return reformatted
|
|
|
|
|
|
def process_sentence(sentence, chunked_sentences):
|
|
"""
|
|
Process a sentence by generating iSTS chunks and
|
|
adding them to the provided list of chunked sentences.
|
|
|
|
Parameters:
|
|
sentence (str): The input sentence to be processed.
|
|
chunked_sentences (list): A list of previously chunked sentences to which the processed sentence
|
|
will be appended.
|
|
client (OpenAI.Client): An OpenAI API client object for calling the API.
|
|
|
|
Returns:
|
|
list or bool: The updated list of chunked sentences if successful, or False if there are errors.
|
|
"""
|
|
instruction = generate_instruction(sentence)
|
|
response = call_api(instruction)
|
|
chunked_sentence = process_response(response)
|
|
reformatted = check_and_format_sentence(chunked_sentence)
|
|
if not reformatted:
|
|
return False
|
|
reformatted = finalize_formatting(reformatted)
|
|
chunked_sentences.append(reformatted)
|
|
return chunked_sentences
|
|
|
|
|
|
def chunk_sentences(input_file, output):
|
|
"""
|
|
Read sentences from a file, process each sentence to generate iSTS chunks,
|
|
and write the chunked sentences to an output file.
|
|
|
|
Parameters:
|
|
file_path (str): The path to the input file containing sentences to be processed.
|
|
output_path (str): The path to the output file where chunked sentences will be written.
|
|
|
|
Returns:
|
|
None
|
|
"""
|
|
# Read the sentences from the file
|
|
with open(input_file, "r", encoding="utf-8") as file:
|
|
sentences = file.readlines()
|
|
|
|
# Process each sentence
|
|
chunked_sentences = sentence_loop(sentences)
|
|
print(chunked_sentences)
|
|
|
|
# Write the chunked sentences to a new file
|
|
with open(output, "w", encoding="utf-8") as output_file:
|
|
for sentence in chunked_sentences:
|
|
output_file.write(sentence + "\n")
|
|
|
|
|
|
# Usage
|
|
# Define your input and output file paths
|
|
input_files = [
|
|
"test_goldStandard/student/STSint.testinput.answers-students.sent1.txt",
|
|
"test_goldStandard/student/STSint.testinput.answers-students.sent2.txt",
|
|
"test_goldStandard/images/STSint.testinput.images.sent1.txt",
|
|
"test_goldStandard/images/STSint.testinput.images.sent2.txt",
|
|
"test_goldStandard/headlines/STSint.testinput.headlines.sent1.txt",
|
|
"test_goldStandard/headlines/STSint.testinput.headlines.sent2.txt"
|
|
]
|
|
|
|
output_files = [
|
|
"test_goldStandard/student/students-chunks-gpt-one.txt",
|
|
"test_goldStandard/student/students-chunks-gpt-two.txt",
|
|
"test_goldStandard/images/images-chunks-gpt-one.txt",
|
|
"test_goldStandard/images/images-chunks-gpt-two.txt",
|
|
"test_goldStandard/headlines/headlines-chunks-gpt-one.txt",
|
|
"test_goldStandard/headlines/headlines-chunks-gpt-two.txt"
|
|
]
|
|
|
|
# Change me to os.environ['API_KEY']
|
|
client = OpenAI(api_key=os.environ['API_KEY'])
|
|
|
|
for input_path, output_path in zip(input_files, output_files):
|
|
chunk_sentences(input_path, output_path)
|
|
print("FINISHED GPT CHUNKS FOR FILE: ", input_path)
|