chore: mati commits

This commit is contained in:
Krzysztof R. 2024-01-06 15:03:05 +01:00
parent 6c2d5dee1b
commit cc659976ee
6 changed files with 13049 additions and 27 deletions

0
__init__.py Normal file
View File

View File

@ -24,7 +24,7 @@ def process_sentence(sentence, chunked_sentences):
return chunked_sentences
def chunk_sentences(file_path, output_path, api_key):
def chunk_sentences(file_path, output_path):
# Read the sentences from the file
with open(file_path, 'r') as file:
sentences = file.readlines()
@ -39,9 +39,9 @@ def chunk_sentences(file_path, output_path, api_key):
# Usage
file_path = 'STSint.testinput.answers-students.sent1.txt'
output_path = 'chunks_one.txt'
api_key = os.environ['API_KEY']
client = OpenAI(api_key=os.environ['API_KEY'])
file_path = 'test_goldStandard/headlines/STSint.testinput.headlines.sent2.txt'
output_path = 'chunks_gpt_headlines_two.txt'
# Read the API key from the environment so that no credential is committed
# to the repository; raises KeyError if API_KEY is not set.
client = OpenAI(api_key=os.environ['API_KEY'])
chunk_sentences(file_path, output_path, api_key)
chunk_sentences(file_path, output_path)

22
main.py
View File

@ -1,8 +1,18 @@
import processing.py
import processing
import pandas as pd
# paths to the student answers dataset
studentAnswers1_path = "\test_goldstandard\STSint.testinput.answers-students.sent1.txt"
studentAnswers2_path = "\test_goldstandard\STSint.testinput.answers-students.sent2.txt"
studentAnsewrs_chunked_path1 = "test_goldstandard\STSint.testinput.answers-students.sent1.chunk.txt"
studentAnsewrs_chunked_path2 = "test_goldstandard\STSint.testinput.answers-students.sent2.chunk.txt"
studentsAnsewrs_alignment_path = "test_goldstandard\STSint.testinput.answers-students.wa"
studentAnswers1_path = "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt"
studentAnswers2_path = "test_goldStandard/student/STSint.testinput.answers-students.sent2.txt"
studentAnsewrs_chunked_path1 = "test_goldStandard/student/STSint.testinput.answers-students.sent1.chunk.txt"
studentAnsewrs_chunked_path2 = "test_goldStandard/student/STSint.testinput.answers-students.sent2.chunk.txt"
studentsAnsewrs_alignment_path = "test_goldStandard/student/STSint.testinput.answers-students.wa"
# load data
# Sentence pairs: the first argument is the sent1 file, the second the sent2
# file (previously sent1 was passed twice, so sent2 was never loaded).
studentAnserws = processing.load_sentences(studentAnswers1_path, studentAnswers2_path)
goldstandard_chunked = processing.load_chunked(studentAnsewrs_chunked_path1, studentAnsewrs_chunked_path2)
goldstandard_alignment = processing.load_alignment(studentsAnsewrs_alignment_path)
# Join chunks and alignments on their row index and keep the first 5 rows
# as a quick preview.
data = pd.merge(goldstandard_chunked, goldstandard_alignment, left_index=True, right_index=True).head(5)
print(data)

View File

@ -3,7 +3,7 @@ import numpy as np
from lxml import etree
def load_sentences(senteance1_path: str, sentance2_path: str) -> pd.dataframe:
def load_sentences(senteance1_path: str, sentance2_path: str) -> pd.DataFrame:
"""
Loads the sentences from the given paths and outputs in a 2 columns dataframe
"""
@ -34,7 +34,7 @@ def chunk2list(chunks: str) -> list:
return split
def load_chunked(chunked_path1: str, chunked_path2: str) -> pd.dataframe:
def load_chunked(chunked_path1: str, chunked_path2: str) -> pd.DataFrame:
"""
Loads chunked sentances in [ chunk1 ] [ chunk2 ] format into dataframe with lists of chunks
"""
@ -65,12 +65,12 @@ def return_characteers(cell: str) -> str:
"""
converts the alignment data to restore the <==> and & tokens
"""
cell = cell.replace('ARROWS_PLACEHOLDER', '<==>')
cell = cell.replace('AMPERSAND_PLACEHOLDER', '&')
return cell
cell = cell.replace('ARROWS_PLACEHOLDER', '<==>')
cell = cell.replace('AMPERSAND_PLACEHOLDER', '&')
return cell
def load_alignment(alignment_path: str) -> pd.dataframe:
def load_alignment(alignment_path: str) -> pd.DataFrame:
"""
Loads the alignment file. Parses only the <alignment> tag and puts the data into a dataframe
"""
@ -82,7 +82,7 @@ def load_alignment(alignment_path: str) -> pd.dataframe:
# it also needs to be wrapped in a single root element to parse properly
modified_content = f'<root>{modified_content}</root>'
modified_file_path = 'test_goldstandard/fixedarrows.wa'
modified_file_path = 'temp.wa'
with open(modified_file_path, 'w') as modified_file:
modified_file.write(modified_content)
@ -116,7 +116,7 @@ def test_XML():
# test out the format
print(alignments_data[0]["alignment_text"])
def generate_train_test_split(x: pd.dataframe, y: pd.dataframe) -> train, validate, test: pd.dataframe:
def generate_train_test_split(x: pd.DataFrame, y: pd.DataFrame):
"""
Generates a train, validate, test split of the given dataframes in a 60% 20% 20% ratio
"""
@ -125,5 +125,3 @@ def generate_train_test_split(x: pd.dataframe, y: pd.dataframe) -> train, valida
return train, validate, test

13019
temp.wa Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long