mirror of
https://github.com/kuhyx/WUT_Computer_Science.git
synced 2026-07-04 21:03:07 +02:00
chore: mati commits
This commit is contained in:
parent
6c2d5dee1b
commit
cc659976ee
0
__init__.py
Normal file
0
__init__.py
Normal file
@ -24,7 +24,7 @@ def process_sentence(sentence, chunked_sentences):
|
||||
return chunked_sentences
|
||||
|
||||
|
||||
def chunk_sentences(file_path, output_path, api_key):
|
||||
def chunk_sentences(file_path, output_path):
|
||||
# Read the sentences from the file
|
||||
with open(file_path, 'r') as file:
|
||||
sentences = file.readlines()
|
||||
@ -39,9 +39,9 @@ def chunk_sentences(file_path, output_path, api_key):
|
||||
|
||||
|
||||
# Usage
|
||||
file_path = 'STSint.testinput.answers-students.sent1.txt'
|
||||
output_path = 'chunks_one.txt'
|
||||
api_key = os.environ['API_KEY']
|
||||
client = OpenAI(api_key=os.environ['API_KEY'])
|
||||
file_path = 'test_goldStandard/headlines/STSint.testinput.headlines.sent2.txt'
|
||||
output_path = 'chunks_gpt_headlines_two.txt'
|
||||
# TODO: read the key from os.environ['API_KEY'] instead of hard-coding it below
|
||||
client = OpenAI(api_key='REDACTED_OPENAI_API_KEY')
|
||||
|
||||
chunk_sentences(file_path, output_path, api_key)
|
||||
chunk_sentences(file_path, output_path)
|
||||
|
||||
22
main.py
22
main.py
@ -1,8 +1,18 @@
|
||||
import processing.py
|
||||
import processing
|
||||
import pandas as pd
|
||||
|
||||
# paths to the student answers database
|
||||
studentAnswers1_path = "\test_goldstandard\STSint.testinput.answers-students.sent1.txt"
|
||||
studentAnswers2_path = "\test_goldstandard\STSint.testinput.answers-students.sent2.txt"
|
||||
studentAnsewrs_chunked_path1 = "test_goldstandard\STSint.testinput.answers-students.sent1.chunk.txt"
|
||||
studentAnsewrs_chunked_path2 = "test_goldstandard\STSint.testinput.answers-students.sent2.chunk.txt"
|
||||
studentsAnsewrs_alignment_path = "test_goldstandard\STSint.testinput.answers-students.wa"
|
||||
studentAnswers1_path = "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt"
|
||||
studentAnswers2_path = "test_goldStandard/student/STSint.testinput.answers-students.sent2.txt"
|
||||
studentAnsewrs_chunked_path1 = "test_goldStandard/student/STSint.testinput.answers-students.sent1.chunk.txt"
|
||||
studentAnsewrs_chunked_path2 = "test_goldStandard/student/STSint.testinput.answers-students.sent2.chunk.txt"
|
||||
studentsAnsewrs_alignment_path = "test_goldStandard/student/STSint.testinput.answers-students.wa"
|
||||
|
||||
# load data
|
||||
studentAnserws = processing.load_sentences(studentAnswers1_path, studentAnswers1_path)
|
||||
goldstandard_chunked = processing.load_chunked(studentAnsewrs_chunked_path1, studentAnsewrs_chunked_path2)
|
||||
goldstandard_alignment = processing.load_alignment(studentsAnsewrs_alignment_path)
|
||||
|
||||
|
||||
data = pd.merge(goldstandard_chunked, goldstandard_alignment, left_index=True, right_index=True).head(5)
|
||||
print(data)
|
||||
@ -3,7 +3,7 @@ import numpy as np
|
||||
from lxml import etree
|
||||
|
||||
|
||||
def load_sentences(senteance1_path: str, sentance2_path: str) -> pd.dataframe:
|
||||
def load_sentences(senteance1_path: str, sentance2_path: str) -> pd.DataFrame:
|
||||
"""
|
||||
Loads the sentences from the given paths and outputs them in a two-column dataframe
|
||||
"""
|
||||
@ -34,7 +34,7 @@ def chunk2list(chunks: str) -> list:
|
||||
return split
|
||||
|
||||
|
||||
def load_chunked(chunked_path1: str, chunked_path2: str) -> pd.dataframe:
|
||||
def load_chunked(chunked_path1: str, chunked_path2: str) -> pd.DataFrame:
|
||||
"""
|
||||
Loads chunked sentences in [ chunk1 ] [ chunk2 ] format into a dataframe with lists of chunks
|
||||
"""
|
||||
@ -65,12 +65,12 @@ def return_characteers(cell: str) -> str:
|
||||
"""
|
||||
converts the alignment data to restore the <==> and & tokens
|
||||
"""
|
||||
cell = cell.replace('ARROWS_PLACEHOLDER', '<==>')
|
||||
cell = cell.replace('AMPERSAND_PLACEHOLDER', '&')
|
||||
return cell
|
||||
cell = cell.replace('ARROWS_PLACEHOLDER', '<==>')
|
||||
cell = cell.replace('AMPERSAND_PLACEHOLDER', '&')
|
||||
return cell
|
||||
|
||||
|
||||
def load_alignment(alignment_path: str) -> pd.dataframe:
|
||||
def load_alignment(alignment_path: str) -> pd.DataFrame:
|
||||
"""
|
||||
Loads the alignment file. Parses only the <alignment> tag and puts the data into a dataframe
|
||||
"""
|
||||
@ -82,7 +82,7 @@ def load_alignment(alignment_path: str) -> pd.dataframe:
|
||||
# it also needs to be wrapped in a root element to function properly
|
||||
modified_content = f'<root>{modified_content}</root>'
|
||||
|
||||
modified_file_path = 'test_goldstandard/fixedarrows.wa'
|
||||
modified_file_path = 'temp.wa'
|
||||
with open(modified_file_path, 'w') as modified_file:
|
||||
modified_file.write(modified_content)
|
||||
|
||||
@ -116,7 +116,7 @@ def test_XML():
|
||||
# test out the format
|
||||
print(alignments_data[0]["alignment_text"])
|
||||
|
||||
def generate_train_test_split(x: pd.dataframe, y: pd.dataframe) -> train, validate, test: pd.dataframe:
|
||||
def generate_train_test_split(x: pd.DataFrame, y: pd.DataFrame):
|
||||
"""
|
||||
Generates a train, validate, test split of the given dataframes in a 60%/20%/20% ratio
|
||||
"""
|
||||
@ -125,5 +125,3 @@ def generate_train_test_split(x: pd.dataframe, y: pd.dataframe) -> train, valida
|
||||
|
||||
return train, validate, test
|
||||
|
||||
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user