chore: mati commits

2026-07-04 21:03:07 +02:00 · 2024-01-06 15:03:05 +01:00 · 2024-01-06 15:03:05 +01:00 · cc659976ee
commit cc659976ee
parent 6c2d5dee1b
6 changed files with 13049 additions and 27 deletions
--- a/init.py
+++ b/init.py
--- a/gpt_chunks.py
+++ b/gpt_chunks.py
@ -24,7 +24,7 @@ def process_sentence(sentence, chunked_sentences):
  return chunked_sentences


-def chunk_sentences(file_path, output_path, api_key):
+def chunk_sentences(file_path, output_path):
  # Read the sentences from the file
  with open(file_path, 'r') as file:
    sentences = file.readlines()
@ -39,9 +39,9 @@ def chunk_sentences(file_path, output_path, api_key):


 # Usage
-file_path = 'STSint.testinput.answers-students.sent1.txt'
-output_path = 'chunks_one.txt'
-api_key = os.environ['API_KEY']
-client = OpenAI(api_key=os.environ['API_KEY'])
+file_path = 'test_goldStandard/headlines/STSint.testinput.headlines.sent2.txt'
+output_path = 'chunks_gpt_headlines_two.txt'
+# TODO: read the key from os.environ['API_KEY'] instead of hard-coding it here
+client = OpenAI(api_key='REDACTED_OPENAI_API_KEY')

-chunk_sentences(file_path, output_path, api_key)
+chunk_sentences(file_path, output_path)
--- a/main.py
+++ b/main.py
@ -1,8 +1,18 @@
-import processing.py
+import processing
+import pandas as pd

 # paths to the student answers database
-studentAnswers1_path = "\test_goldstandard\STSint.testinput.answers-students.sent1.txt"
-studentAnswers2_path = "\test_goldstandard\STSint.testinput.answers-students.sent2.txt"
-studentAnsewrs_chunked_path1 = "test_goldstandard\STSint.testinput.answers-students.sent1.chunk.txt"
-studentAnsewrs_chunked_path2 = "test_goldstandard\STSint.testinput.answers-students.sent2.chunk.txt"
-studentsAnsewrs_alignment_path = "test_goldstandard\STSint.testinput.answers-students.wa"
+studentAnswers1_path = "test_goldStandard/student/STSint.testinput.answers-students.sent1.txt"
+studentAnswers2_path = "test_goldStandard/student/STSint.testinput.answers-students.sent2.txt"
+studentAnsewrs_chunked_path1 = "test_goldStandard/student/STSint.testinput.answers-students.sent1.chunk.txt"
+studentAnsewrs_chunked_path2 = "test_goldStandard/student/STSint.testinput.answers-students.sent2.chunk.txt"
+studentsAnsewrs_alignment_path = "test_goldStandard/student/STSint.testinput.answers-students.wa"
+
+# load data
+studentAnserws = processing.load_sentences(studentAnswers1_path, studentAnswers1_path)
+goldstandard_chunked = processing.load_chunked(studentAnsewrs_chunked_path1, studentAnsewrs_chunked_path2)
+goldstandard_alignment = processing.load_alignment(studentsAnsewrs_alignment_path)
+
+
+data = pd.merge(goldstandard_chunked, goldstandard_alignment, left_index=True, right_index=True).head(5)
+print(data)
--- a/processing.py
+++ b/processing.py
@ -3,7 +3,7 @@ import numpy as np
 from lxml import etree


-def load_sentences(senteance1_path: str, sentance2_path: str) -> pd.dataframe:
+def load_sentences(senteance1_path: str, sentance2_path: str) -> pd.DataFrame:
  """
  Loads the sentences from the given paths and outputs them as a two-column dataframe
  """
@ -34,7 +34,7 @@ def chunk2list(chunks: str) -> list:
  return split


-def load_chunked(chunked_path1: str, chunked_path2: str) -> pd.dataframe:
+def load_chunked(chunked_path1: str, chunked_path2: str) -> pd.DataFrame:
  """
  Loads chunked sentences in [ chunk1 ] [ chunk2 ] format into a dataframe with lists of chunks
  """
@ -65,12 +65,12 @@ def return_characteers(cell: str) -> str:
  """
  converts the alignment data to restore the <==> and & tokens
  """
-    cell = cell.replace('ARROWS_PLACEHOLDER', '<==>')
-    cell = cell.replace('AMPERSAND_PLACEHOLDER', '&')
-    return cell
+  cell = cell.replace('ARROWS_PLACEHOLDER', '<==>')
+  cell = cell.replace('AMPERSAND_PLACEHOLDER', '&')
+  return cell


-def load_alignment(alignment_path: str) -> pd.dataframe:
+def load_alignment(alignment_path: str) -> pd.DataFrame:
  """
  Loads the alignment file. Parses only the <alignment> tag and puts the data into a dataframe
  """
@ -82,7 +82,7 @@ def load_alignment(alignment_path: str) -> pd.dataframe:
  # it also needs a root wrapper element to function properly
  modified_content = f'<root>{modified_content}</root>'

-  modified_file_path = 'test_goldstandard/fixedarrows.wa'
+  modified_file_path = 'temp.wa'
  with open(modified_file_path, 'w') as modified_file:
    modified_file.write(modified_content)

@ -116,7 +116,7 @@ def test_XML():
  # test out the format
  print(alignments_data[0]["alignment_text"])

-def generate_train_test_split(x: pd.dataframe, y: pd.dataframe) -> train, validate, test: pd.dataframe:
+def generate_train_test_split(x: pd.DataFrame, y: pd.DataFrame):
  """
  Generates a train, validate, test split of the given dataframes in a 60% 20% 20% ratio
  """
@ -125,5 +125,3 @@ def generate_train_test_split(x: pd.dataframe, y: pd.dataframe) -> train, valida

  return train, validate, test

-
-
--- a/temp.wa
+++ b/temp.wa
--- a/test_goldStandard/emergency.py
+++ b/test_goldStandard/emergency.py