# WUT_Computer_Science/processing.py

import pandas as pd
import numpy as np
from lxml import etree


def load_sentences(senteance1_path: str, sentance2_path: str) -> pd.DataFrame:
  """
  Reads two sentence files and returns them side by side in one dataframe.

  Args:
    senteance1_path: path of the file holding the first sentences
    sentance2_path: path of the file holding the second sentences

  Returns:
    Dataframe with the string columns "headlines_sentance1" and
    "headlines_sentance2", joined on the row position.
  """
  sources = (
      (senteance1_path, "headlines_sentance1"),
      (sentance2_path, "headlines_sentance2"),
  )

  frames = []
  for path, column_name in sources:
    # "}" is presumably a character that never occurs in the data, so every
    # line lands in a single column -- TODO confirm
    frame = pd.read_csv(path, dtype=str, delimiter="}", header=None)
    frame.columns = [column_name]
    frames.append(frame)

  return pd.concat(frames, axis=1)


def chunk2list(chunks: str) -> list:
  """
  Takes a str holding all the chunks of a chunked sentence and returns a list
  with one item per chunk.

  Args:
    chunks: chunked sentence in "[ chunk1 ] [ chunk2 ]" format

  Returns:
    List of chunk strings with the brackets and the surrounding whitespace
    removed. A string without any separator yields a single-item list.
  """
  without_brackets = chunks.replace('[', '').replace(']', '')
  # In the "[ a ] [ b ]" format, dropping the brackets leaves neighbouring
  # chunks separated by three spaces; mark that gap and split on the marker.
  pieces = without_brackets.replace('   ', '|').split('|')
  # The first chunk still carries the space that followed '[' and the last one
  # the space that preceded ']', so strip every piece to make them uniform.
  return [piece.strip() for piece in pieces]


def load_chunked(chunked_path1: str, chunked_path2: str) -> pd.DataFrame:
  """
  Reads two files of chunked sentences in [ chunk1 ] [ chunk2 ] format and
  returns them side by side with every sentence turned into a list of chunks.

  Args:
    chunked_path1: path of the file holding the first chunked sentences
    chunked_path2: path of the file holding the second chunked sentences

  Returns:
    Dataframe with the columns "headlines_chunked_sentance1" and
    "headlines_chunked_sentance2"; each cell is the result of chunk2list.
  """
  sources = (
      (chunked_path1, "headlines_chunked_sentance1"),
      (chunked_path2, "headlines_chunked_sentance2"),
  )

  frames = []
  for path, column_name in sources:
    frame = pd.read_csv(path, dtype=str, delimiter="}", header=None)
    frame.columns = [column_name]
    frames.append(frame)

  headlines_chunked = pd.concat(frames, axis=1)

  # turn every raw chunk string into a list of chunks
  for _, column_name in sources:
    as_lists = headlines_chunked[column_name].apply(chunk2list)
    headlines_chunked[column_name] = as_lists

  return headlines_chunked


def return_characteers(cell: str) -> str:
  """
  Puts the <==> and & tokens back into a piece of alignment data by replacing
  the placeholders that stand in for them.
  """
  substitutions = (
      ('ARROWS_PLACEHOLDER', '<==>'),
      ('AMPERSAND_PLACEHOLDER', '&'),
  )
  restored = cell
  for placeholder, token in substitutions:
    restored = restored.replace(placeholder, token)
  return restored


def load_alignment(alignment_path: str) -> pd.DataFrame:
  """
  Loads the alignment file. Parses only the <alignment> tags and puts their
  text into a dataframe.

  Args:
    alignment_path: path of the alignment file to read

  Returns:
    Dataframe with a single "alignment_text" column holding the text of every
    <alignment> element in document order, with the <==> and & tokens
    restored. The dataframe is empty when the file has no <alignment> element.
  """
  # The parser reads undeclared XML as UTF-8, so decode the file the same way
  # instead of relying on the locale default.
  with open(alignment_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

  # <==> and & break xml loaders so they need to be replaced with something else
  modified_content = file_content.replace('<==>', 'ARROWS_PLACEHOLDER').replace('&', 'AMPERSAND_PLACEHOLDER')
  # it also needs to be wrapped in a single root element to parse properly
  modified_content = f'<root>{modified_content}</root>'

  # Parse straight from memory: no temporary file is left behind in the
  # working directory and parallel runs cannot overwrite each other's data.
  root = etree.fromstring(modified_content)

  # An empty <alignment></alignment> has no text (None); keep it as ''.
  alignment_texts = [alignment.text or '' for alignment in root.xpath('//alignment')]

  # Naming the column up front keeps it present even without any alignment.
  y = pd.DataFrame({"alignment_text": alignment_texts})
  y["alignment_text"] = y["alignment_text"].apply(return_characteers)
  return y


def prettyprint(element, **kwargs):
  """
  Prints the given element serialised by etree.tostring with pretty printing
  switched on.

  Extra keyword arguments are handed through to etree.tostring. No newline is
  appended to the serialised text.
  """
  serialised = etree.tostring(element, pretty_print=True, **kwargs)
  text = serialised.decode()
  print(text, end='')


def test_XML(alignments_data=None):
  """
  Debug helper that prints the "alignment_text" of the first alignment record
  to check the format.

  Args:
    alignments_data: sequence of dicts with an "alignment_text" key. When
      omitted, a module-level `alignments_data` is used if one exists.

  Raises:
    NameError: when no data is passed and no module-level `alignments_data`
      is defined.
  """
  if alignments_data is None:
    # No such module-level name is defined in this file by default, so a bare
    # call can only work if the caller has set that global beforehand.
    alignments_data = globals().get("alignments_data")
  if alignments_data is None:
    raise NameError(
        "alignments_data is not defined; pass the alignment records to test_XML")

  # test out the format
  print(alignments_data[0]["alignment_text"])

def generate_train_test_split(x: pd.DataFrame, y: pd.DataFrame):
  """
  Generates a train, validate, test split of the given dataframes in a
  60% 20% 20% ratio.

  Args:
    x: first dataframe
    y: second dataframe, merged with x on the index

  Returns:
    Tuple (train, validate, test) of dataframes taken from the merged data
    after shuffling the rows with a fixed seed (random_state=42).
  """
  data = pd.merge(x, y, left_index=True, right_index=True)
  shuffled = data.sample(frac=1, random_state=42)

  train_end = int(.6 * len(data))
  validate_end = int(.8 * len(data))

  # Positional slices instead of np.split: splitting a DataFrame with NumPy
  # goes through DataFrame.swapaxes, which pandas has deprecated.
  train = shuffled.iloc[:train_end]
  validate = shuffled.iloc[train_end:validate_end]
  test = shuffled.iloc[validate_end:]

  return train, validate, test