WUT_Computer_Science/data_loading.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "{\n",
    " \"cells\": [\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 58,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [],\n",
    "   \"source\": [\n",
    "    \"import pandas as pd\\n\",\n",
    "    \"import numpy as np\\n\",\n",
    "    \"from lxml import etree\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"markdown\",\n",
    "   \"metadata\": {},\n",
    "   \"source\": [\n",
    "    \"# Input format\\n\",\n",
    "    \"\\n\",\n",
    "    \"The input consists of two files:\\n\",\n",
    "    \"\\n\",\n",
    "    \"* a file with the first sentences in each pair\\n\",\n",
    "    \"* a file with the second sentences in each pair\\n\",\n",
    "    \"\\n\",\n",
    "    \"The sentences are tokenized.\\n\",\n",
    "    \"\\n\",\n",
    "    \"Please check STSint.input.*.sent1.txt and STSint.*.input.sent2.txt\\n\",\n",
    "    \"\\n\",\n",
    "    \"Participants can also use the input sentences with gold standard chunks:\\n\",\n",
    "    \"\\n\",\n",
    "    \"* a file with the first sentences in each pair, with '[' and ']' to mark chunks\\n\",\n",
    "    \"* a file with the second sentences in each pair, with '[' and ']' to mark chunks\\n\",\n",
    "    \"\\n\",\n",
    "    \"Please check STSint.input.*.sent1.chunk.txt and STSint.input.*.sent2.chunk.txt\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 26,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [],\n",
    "   \"source\": [\n",
    "    \"# loading unchunked headlines\\n\",\n",
    "    \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.headlines.sent1.txt\\\"\\n\",\n",
    "    \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.headlines.sent2.txt\\\"\\n\",\n",
    "    \"\\n\",\n",
    "    \"headlines_sentance1 = pd.read_csv(unchunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
    "    \"headlines_sentance1.columns = [\\\"headlines_sentance1\\\"]\\n\",\n",
    "    \"\\n\",\n",
    "    \"headlines_sentance2 = pd.read_csv(unchunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
    "    \"headlines_sentance2.columns = [\\\"headlines_sentance2\\\"]\\n\",\n",
    "    \"\\n\",\n",
    "    \"headlines = pd.concat([headlines_sentance1, headlines_sentance2], axis=1)\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 22,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [],\n",
    "   \"source\": [\n",
    "    \"# loading unchunked images\\n\",\n",
    "    \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.images.sent1.txt\\\"\\n\",\n",
    "    \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.images.sent2.txt\\\"\\n\",\n",
    "    \"\\n\",\n",
    "    \"image_sentance1 = pd.read_csv(unchunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
    "    \"image_sentance1.columns = [\\\"image_sentance1\\\"]\\n\",\n",
    "    \"\\n\",\n",
    "    \"image_sentance2 = pd.read_csv(unchunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
    "    \"image_sentance2.columns = [\\\"image_sentance2\\\"]\\n\",\n",
    "    \"\\n\",\n",
    "    \"images = pd.concat([image_sentance1, image_sentance2], axis=1)\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 23,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [],\n",
    "   \"source\": [\n",
    "    \"# loading unchunked students\\n\",\n",
    "    \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.answers-students.sent1.txt\\\"\\n\",\n",
    "    \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.answers-students.sent2.txt\\\"\\n\",\n",
    "    \"\\n\",\n",
    "    \"student_sentance1 = pd.read_csv(unchunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
    "    \"student_sentance1.columns = [\\\"student_sentance1\\\"]\\n\",\n",
    "    \"\\n\",\n",
    "    \"student_sentance2 = pd.read_csv(unchunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
    "    \"student_sentance2.columns = [\\\"student_sentance2\\\"]\\n\",\n",
    "    \"\\n\",\n",
    "    \"students = pd.concat([student_sentance1, student_sentance2], axis=1)\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 31,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [],\n",
    "   \"source\": [\n",
    "    \"def chunk2list(chunks:str) -> list:\\n\",\n",
    "    \"    \\\"\\\"\\\"\\n\",\n",
    "    \"    Takes str that is all chunks from a chunked sentance and returns a list of all the chunks as seperate items \\n\",\n",
    "    \"    \\\"\\\"\\\"\\n\",\n",
    "    \"    chunks = chunks.replace('[', '')\\n\",\n",
    "    \"    chunks = chunks.replace(']', '')\\n\",\n",
    "    \"    chunks = chunks.replace('   ', '|')\\n\",\n",
    "    \"    split = chunks.split('|')\\n\",\n",
    "    \"    return split\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 33,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [],\n",
    "   \"source\": [\n",
    "    \"# loading chunked headlines\\n\",\n",
    "    \"chunked_path_1 = \\\"test_goldstandard/STSint.testinput.headlines.sent1.chunk.txt\\\"\\n\",\n",
    "    \"chunked_path_2 = \\\"test_goldstandard/STSint.testinput.headlines.sent2.chunk.txt\\\"\\n\",\n",
    "    \"\\n\",\n",
    "    \"headlines_chunked_sentance1 = pd.read_csv(chunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
    "    \"headlines_chunked_sentance1.columns = [\\\"headlines_chunked_sentance1\\\"]\\n\",\n",
    "    \"\\n\",\n",
    "    \"headlines_chunked_sentance2 = pd.read_csv(chunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
    "    \"headlines_chunked_sentance2.columns = [\\\"headlines_chunked_sentance2\\\"]\\n\",\n",
    "    \"\\n\",\n",
    "    \"headlines_chunked = pd.concat([headlines_chunked_sentance1, headlines_chunked_sentance2], axis=1)\\n\",\n",
    "    \"\\n\",\n",
    "    \"# convert chunks from str to list\\n\",\n",
    "    \"headlines_chunked['headlines_chunked_sentance1'] = headlines_chunked['headlines_chunked_sentance1'].apply(chunk2list)\\n\",\n",
    "    \"headlines_chunked['headlines_chunked_sentance2'] = headlines_chunked['headlines_chunked_sentance2'].apply(chunk2list)\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 72,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [],\n",
    "   \"source\": [\n",
    "    \"headlines_aligned_path = \\\"test_goldstandard/STSint.testinput.headlines.wa\\\" \\n\",\n",
    "    \"\\n\",\n",
    "    \"with open(headlines_aligned_path, 'r') as file:\\n\",\n",
    "    \"    file_content = file.read()\\n\",\n",
    "    \"\\n\",\n",
    "    \"# <==> and & break xml loaders so it needs to be replaces with something else\\n\",\n",
    "    \"modified_content = file_content.replace('<==>', 'ARROWS_PLACEHOLDER').replace('&', 'AMPERSAND_PLACEHOLDER')\\n\",\n",
    "    \"# it also needs a root wrapped to function properly \\n\",\n",
    "    \"modified_content = f'<root>{modified_content}</root>'\\n\",\n",
    "    \"\\n\",\n",
    "    \"modified_file_path = 'test_goldstandard/STSint.testinput.headlines.fixedarrows.wa'\\n\",\n",
    "    \"with open(modified_file_path, 'w') as modified_file:\\n\",\n",
    "    \"    modified_file.write(modified_content)\\n\",\n",
    "    \"\\n\",\n",
    "    \"# Parse the modified file using ElementTree\\n\",\n",
    "    \"tree = etree.parse(modified_file_path)\\n\",\n",
    "    \"root = tree.getroot()\\n\",\n",
    "    \"\\n\",\n",
    "    \"# function for printing XML\\n\",\n",
    "    \"def prettyprint(element, **kwargs):\\n\",\n",
    "    \"    xml = etree.tostring(element, pretty_print=True, **kwargs)\\n\",\n",
    "    \"    print(xml.decode(), end='')\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 85,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [],\n",
    "   \"source\": [\n",
    "    \"# get ansewrs\\n\",\n",
    "    \"alignments_data = []\\n\",\n",
    "    \"\\n\",\n",
    "    \"for alignment in root.xpath('//alignment'):\\n\",\n",
    "    \"    # Extract relevant information from the alignment element\\n\",\n",
    "    \"    data = {\\n\",\n",
    "    \"        'sentence_id': alignment.xpath('ancestor::sentence/@id')[0],\\n\",\n",
    "    \"        'alignment_text': alignment.text\\n\",\n",
    "    \"    }\\n\",\n",
    "    \"    alignments_data.append(data)\\n\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 88,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [\n",
    "    {\n",
    "     \"name\": \"stdout\",\n",
    "     \"output_type\": \"stream\",\n",
    "     \"text\": [\n",
    "      \"\\n\",\n",
    "      \"6 7 8 ARROWS_PLACEHOLDER 5 6 // EQUI // 5 // for the Philippines ARROWS_PLACEHOLDER to Philippines \\n\",\n",
    "      \"5 ARROWS_PLACEHOLDER 2 // SIMI // 3 // departs ARROWS_PLACEHOLDER sends \\n\",\n",
    "      \"9 ARROWS_PLACEHOLDER 0 // NOALI // NIL // Thursday ARROWS_PLACEHOLDER -not aligned- \\n\",\n",
    "      \"1 ARROWS_PLACEHOLDER 1 // EQUI // 5 // China ARROWS_PLACEHOLDER China \\n\",\n",
    "      \"2 3 4 ARROWS_PLACEHOLDER 3 4 // REL // 4 // 's Peace Ark ARROWS_PLACEHOLDER aid team \\n\",\n",
    "      \"\\n\"\n",
    "     ]\n",
    "    }\n",
    "   ],\n",
    "   \"source\": [\n",
    "    \"# test out the format\\n\",\n",
    "    \"print(alignments_data[0][\\\"alignment_text\\\"])\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 107,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [],\n",
    "   \"source\": [\n",
    "    \"y = pd.DataFrame(alignments_data)\\n\",\n",
    "    \"y = y.drop(columns=[\\\"sentence_id\\\"])\\n\",\n",
    "    \"\\n\",\n",
    "    \"#return to <==> and &\\n\",\n",
    "    \"def return_characteers(cell: str) -> str:\\n\",\n",
    "    \"    cell = cell.replace('ARROWS_PLACEHOLDER', '<==>')\\n\",\n",
    "    \"    cell = cell.replace('AMPERSAND_PLACEHOLDER', '&')\\n\",\n",
    "    \"    return cell\\n\",\n",
    "    \"\\n\",\n",
    "    \"y[\\\"alignment_text\\\"] = y[\\\"alignment_text\\\"].apply(return_characteers)\"\n",
    "   ]\n",
    "  },\n",
    "  {\n",
    "   \"cell_type\": \"code\",\n",
    "   \"execution_count\": 108,\n",
    "   \"metadata\": {},\n",
    "   \"outputs\": [\n",
    "    {\n",
    "     \"name\": \"stderr\",\n",
    "     \"output_type\": \"stream\",\n",
    "     \"text\": [\n",
    "      \"c:\\\\Users\\\\Mati\\\\AppData\\\\Local\\\\Programs\\\\Python\\\\Python312\\\\Lib\\\\site-packages\\\\numpy\\\\core\\\\fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.\\n\",\n",
    "      \"  return bound(*args, **kwds)\\n\"\n",
    "     ]\n",
    "    }\n",
    "   ],\n",
    "   \"source\": [\n",
    "    \"# generate train test split\\n\",\n",
    "    \"x = headlines_chunked\\n\",\n",
    "    \"y = y\\n\",\n",
    "    \"\\n\",\n",
    "    \"data = pd.merge(x, y, left_index=True, right_index=True)\\n\",\n",
    "    \"\\n\",\n",
    "    \"train, validate, test = np.split(data.sample(frac=1, random_state=42), [int(.6*len(data)), int(.8*len(data))])\"\n",
    "   ]\n",
    "  }\n",
    " ],\n",
    " \"metadata\": {\n",
    "  \"kernelspec\": {\n",
    "   \"display_name\": \"Python 3\",\n",
    "   \"language\": \"python\",\n",
    "   \"name\": \"python3\"\n",
    "  },\n",
    "  \"language_info\": {\n",
    "   \"codemirror_mode\": {\n",
    "    \"name\": \"ipython\",\n",
    "    \"version\": 3\n",
    "   },\n",
    "   \"file_extension\": \".py\",\n",
    "   \"mimetype\": \"text/x-python\",\n",
    "   \"name\": \"python\",\n",
    "   \"nbconvert_exporter\": \"python\",\n",
    "   \"pygments_lexer\": \"ipython3\",\n",
    "   \"version\": \"3.12.0\"\n",
    "  }\n",
    " },\n",
    " \"nbformat\": 4,\n",
    " \"nbformat_minor\": 2\n",
    "}\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}