WUT_Computer_Science/data_loading.ipynb
2024-01-06 14:02:40 +01:00

290 lines
12 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"{\n",
" \"cells\": [\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 58,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"import pandas as pd\\n\",\n",
" \"import numpy as np\\n\",\n",
" \"from lxml import etree\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"markdown\",\n",
" \"metadata\": {},\n",
" \"source\": [\n",
" \"# Input format\\n\",\n",
" \"\\n\",\n",
" \"The input consists of two files:\\n\",\n",
" \"\\n\",\n",
" \"* a file with the first sentences in each pair\\n\",\n",
" \"* a file with the second sentences in each pair\\n\",\n",
" \"\\n\",\n",
" \"The sentences are tokenized.\\n\",\n",
" \"\\n\",\n",
" \"Please check STSint.input.*.sent1.txt and STSint.*.input.sent2.txt\\n\",\n",
" \"\\n\",\n",
" \"Participants can also use the input sentences with gold standard chunks:\\n\",\n",
" \"\\n\",\n",
" \"* a file with the first sentences in each pair, with '[' and ']' to mark chunks\\n\",\n",
" \"* a file with the second sentences in each pair, with '[' and ']' to mark chunks\\n\",\n",
" \"\\n\",\n",
" \"Please check STSint.input.*.sent1.chunk.txt and STSint.input.*.sent2.chunk.txt\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 26,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# loading unchunked headlines\\n\",\n",
" \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.headlines.sent1.txt\\\"\\n\",\n",
" \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.headlines.sent2.txt\\\"\\n\",\n",
" \"\\n\",\n",
" \"headlines_sentance1 = pd.read_csv(unchunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
" \"headlines_sentance1.columns = [\\\"headlines_sentance1\\\"]\\n\",\n",
" \"\\n\",\n",
" \"headlines_sentance2 = pd.read_csv(unchunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
" \"headlines_sentance2.columns = [\\\"headlines_sentance2\\\"]\\n\",\n",
" \"\\n\",\n",
" \"headlines = pd.concat([headlines_sentance1, headlines_sentance2], axis=1)\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 22,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# loading unchunked images\\n\",\n",
" \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.images.sent1.txt\\\"\\n\",\n",
" \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.images.sent2.txt\\\"\\n\",\n",
" \"\\n\",\n",
" \"image_sentance1 = pd.read_csv(unchunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
" \"image_sentance1.columns = [\\\"image_sentance1\\\"]\\n\",\n",
" \"\\n\",\n",
" \"image_sentance2 = pd.read_csv(unchunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
" \"image_sentance2.columns = [\\\"image_sentance2\\\"]\\n\",\n",
" \"\\n\",\n",
" \"images = pd.concat([image_sentance1, image_sentance2], axis=1)\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 23,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# loading unchunked students\\n\",\n",
" \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.answers-students.sent1.txt\\\"\\n\",\n",
" \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.answers-students.sent2.txt\\\"\\n\",\n",
" \"\\n\",\n",
" \"student_sentance1 = pd.read_csv(unchunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
" \"student_sentance1.columns = [\\\"student_sentance1\\\"]\\n\",\n",
" \"\\n\",\n",
" \"student_sentance2 = pd.read_csv(unchunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
" \"student_sentance2.columns = [\\\"student_sentance2\\\"]\\n\",\n",
" \"\\n\",\n",
" \"students = pd.concat([student_sentance1, student_sentance2], axis=1)\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 31,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"def chunk2list(chunks:str) -> list:\\n\",\n",
" \" \\\"\\\"\\\"\\n\",\n",
" \" Takes str that is all chunks from a chunked sentance and returns a list of all the chunks as seperate items \\n\",\n",
" \" \\\"\\\"\\\"\\n\",\n",
" \" chunks = chunks.replace('[', '')\\n\",\n",
" \" chunks = chunks.replace(']', '')\\n\",\n",
" \" chunks = chunks.replace(' ', '|')\\n\",\n",
" \" split = chunks.split('|')\\n\",\n",
" \" return split\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 33,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# loading chunked headlines\\n\",\n",
" \"chunked_path_1 = \\\"test_goldstandard/STSint.testinput.headlines.sent1.chunk.txt\\\"\\n\",\n",
" \"chunked_path_2 = \\\"test_goldstandard/STSint.testinput.headlines.sent2.chunk.txt\\\"\\n\",\n",
" \"\\n\",\n",
" \"headlines_chunked_sentance1 = pd.read_csv(chunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
" \"headlines_chunked_sentance1.columns = [\\\"headlines_chunked_sentance1\\\"]\\n\",\n",
" \"\\n\",\n",
" \"headlines_chunked_sentance2 = pd.read_csv(chunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
" \"headlines_chunked_sentance2.columns = [\\\"headlines_chunked_sentance2\\\"]\\n\",\n",
" \"\\n\",\n",
" \"headlines_chunked = pd.concat([headlines_chunked_sentance1, headlines_chunked_sentance2], axis=1)\\n\",\n",
" \"\\n\",\n",
" \"# convert chunks from str to list\\n\",\n",
" \"headlines_chunked['headlines_chunked_sentance1'] = headlines_chunked['headlines_chunked_sentance1'].apply(chunk2list)\\n\",\n",
" \"headlines_chunked['headlines_chunked_sentance2'] = headlines_chunked['headlines_chunked_sentance2'].apply(chunk2list)\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 72,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"headlines_aligned_path = \\\"test_goldstandard/STSint.testinput.headlines.wa\\\" \\n\",\n",
" \"\\n\",\n",
" \"with open(headlines_aligned_path, 'r') as file:\\n\",\n",
" \" file_content = file.read()\\n\",\n",
" \"\\n\",\n",
" \"# <==> and & break xml loaders so it needs to be replaces with something else\\n\",\n",
" \"modified_content = file_content.replace('<==>', 'ARROWS_PLACEHOLDER').replace('&', 'AMPERSAND_PLACEHOLDER')\\n\",\n",
" \"# it also needs a root wrapped to function properly \\n\",\n",
" \"modified_content = f'<root>{modified_content}</root>'\\n\",\n",
" \"\\n\",\n",
" \"modified_file_path = 'test_goldstandard/STSint.testinput.headlines.fixedarrows.wa'\\n\",\n",
" \"with open(modified_file_path, 'w') as modified_file:\\n\",\n",
" \" modified_file.write(modified_content)\\n\",\n",
" \"\\n\",\n",
" \"# Parse the modified file using ElementTree\\n\",\n",
" \"tree = etree.parse(modified_file_path)\\n\",\n",
" \"root = tree.getroot()\\n\",\n",
" \"\\n\",\n",
" \"# function for printing XML\\n\",\n",
" \"def prettyprint(element, **kwargs):\\n\",\n",
" \" xml = etree.tostring(element, pretty_print=True, **kwargs)\\n\",\n",
" \" print(xml.decode(), end='')\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 85,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# get ansewrs\\n\",\n",
" \"alignments_data = []\\n\",\n",
" \"\\n\",\n",
" \"for alignment in root.xpath('//alignment'):\\n\",\n",
" \" # Extract relevant information from the alignment element\\n\",\n",
" \" data = {\\n\",\n",
" \" 'sentence_id': alignment.xpath('ancestor::sentence/@id')[0],\\n\",\n",
" \" 'alignment_text': alignment.text\\n\",\n",
" \" }\\n\",\n",
" \" alignments_data.append(data)\\n\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 88,\n",
" \"metadata\": {},\n",
" \"outputs\": [\n",
" {\n",
" \"name\": \"stdout\",\n",
" \"output_type\": \"stream\",\n",
" \"text\": [\n",
" \"\\n\",\n",
" \"6 7 8 ARROWS_PLACEHOLDER 5 6 // EQUI // 5 // for the Philippines ARROWS_PLACEHOLDER to Philippines \\n\",\n",
" \"5 ARROWS_PLACEHOLDER 2 // SIMI // 3 // departs ARROWS_PLACEHOLDER sends \\n\",\n",
" \"9 ARROWS_PLACEHOLDER 0 // NOALI // NIL // Thursday ARROWS_PLACEHOLDER -not aligned- \\n\",\n",
" \"1 ARROWS_PLACEHOLDER 1 // EQUI // 5 // China ARROWS_PLACEHOLDER China \\n\",\n",
" \"2 3 4 ARROWS_PLACEHOLDER 3 4 // REL // 4 // 's Peace Ark ARROWS_PLACEHOLDER aid team \\n\",\n",
" \"\\n\"\n",
" ]\n",
" }\n",
" ],\n",
" \"source\": [\n",
" \"# test out the format\\n\",\n",
" \"print(alignments_data[0][\\\"alignment_text\\\"])\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 107,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"y = pd.DataFrame(alignments_data)\\n\",\n",
" \"y = y.drop(columns=[\\\"sentence_id\\\"])\\n\",\n",
" \"\\n\",\n",
" \"#return to <==> and &\\n\",\n",
" \"def return_characteers(cell: str) -> str:\\n\",\n",
" \" cell = cell.replace('ARROWS_PLACEHOLDER', '<==>')\\n\",\n",
" \" cell = cell.replace('AMPERSAND_PLACEHOLDER', '&')\\n\",\n",
" \" return cell\\n\",\n",
" \"\\n\",\n",
" \"y[\\\"alignment_text\\\"] = y[\\\"alignment_text\\\"].apply(return_characteers)\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 108,\n",
" \"metadata\": {},\n",
" \"outputs\": [\n",
" {\n",
" \"name\": \"stderr\",\n",
" \"output_type\": \"stream\",\n",
" \"text\": [\n",
" \"c:\\\\Users\\\\Mati\\\\AppData\\\\Local\\\\Programs\\\\Python\\\\Python312\\\\Lib\\\\site-packages\\\\numpy\\\\core\\\\fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.\\n\",\n",
" \" return bound(*args, **kwds)\\n\"\n",
" ]\n",
" }\n",
" ],\n",
" \"source\": [\n",
" \"# generate train test split\\n\",\n",
" \"x = headlines_chunked\\n\",\n",
" \"y = y\\n\",\n",
" \"\\n\",\n",
" \"data = pd.merge(x, y, left_index=True, right_index=True)\\n\",\n",
" \"\\n\",\n",
" \"train, validate, test = np.split(data.sample(frac=1, random_state=42), [int(.6*len(data)), int(.8*len(data))])\"\n",
" ]\n",
" }\n",
" ],\n",
" \"metadata\": {\n",
" \"kernelspec\": {\n",
" \"display_name\": \"Python 3\",\n",
" \"language\": \"python\",\n",
" \"name\": \"python3\"\n",
" },\n",
" \"language_info\": {\n",
" \"codemirror_mode\": {\n",
" \"name\": \"ipython\",\n",
" \"version\": 3\n",
" },\n",
" \"file_extension\": \".py\",\n",
" \"mimetype\": \"text/x-python\",\n",
" \"name\": \"python\",\n",
" \"nbconvert_exporter\": \"python\",\n",
" \"pygments_lexer\": \"ipython3\",\n",
" \"version\": \"3.12.0\"\n",
" }\n",
" },\n",
" \"nbformat\": 4,\n",
" \"nbformat_minor\": 2\n",
"}\n"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}