{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"{\n",
" \"cells\": [\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 58,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"import csv\\n\",\n",
" \"\\n\",\n",
" \"import pandas as pd\\n\",\n",
" \"import numpy as np\\n\",\n",
" \"from lxml import etree\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"markdown\",\n",
" \"metadata\": {},\n",
" \"source\": [\n",
" \"# Input format\\n\",\n",
" \"\\n\",\n",
" \"The input consists of two files:\\n\",\n",
" \"\\n\",\n",
" \"* a file with the first sentences in each pair\\n\",\n",
" \"* a file with the second sentences in each pair\\n\",\n",
" \"\\n\",\n",
" \"The sentences are tokenized.\\n\",\n",
" \"\\n\",\n",
" \"Please check STSint.input.*.sent1.txt and STSint.input.*.sent2.txt\\n\",\n",
" \"\\n\",\n",
" \"Participants can also use the input sentences with gold standard chunks:\\n\",\n",
" \"\\n\",\n",
" \"* a file with the first sentences in each pair, with '[' and ']' to mark chunks\\n\",\n",
" \"* a file with the second sentences in each pair, with '[' and ']' to mark chunks\\n\",\n",
" \"\\n\",\n",
" \"Please check STSint.input.*.sent1.chunk.txt and STSint.input.*.sent2.chunk.txt\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": null,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"def read_sentences(path: str, column: str) -> pd.DataFrame:\\n\",\n",
" \"    \\\"\\\"\\\"\\n\",\n",
" \"    Reads a text file with one tokenized sentence per line into a one-column DataFrame.\\n\",\n",
" \"\\n\",\n",
" \"    path:   location of the sentence file\\n\",\n",
" \"    column: name given to the single resulting column\\n\",\n",
" \"\\n\",\n",
" \"    '}' is passed as the delimiter so that read_csv keeps every line in one column; a line\\n\",\n",
" \"    that does contain '}' makes the read or the column assignment fail rather than pass silently.\\n\",\n",
" \"    Quoting is switched off so that a sentence starting with a double quote is kept as plain\\n\",\n",
" \"    text instead of being parsed as a quoted CSV field.\\n\",\n",
" \"    \\\"\\\"\\\"\\n\",\n",
" \"    sentences = pd.read_csv(path, dtype=str, delimiter=\\\"}\\\", header=None, quoting=csv.QUOTE_NONE)\\n\",\n",
" \"    sentences.columns = [column]\\n\",\n",
" \"    return sentences\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 26,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# loading unchunked headlines\\n\",\n",
" \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.headlines.sent1.txt\\\"\\n\",\n",
" \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.headlines.sent2.txt\\\"\\n\",\n",
" \"\\n\",\n",
" \"headlines_sentance1 = read_sentences(unchunked_path_1, \\\"headlines_sentance1\\\")\\n\",\n",
" \"headlines_sentance2 = read_sentences(unchunked_path_2, \\\"headlines_sentance2\\\")\\n\",\n",
" \"\\n\",\n",
" \"headlines = pd.concat([headlines_sentance1, headlines_sentance2], axis=1)\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 22,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# loading unchunked images\\n\",\n",
" \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.images.sent1.txt\\\"\\n\",\n",
" \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.images.sent2.txt\\\"\\n\",\n",
" \"\\n\",\n",
" \"image_sentance1 = read_sentences(unchunked_path_1, \\\"image_sentance1\\\")\\n\",\n",
" \"image_sentance2 = read_sentences(unchunked_path_2, \\\"image_sentance2\\\")\\n\",\n",
" \"\\n\",\n",
" \"images = pd.concat([image_sentance1, image_sentance2], axis=1)\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 23,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# loading unchunked students\\n\",\n",
" \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.answers-students.sent1.txt\\\"\\n\",\n",
" \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.answers-students.sent2.txt\\\"\\n\",\n",
" \"\\n\",\n",
" \"student_sentance1 = read_sentences(unchunked_path_1, \\\"student_sentance1\\\")\\n\",\n",
" \"student_sentance2 = read_sentences(unchunked_path_2, \\\"student_sentance2\\\")\\n\",\n",
" \"\\n\",\n",
" \"students = pd.concat([student_sentance1, student_sentance2], axis=1)\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 31,\n",
" 
\"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"def chunk2list(chunks: str) -> list:\\n\",\n",
" \"    \\\"\\\"\\\"\\n\",\n",
" \"    Takes a str holding all chunks of a chunked sentence and returns the chunks as separate list items.\\n\",\n",
" \"\\n\",\n",
" \"    Every chunk is expected to be wrapped in '[' and ']'. The words of one chunk stay together:\\n\",\n",
" \"        [ China ] [ 's Peace Ark ] [ departs ]   gives   China | 's Peace Ark | departs\\n\",\n",
" \"    The brackets and the padding around each chunk are dropped; an empty str gives an empty list.\\n\",\n",
" \"    \\\"\\\"\\\"\\n\",\n",
" \"    # each chunk is closed by ']', so once '[' is removed the ']' characters act as separators\\n\",\n",
" \"    pieces = chunks.replace('[', '').split(']')\\n\",\n",
" \"    # split() + join trims the padding and collapses repeated spaces; blank pieces are skipped\\n\",\n",
" \"    return [' '.join(piece.split()) for piece in pieces if piece.strip()]\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 33,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# loading chunked headlines\\n\",\n",
" \"chunked_path_1 = \\\"test_goldstandard/STSint.testinput.headlines.sent1.chunk.txt\\\"\\n\",\n",
" \"chunked_path_2 = \\\"test_goldstandard/STSint.testinput.headlines.sent2.chunk.txt\\\"\\n\",\n",
" \"\\n\",\n",
" \"headlines_chunked_sentance1 = pd.read_csv(chunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
" \"headlines_chunked_sentance1.columns = [\\\"headlines_chunked_sentance1\\\"]\\n\",\n",
" \"\\n\",\n",
" \"headlines_chunked_sentance2 = pd.read_csv(chunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
" \"headlines_chunked_sentance2.columns = [\\\"headlines_chunked_sentance2\\\"]\\n\",\n",
" \"\\n\",\n",
" \"headlines_chunked = pd.concat([headlines_chunked_sentance1, headlines_chunked_sentance2], axis=1)\\n\",\n",
" \"\\n\",\n",
" \"# convert chunks from str to list\\n\",\n",
" \"headlines_chunked['headlines_chunked_sentance1'] = headlines_chunked['headlines_chunked_sentance1'].apply(chunk2list)\\n\",\n",
" \"headlines_chunked['headlines_chunked_sentance2'] = headlines_chunked['headlines_chunked_sentance2'].apply(chunk2list)\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 72,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"headlines_aligned_path = 
\\\"test_goldstandard/STSint.testinput.headlines.wa\\\" \\n\",\n",
" \"\\n\",\n",
" \"with open(headlines_aligned_path, 'r') as file:\\n\",\n",
" \"    file_content = file.read()\\n\",\n",
" \"\\n\",\n",
" \"# '<==>' and '&' break the XML parser, so they are replaced with placeholders first\\n\",\n",
" \"modified_content = file_content.replace('<==>', 'ARROWS_PLACEHOLDER').replace('&', 'AMPERSAND_PLACEHOLDER')\\n\",\n",
" \"# the parser also needs one top-level element, so the whole content is wrapped in <root>...</root>\\n\",\n",
" \"modified_content = f'<root>{modified_content}</root>'\\n\",\n",
" \"\\n\",\n",
" \"modified_file_path = 'test_goldstandard/STSint.testinput.headlines.fixedarrows.wa'\\n\",\n",
" \"with open(modified_file_path, 'w') as modified_file:\\n\",\n",
" \"    modified_file.write(modified_content)\\n\",\n",
" \"\\n\",\n",
" \"# Parse the modified file with lxml.etree\\n\",\n",
" \"tree = etree.parse(modified_file_path)\\n\",\n",
" \"root = tree.getroot()\\n\",\n",
" \"\\n\",\n",
" \"# function for printing XML\\n\",\n",
" \"def prettyprint(element, **kwargs):\\n\",\n",
" \"    \\\"\\\"\\\"\\n\",\n",
" \"    Prints element as indented XML; extra keyword arguments are passed on to etree.tostring.\\n\",\n",
" \"    \\\"\\\"\\\"\\n\",\n",
" \"    xml = etree.tostring(element, pretty_print=True, **kwargs)\\n\",\n",
" \"    print(xml.decode(), end='')\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 85,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# get answers: one record per <alignment> element\\n\",\n",
" \"alignments_data = [\\n\",\n",
" \"    {\\n\",\n",
" \"        # id attribute of the <sentence> element that contains this alignment\\n\",\n",
" \"        'sentence_id': alignment.xpath('ancestor::sentence/@id')[0],\\n\",\n",
" \"        'alignment_text': alignment.text\\n\",\n",
" \"    }\\n\",\n",
" \"    for alignment in root.xpath('//alignment')\\n\",\n",
" \"]\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 88,\n",
" \"metadata\": {},\n",
" \"outputs\": [\n",
" {\n",
" \"name\": \"stdout\",\n",
" \"output_type\": \"stream\",\n",
" \"text\": [\n",
" \"\\n\",\n",
" \"6 7 8 ARROWS_PLACEHOLDER 5 6 // EQUI 
// 5 // for the Philippines ARROWS_PLACEHOLDER to Philippines \\n\",\n",
" \"5 ARROWS_PLACEHOLDER 2 // SIMI // 3 // departs ARROWS_PLACEHOLDER sends \\n\",\n",
" \"9 ARROWS_PLACEHOLDER 0 // NOALI // NIL // Thursday ARROWS_PLACEHOLDER -not aligned- \\n\",\n",
" \"1 ARROWS_PLACEHOLDER 1 // EQUI // 5 // China ARROWS_PLACEHOLDER China \\n\",\n",
" \"2 3 4 ARROWS_PLACEHOLDER 3 4 // REL // 4 // 's Peace Ark ARROWS_PLACEHOLDER aid team \\n\",\n",
" \"\\n\"\n",
" ]\n",
" }\n",
" ],\n",
" \"source\": [\n",
" \"# test out the format\\n\",\n",
" \"print(alignments_data[0][\\\"alignment_text\\\"])\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 107,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"# only the alignment text is kept as the target\\n\",\n",
" \"y = pd.DataFrame(alignments_data).drop(columns=[\\\"sentence_id\\\"])\\n\",\n",
" \"\\n\",\n",
" \"def return_characteers(cell: str) -> str:\\n\",\n",
" \"    \\\"\\\"\\\"\\n\",\n",
" \"    Puts '<==>' and '&' back in place of the placeholders used while the XML was parsed.\\n\",\n",
" \"    \\\"\\\"\\\"\\n\",\n",
" \"    restored = cell\\n\",\n",
" \"    for placeholder, original in (('ARROWS_PLACEHOLDER', '<==>'), ('AMPERSAND_PLACEHOLDER', '&')):\\n\",\n",
" \"        restored = restored.replace(placeholder, original)\\n\",\n",
" \"    return restored\\n\",\n",
" \"\\n\",\n",
" \"y[\\\"alignment_text\\\"] = y[\\\"alignment_text\\\"].apply(return_characteers)\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 108,\n",
" \"metadata\": {},\n",
" \"outputs\": [\n",
" {\n",
" \"name\": \"stderr\",\n",
" \"output_type\": \"stream\",\n",
" \"text\": [\n",
" \"c:\\\\Users\\\\Mati\\\\AppData\\\\Local\\\\Programs\\\\Python\\\\Python312\\\\Lib\\\\site-packages\\\\numpy\\\\core\\\\fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. 
Please use 'DataFrame.transpose' instead.\\n\",\n",
" \" return bound(*args, **kwds)\\n\"\n",
" ]\n",
" }\n",
" ],\n",
" \"source\": [\n",
" \"# generate train / validate / test split (60% / 20% / 20%)\\n\",\n",
" \"x = headlines_chunked\\n\",\n",
" \"\\n\",\n",
" \"data = pd.merge(x, y, left_index=True, right_index=True)\\n\",\n",
" \"\\n\",\n",
" \"# shuffle once with a fixed seed, then cut by position; iloc slicing gives the same three\\n\",\n",
" \"# pieces as np.split without going through the deprecated DataFrame.swapaxes\\n\",\n",
" \"shuffled = data.sample(frac=1, random_state=42)\\n\",\n",
" \"train_end = int(.6*len(data))\\n\",\n",
" \"validate_end = int(.8*len(data))\\n\",\n",
" \"\\n\",\n",
" \"train = shuffled.iloc[:train_end]\\n\",\n",
" \"validate = shuffled.iloc[train_end:validate_end]\\n\",\n",
" \"test = shuffled.iloc[validate_end:]\"\n",
" ]\n",
" }\n",
" ],\n",
" \"metadata\": {\n",
" \"kernelspec\": {\n",
" \"display_name\": \"Python 3\",\n",
" \"language\": \"python\",\n",
" \"name\": \"python3\"\n",
" },\n",
" \"language_info\": {\n",
" \"codemirror_mode\": {\n",
" \"name\": \"ipython\",\n",
" \"version\": 3\n",
" },\n",
" \"file_extension\": \".py\",\n",
" \"mimetype\": \"text/x-python\",\n",
" \"name\": \"python\",\n",
" \"nbconvert_exporter\": \"python\",\n",
" \"pygments_lexer\": \"ipython3\",\n",
" \"version\": \"3.12.0\"\n",
" }\n",
" },\n",
" \"nbformat\": 4,\n",
" \"nbformat_minor\": 2\n",
"}\n" ] } ], "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 2 }