mirror of
https://github.com/kuhyx/WUT_Computer_Science.git
synced 2026-07-04 19:03:01 +02:00
290 lines
12 KiB
Plaintext
290 lines
12 KiB
Plaintext
|
|
{
|
||
|
|
"cells": [
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"{\n",
|
||
|
|
" \"cells\": [\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 58,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"import pandas as pd\\n\",\n",
|
||
|
|
" \"import numpy as np\\n\",\n",
|
||
|
|
" \"from lxml import etree\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"markdown\",\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"# Input format\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"The input consists of two files:\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"* a file with the first sentences in each pair\\n\",\n",
|
||
|
|
" \"* a file with the second sentences in each pair\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"The sentences are tokenized.\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"Please check STSint.input.*.sent1.txt and STSint.input.*.sent2.txt\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"Participants can also use the input sentences with gold standard chunks:\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"* a file with the first sentences in each pair, with '[' and ']' to mark chunks\\n\",\n",
|
||
|
|
" \"* a file with the second sentences in each pair, with '[' and ']' to mark chunks\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"Please check STSint.input.*.sent1.chunk.txt and STSint.input.*.sent2.chunk.txt\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 26,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"# loading unchunked headlines\\n\",\n",
|
||
|
|
" \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.headlines.sent1.txt\\\"\\n\",\n",
|
||
|
|
" \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.headlines.sent2.txt\\\"\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"headlines_sentance1 = pd.read_csv(unchunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
|
||
|
|
" \"headlines_sentance1.columns = [\\\"headlines_sentance1\\\"]\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"headlines_sentance2 = pd.read_csv(unchunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
|
||
|
|
" \"headlines_sentance2.columns = [\\\"headlines_sentance2\\\"]\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"headlines = pd.concat([headlines_sentance1, headlines_sentance2], axis=1)\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 22,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"# loading unchunked images\\n\",\n",
|
||
|
|
" \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.images.sent1.txt\\\"\\n\",\n",
|
||
|
|
" \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.images.sent2.txt\\\"\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"image_sentance1 = pd.read_csv(unchunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
|
||
|
|
" \"image_sentance1.columns = [\\\"image_sentance1\\\"]\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"image_sentance2 = pd.read_csv(unchunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
|
||
|
|
" \"image_sentance2.columns = [\\\"image_sentance2\\\"]\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"images = pd.concat([image_sentance1, image_sentance2], axis=1)\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 23,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"# loading unchunked students\\n\",\n",
|
||
|
|
" \"unchunked_path_1 = \\\"test_goldstandard/STSint.testinput.answers-students.sent1.txt\\\"\\n\",\n",
|
||
|
|
" \"unchunked_path_2 = \\\"test_goldstandard/STSint.testinput.answers-students.sent2.txt\\\"\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"student_sentance1 = pd.read_csv(unchunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
|
||
|
|
" \"student_sentance1.columns = [\\\"student_sentance1\\\"]\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"student_sentance2 = pd.read_csv(unchunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
|
||
|
|
" \"student_sentance2.columns = [\\\"student_sentance2\\\"]\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"students = pd.concat([student_sentance1, student_sentance2], axis=1)\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 31,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"def chunk2list(chunks:str) -> list:\\n\",\n",
|
||
|
|
" \" \\\"\\\"\\\"\\n\",\n",
|
||
|
|
" \" Takes a str containing all chunks from a chunked sentence and returns a list of all the chunks as separate items \\n\",\n",
|
||
|
|
" \" \\\"\\\"\\\"\\n\",\n",
|
||
|
|
" \" chunks = chunks.replace('[', '')\\n\",\n",
|
||
|
|
" \" chunks = chunks.replace(']', '')\\n\",\n",
|
||
|
|
" \" chunks = chunks.replace(' ', '|')\\n\",\n",
|
||
|
|
" \" split = chunks.split('|')\\n\",\n",
|
||
|
|
" \" return split\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 33,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"# loading chunked headlines\\n\",\n",
|
||
|
|
" \"chunked_path_1 = \\\"test_goldstandard/STSint.testinput.headlines.sent1.chunk.txt\\\"\\n\",\n",
|
||
|
|
" \"chunked_path_2 = \\\"test_goldstandard/STSint.testinput.headlines.sent2.chunk.txt\\\"\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"headlines_chunked_sentance1 = pd.read_csv(chunked_path_1, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
|
||
|
|
" \"headlines_chunked_sentance1.columns = [\\\"headlines_chunked_sentance1\\\"]\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"headlines_chunked_sentance2 = pd.read_csv(chunked_path_2, dtype=str, delimiter=\\\"}\\\", header=None)\\n\",\n",
|
||
|
|
" \"headlines_chunked_sentance2.columns = [\\\"headlines_chunked_sentance2\\\"]\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"headlines_chunked = pd.concat([headlines_chunked_sentance1, headlines_chunked_sentance2], axis=1)\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"# convert chunks from str to list\\n\",\n",
|
||
|
|
" \"headlines_chunked['headlines_chunked_sentance1'] = headlines_chunked['headlines_chunked_sentance1'].apply(chunk2list)\\n\",\n",
|
||
|
|
" \"headlines_chunked['headlines_chunked_sentance2'] = headlines_chunked['headlines_chunked_sentance2'].apply(chunk2list)\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 72,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"headlines_aligned_path = \\\"test_goldstandard/STSint.testinput.headlines.wa\\\" \\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"with open(headlines_aligned_path, 'r') as file:\\n\",\n",
|
||
|
|
" \" file_content = file.read()\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"# <==> and & break XML loaders, so they need to be replaced with something else\\n\",\n",
|
||
|
|
" \"modified_content = file_content.replace('<==>', 'ARROWS_PLACEHOLDER').replace('&', 'AMPERSAND_PLACEHOLDER')\\n\",\n",
|
||
|
|
" \"# the content also needs to be wrapped in a root element to parse properly \\n\",\n",
|
||
|
|
" \"modified_content = f'<root>{modified_content}</root>'\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"modified_file_path = 'test_goldstandard/STSint.testinput.headlines.fixedarrows.wa'\\n\",\n",
|
||
|
|
" \"with open(modified_file_path, 'w') as modified_file:\\n\",\n",
|
||
|
|
" \" modified_file.write(modified_content)\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"# Parse the modified file using lxml.etree\\n\",\n",
|
||
|
|
" \"tree = etree.parse(modified_file_path)\\n\",\n",
|
||
|
|
" \"root = tree.getroot()\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"# function for printing XML\\n\",\n",
|
||
|
|
" \"def prettyprint(element, **kwargs):\\n\",\n",
|
||
|
|
" \" xml = etree.tostring(element, pretty_print=True, **kwargs)\\n\",\n",
|
||
|
|
" \" print(xml.decode(), end='')\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 85,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"# get answers\\n\",\n",
|
||
|
|
" \"alignments_data = []\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"for alignment in root.xpath('//alignment'):\\n\",\n",
|
||
|
|
" \" # Extract relevant information from the alignment element\\n\",\n",
|
||
|
|
" \" data = {\\n\",\n",
|
||
|
|
" \" 'sentence_id': alignment.xpath('ancestor::sentence/@id')[0],\\n\",\n",
|
||
|
|
" \" 'alignment_text': alignment.text\\n\",\n",
|
||
|
|
" \" }\\n\",\n",
|
||
|
|
" \" alignments_data.append(data)\\n\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 88,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [\n",
|
||
|
|
" {\n",
|
||
|
|
" \"name\": \"stdout\",\n",
|
||
|
|
" \"output_type\": \"stream\",\n",
|
||
|
|
" \"text\": [\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"6 7 8 ARROWS_PLACEHOLDER 5 6 // EQUI // 5 // for the Philippines ARROWS_PLACEHOLDER to Philippines \\n\",\n",
|
||
|
|
" \"5 ARROWS_PLACEHOLDER 2 // SIMI // 3 // departs ARROWS_PLACEHOLDER sends \\n\",\n",
|
||
|
|
" \"9 ARROWS_PLACEHOLDER 0 // NOALI // NIL // Thursday ARROWS_PLACEHOLDER -not aligned- \\n\",\n",
|
||
|
|
" \"1 ARROWS_PLACEHOLDER 1 // EQUI // 5 // China ARROWS_PLACEHOLDER China \\n\",\n",
|
||
|
|
" \"2 3 4 ARROWS_PLACEHOLDER 3 4 // REL // 4 // 's Peace Ark ARROWS_PLACEHOLDER aid team \\n\",\n",
|
||
|
|
" \"\\n\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" }\n",
|
||
|
|
" ],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"# test out the format\\n\",\n",
|
||
|
|
" \"print(alignments_data[0][\\\"alignment_text\\\"])\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 107,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"y = pd.DataFrame(alignments_data)\\n\",\n",
|
||
|
|
" \"y = y.drop(columns=[\\\"sentence_id\\\"])\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"# restore the original <==> and & characters\\n\",\n",
|
||
|
|
" \"def return_characteers(cell: str) -> str:\\n\",\n",
|
||
|
|
" \" cell = cell.replace('ARROWS_PLACEHOLDER', '<==>')\\n\",\n",
|
||
|
|
" \" cell = cell.replace('AMPERSAND_PLACEHOLDER', '&')\\n\",\n",
|
||
|
|
" \" return cell\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"y[\\\"alignment_text\\\"] = y[\\\"alignment_text\\\"].apply(return_characteers)\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" },\n",
|
||
|
|
" {\n",
|
||
|
|
" \"cell_type\": \"code\",\n",
|
||
|
|
" \"execution_count\": 108,\n",
|
||
|
|
" \"metadata\": {},\n",
|
||
|
|
" \"outputs\": [\n",
|
||
|
|
" {\n",
|
||
|
|
" \"name\": \"stderr\",\n",
|
||
|
|
" \"output_type\": \"stream\",\n",
|
||
|
|
" \"text\": [\n",
|
||
|
|
" \"c:\\\\Users\\\\Mati\\\\AppData\\\\Local\\\\Programs\\\\Python\\\\Python312\\\\Lib\\\\site-packages\\\\numpy\\\\core\\\\fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.\\n\",\n",
|
||
|
|
" \" return bound(*args, **kwds)\\n\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" }\n",
|
||
|
|
" ],\n",
|
||
|
|
" \"source\": [\n",
|
||
|
|
" \"# generate train test split\\n\",\n",
|
||
|
|
" \"x = headlines_chunked\\n\",\n",
|
||
|
|
" \"y = y\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"data = pd.merge(x, y, left_index=True, right_index=True)\\n\",\n",
|
||
|
|
" \"\\n\",\n",
|
||
|
|
" \"train, validate, test = np.split(data.sample(frac=1, random_state=42), [int(.6*len(data)), int(.8*len(data))])\"\n",
|
||
|
|
" ]\n",
|
||
|
|
" }\n",
|
||
|
|
" ],\n",
|
||
|
|
" \"metadata\": {\n",
|
||
|
|
" \"kernelspec\": {\n",
|
||
|
|
" \"display_name\": \"Python 3\",\n",
|
||
|
|
" \"language\": \"python\",\n",
|
||
|
|
" \"name\": \"python3\"\n",
|
||
|
|
" },\n",
|
||
|
|
" \"language_info\": {\n",
|
||
|
|
" \"codemirror_mode\": {\n",
|
||
|
|
" \"name\": \"ipython\",\n",
|
||
|
|
" \"version\": 3\n",
|
||
|
|
" },\n",
|
||
|
|
" \"file_extension\": \".py\",\n",
|
||
|
|
" \"mimetype\": \"text/x-python\",\n",
|
||
|
|
" \"name\": \"python\",\n",
|
||
|
|
" \"nbconvert_exporter\": \"python\",\n",
|
||
|
|
" \"pygments_lexer\": \"ipython3\",\n",
|
||
|
|
" \"version\": \"3.12.0\"\n",
|
||
|
|
" }\n",
|
||
|
|
" },\n",
|
||
|
|
" \"nbformat\": 4,\n",
|
||
|
|
" \"nbformat_minor\": 2\n",
|
||
|
|
"}\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"metadata": {
|
||
|
|
"language_info": {
|
||
|
|
"name": "python"
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"nbformat": 4,
|
||
|
|
"nbformat_minor": 2
|
||
|
|
}
|