feat: report, make main.py conform to pep8

2026-07-04 19:23:03 +02:00 · 2023-04-25 00:26:05 +02:00 · 2023-04-25 00:26:05 +02:00 · dc5d24826d
commit dc5d24826d
parent 00a5de7a30
6 changed files with 441 additions and 33 deletions
--- a/.gitignore
+++ b/.gitignore
@ -161,3 +161,304 @@ cython_debug/

 lab3/*.jpg

+## Core latex/pdflatex auxiliary files:
+*.aux
+*.lof
+*.log
+*.lot
+*.fls
+*.out
+*.toc
+*.fmt
+*.fot
+*.cb
+*.cb2
+.*.lb
+
+## Intermediate documents:
+*.dvi
+*.xdv
+*-converted-to.*
+# these rules might exclude image files for figures etc.
+# *.ps
+# *.eps
+# *.pdf
+
+## Generated if empty string is given at "Please type another file name for output:"
+.pdf
+
+## Bibliography auxiliary files (bibtex/biblatex/biber):
+*.bbl
+*.bcf
+*.blg
+*-blx.aux
+*-blx.bib
+*.run.xml
+
+## Build tool auxiliary files:
+*.fdb_latexmk
+*.synctex
+*.synctex(busy)
+*.synctex.gz
+*.synctex.gz(busy)
+*.pdfsync
+
+## Build tool directories for auxiliary files
+# latexrun
+latex.out/
+
+## Auxiliary and intermediate files from other packages:
+# algorithms
+*.alg
+*.loa
+
+# achemso
+acs-*.bib
+
+# amsthm
+*.thm
+
+# beamer
+*.nav
+*.pre
+*.snm
+*.vrb
+
+# changes
+*.soc
+
+# comment
+*.cut
+
+# cprotect
+*.cpt
+
+# elsarticle (documentclass of Elsevier journals)
+*.spl
+
+# endnotes
+*.ent
+
+# fixme
+*.lox
+
+# feynmf/feynmp
+*.mf
+*.mp
+*.t[1-9]
+*.t[1-9][0-9]
+*.tfm
+
+#(r)(e)ledmac/(r)(e)ledpar
+*.end
+*.?end
+*.[1-9]
+*.[1-9][0-9]
+*.[1-9][0-9][0-9]
+*.[1-9]R
+*.[1-9][0-9]R
+*.[1-9][0-9][0-9]R
+*.eledsec[1-9]
+*.eledsec[1-9]R
+*.eledsec[1-9][0-9]
+*.eledsec[1-9][0-9]R
+*.eledsec[1-9][0-9][0-9]
+*.eledsec[1-9][0-9][0-9]R
+
+# glossaries
+*.acn
+*.acr
+*.glg
+*.glo
+*.gls
+*.glsdefs
+*.lzo
+*.lzs
+*.slg
+*.slo
+*.sls
+
+# uncomment this for glossaries-extra (will ignore makeindex's style files!)
+# *.ist
+
+# gnuplot
+*.gnuplot
+*.table
+
+# gnuplottex
+*-gnuplottex-*
+
+# gregoriotex
+*.gaux
+*.glog
+*.gtex
+
+# htlatex
+*.4ct
+*.4tc
+*.idv
+*.lg
+*.trc
+*.xref
+
+# hyperref
+*.brf
+
+# knitr
+*-concordance.tex
+# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
+# *.tikz
+*-tikzDictionary
+
+# listings
+*.lol
+
+# luatexja-ruby
+*.ltjruby
+
+# makeidx
+*.idx
+*.ilg
+*.ind
+
+# minitoc
+*.maf
+*.mlf
+*.mlt
+*.mtc[0-9]*
+*.slf[0-9]*
+*.slt[0-9]*
+*.stc[0-9]*
+
+# minted
+_minted*
+*.pyg
+
+# morewrites
+*.mw
+
+# newpax
+*.newpax
+
+# nomencl
+*.nlg
+*.nlo
+*.nls
+
+# pax
+*.pax
+
+# pdfpcnotes
+*.pdfpc
+
+# sagetex
+*.sagetex.sage
+*.sagetex.py
+*.sagetex.scmd
+
+# scrwfile
+*.wrt
+
+# svg
+svg-inkscape/
+
+# sympy
+*.sout
+*.sympy
+sympy-plots-for-*.tex/
+
+# pdfcomment
+*.upa
+*.upb
+
+# pythontex
+*.pytxcode
+pythontex-files-*/
+
+# tcolorbox
+*.listing
+
+# thmtools
+*.loe
+
+# TikZ & PGF
+*.dpth
+*.md5
+*.auxlock
+
+# titletoc
+*.ptc
+
+# todonotes
+*.tdo
+
+# vhistory
+*.hst
+*.ver
+
+# easy-todo
+*.lod
+
+# xcolor
+*.xcp
+
+# xmpincl
+*.xmpi
+
+# xindy
+*.xdy
+
+# xypic precompiled matrices and outlines
+*.xyc
+*.xyd
+
+# endfloat
+*.ttt
+*.fff
+
+# Latexian
+TSWLatexianTemp*
+
+## Editors:
+# WinEdt
+*.bak
+*.sav
+
+# Texpad
+.texpadtmp
+
+# LyX
+*.lyx~
+
+# Kile
+*.backup
+
+# gummi
+.*.swp
+
+# KBibTeX
+*~[0-9]*
+
+# TeXnicCenter
+*.tps
+
+# auto folder when using emacs and auctex
+./auto/*
+*.el
+
+# expex forward references with \gathertags
+*-tags.tex
+
+# standalone packages
+*.sta
+
+# Makeindex log files
+*.lpz
+
+# xwatermark package
+*.xwm
+
+# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
+# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
+# Uncomment the next line to have this generated file ignored.
+#*Notes.bib
--- a/lab4/main.py
+++ b/lab4/main.py
@ -1,40 +1,69 @@
+"""
+Program that predicts wine quality based on variant2.csv data
+"""
 import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression, LogisticRegression
-from sklearn.svm import SVC
-from sklearn.metrics import accuracy_score, mean_squared_error
+from sklearn.metrics import mean_squared_error, accuracy_score, f1_score

-filename = '/home/kuchy/EARIN/lab4/variant2.csv'
+wine_df = pd.read_csv("variant2.csv")
+wine_df.head()
+wine_df.describe()
+wine_df.info()

-# Load the dataset
-wine_data = pd.read_csv(filename)

-# Split into features and labels
-X = wine_data.drop("quality", axis=1)
-y = wine_data["quality"]
+X = wine_df.iloc[:, :-1].values
+y = wine_df.iloc[:, -1].values

-# Split into training and testing sets
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=0)

-# Linear Regression
-lin_reg = LinearRegression()
-lin_reg.fit(X_train, y_train)
-y_pred_lin = lin_reg.predict(X_test)
-lin_reg_rmse = mean_squared_error(y_test, y_pred_lin, squared=False)
+scaler = StandardScaler()
+X_train = scaler.fit_transform(X_train)
+X_test = scaler.transform(X_test)
+regressor = LinearRegression()
+regressor.fit(X_train, y_train)

-# Logistic Regression
-log_reg = LogisticRegression(multi_class='multinomial', solver='newton-cg')
-log_reg.fit(X_train, y_train)
-y_pred_log = log_reg.predict(X_test)
-log_reg_accuracy = accuracy_score(y_test, y_pred_log)
+y_pred = regressor.predict(X_test)

-# SVM
-svm = SVC()
-svm.fit(X_train, y_train)
-y_pred_svm = svm.predict(X_test)
-svm_accuracy = accuracy_score(y_test, y_pred_svm)
+mse = mean_squared_error(y_test, y_pred)
+print("MSE:", mse)
+classifier = LogisticRegression()
+classifier.fit(X_train, y_train)

-# Compare performance
-print("Linear Regression RMSE:", lin_reg_rmse)
-print("Logistic Regression accuracy:", log_reg_accuracy)
-print("SVM accuracy:", svm_accuracy)
+y_pred = classifier.predict(X_test)
+
+accuracy = accuracy_score(y_test, y_pred)
+print("Accuracy:", accuracy)
+y_pred_train = regressor.predict(X_train)
+
+train_mse = mean_squared_error(y_train, y_pred_train)
+print("Training MSE:", train_mse)
+
+train_r_squared = regressor.score(X_train, y_train)
+print("Training R^2:", train_r_squared)
+
+test_r_squared = regressor.score(X_test, y_test)
+print("Testing R^2:", test_r_squared)
+y_pred_train = classifier.predict(X_train)
+
+train_accuracy = accuracy_score(y_train, y_pred_train)
+print("Training Accuracy:", train_accuracy)
+
+train_f1_score = f1_score(y_train, y_pred_train, average="weighted")
+print("Training F1 Score:", train_f1_score)
+
+test_f1_score = f1_score(y_test, y_pred, average="weighted")
+print("Testing F1 Score:", test_f1_score)
+
+Data1 = sns.countplot(x="quality", data=wine_df)
+plt.draw()
+plt.waitforbuttonpress(0)
+plt.close()
+Data2 = sns.heatmap(wine_df.corr(), annot=True)
+plt.draw()
+plt.waitforbuttonpress(0)
+plt.close()
--- a/lab4/report/EARIN_RUDNICKI_KLISZKO_LAB_4.pdf
+++ b/lab4/report/EARIN_RUDNICKI_KLISZKO_LAB_4.pdf
--- a/lab4/report/EARIN_RUDNICKI_KLISZKO_LAB_4.tex
+++ b/lab4/report/EARIN_RUDNICKI_KLISZKO_LAB_4.tex
@ -1,9 +1,87 @@
-\documentclass{article}
+\documentclass[12pt]{article}
+\usepackage{graphicx} % Required for inserting images
+\usepackage{listings}
+\usepackage{hyperref}
+\usepackage{tabularx}
+\usepackage{float}
+\usepackage{subfig}
+\usepackage[a4paper, total={6in, 8in}]{geometry}

-\title{EARIN}
-\author{Krzysztof Rudnicki}
+\title{EARIN Lab 4 Report}
+\author{Krzysztof Rudnicki, 307585 \\ Jakub Kliszko, 303866  }
+\date{\today}

 \begin{document}
+
 \maketitle
-dupa
+\section{Exercise Variant 2 - Predicting wine quality}
+Our task was to write a program that predicts wine quality based on data containing: \\
+fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality
+
+
+\section{Implementation}
+The program can be run by installing Python, moving to the project directory and issuing the command:
+\begin{lstlisting}[language=bash]
+python main.py
+\end{lstlisting}
+We decided to implement linear and logistic regression, as we found them the easiest to implement \\  
+There will be 3 types of output \\ 
+\begin{enumerate}
+    \item Number of wines with given quality (Graphical)
+    \item How a given parameter impacts quality (Graphical)
+    \item How well linear and logistic regression performed (Textual)
+\end{enumerate}
+Upon clicking any button the next plot will be shown
+\section{Results}
+We have successfully implemented a program to predict wine quality \\ 
+\subsection{Data investigation}
+There are 11 features in total and 1599 instances of those features \\ 
+It is clear that there is an imbalance in the quality of the wines, with the majority being rated either '5' or '6':
+\begin{figure}[H]
+    \caption{Plot showing imbalance in quality of wine}
+    \includegraphics[width=\textwidth]{inbalance.png}
+    \centering
+    \end{figure}
+More importantly we checked correlation of parameters:
+\begin{figure}[H]
+    \caption{Plot showing correlation between parameters, bright squares are positive correlation, dark squares are negative correlation}
+    \includegraphics[width=\textwidth]{corr.png}
+    \centering
+    \end{figure}
+Bright squares mean that the parameters have positive correlation to each other \\ 
+Darker squares mean that the parameters have negative correlation to each other \\ 
+\newpage
+We are most interested in the correlation of individual parameters with the quality value \\ 
+Alcohol has by far the biggest positive impact on quality, with a correlation value of 0.48 (where a value of 1 would mean a perfect positive linear relationship between the two parameters), followed by sulphates and citric acid with roughly the same values (0.25 and 0.23 respectively) \\ 
+The strongest negative impact on quality comes from volatile acidity (-0.39)
+\subsection{Methods comparison}
+For Linear regression we checked values of:
+\begin{itemize}
+    \item Training Mean squared error - average squared difference between predicted and true values, the lower the better
+    \item Training $R^2$ - for the training data, the higher the better
+    \item Testing $R^2$ - for new data, the higher the better
+\end{itemize}
+
+For Logistic regression we checked values of:
+\begin{itemize}
+    \item Training Accuracy - the fraction of instances we classified correctly, the higher the better
+    \item Training F1 Score - for the training data, the higher the better
+    \item Testing F1 Score - for new data, the higher the better
+\end{itemize}
+
+For Linear regression we received values:
+\begin{lstlisting}[language=bash]
+Training MSE: 0.4258083784387746
+Training R^2: 0.36545196162068627
+Testing R^2: 0.3283887639580225
+\end{lstlisting}
+
+For Logistic regression we received values:
+\begin{lstlisting}[language=bash]
+Training Accuracy: 0.596559812353401
+Training F1 Score: 0.5806169210603433
+Testing F1 Score: 0.6166756344362352
+\end{lstlisting}
+We can see that logistic regression outperforms linear regression: its test score, which should be as high as possible, is roughly twice as good as the one for linear regression (although the F1 score and $R^2$ are different metrics, so the comparison is only approximate)
+
 \end{document}
--- a/lab4/report/corr.png
+++ b/lab4/report/corr.png
--- a/lab4/report/inbalance.png
+++ b/lab4/report/inbalance.png