feat: report, make main.py conform to pep8

This commit is contained in:
Krzysztof Rudnicki 2023-04-25 00:26:05 +02:00
parent 00a5de7a30
commit dc5d24826d
6 changed files with 441 additions and 33 deletions

301
.gitignore vendored
View File

@ -161,3 +161,304 @@ cython_debug/
lab3/*.jpg
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
.pdf
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
# fixme
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
# easy-todo
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

View File

@ -1,40 +1,69 @@
"""
Program that predicts wine quality based on variant2.csv data
"""
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
# NOTE(review): this listing appears to interleave the removed and the added
# lines of a diff without +/- markers (two CSV loads, two feature/label
# splits, two train/test splits). It is presumably not one coherent script -
# confirm against the real main.py before running it as shown.
# Absolute, machine-specific path; consumed by pd.read_csv(filename) below.
filename = '/home/kuchy/EARIN/lab4/variant2.csv'
# Same file name, but resolved relative to the current working directory.
wine_df = pd.read_csv("variant2.csv")
# The return values of head() and describe() are not used or printed here.
wine_df.head()
wine_df.describe()
wine_df.info()
# Load the dataset
wine_data = pd.read_csv(filename)
# Split into features and labels
X = wine_data.drop("quality", axis=1)
y = wine_data["quality"]
# Reassigns X and y positionally from wine_df: every column except the last
# becomes the features, the last column becomes the label. This overwrites
# the name-based selection made from wine_data just above.
X = wine_df.iloc[:, :-1].values
y = wine_df.iloc[:, -1].values
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Second split with a different seed; it replaces the four arrays produced by
# the split on the previous line.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=0)
# Linear Regression
# lin_reg is fitted here, before the StandardScaler below is applied, so it
# sees the unscaled features.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)
# squared=False asks for the root of the MSE (hence the *_rmse name).
lin_reg_rmse = mean_squared_error(y_test, y_pred_lin, squared=False)
# The scaler is fitted on the training features only and the same transform
# is then applied to the test features. X_train / X_test are rebound, so every
# model fitted after this point works on scaled data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# Logistic Regression
log_reg = LogisticRegression(multi_class='multinomial', solver='newton-cg')
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred_log)
# y_pred holds the linear regressor's test predictions until it is rebound
# to the classifier's predictions further down.
y_pred = regressor.predict(X_test)
# SVM
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
# Test-set MSE of `regressor` (y_pred still refers to its predictions here).
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)
# LogisticRegression with library defaults, fitted on the scaled features.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
# Compare performance
print("Linear Regression RMSE:", lin_reg_rmse)
print("Logistic Regression accuracy:", log_reg_accuracy)
print("SVM accuracy:", svm_accuracy)
# From here on y_pred refers to the classifier's test predictions.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Training-set metrics for the linear regressor.
y_pred_train = regressor.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred_train)
print("Training MSE:", train_mse)
train_r_squared = regressor.score(X_train, y_train)
print("Training R^2:", train_r_squared)
test_r_squared = regressor.score(X_test, y_test)
print("Testing R^2:", test_r_squared)
# y_pred_train is rebound to the classifier's training predictions; the
# accuracy and F1 values below therefore describe the classifier.
y_pred_train = classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_accuracy)
train_f1_score = f1_score(y_train, y_pred_train, average="weighted")
print("Training F1 Score:", train_f1_score)
# Uses y_pred as rebound above, i.e. the classifier's test predictions.
test_f1_score = f1_score(y_test, y_pred, average="weighted")
print("Testing F1 Score:", test_f1_score)
# Plot 1: count of rows per value of the "quality" column. Data1 is not used
# again in this listing.
Data1 = sns.countplot(x="quality", data=wine_df)
plt.draw()
# Presumably blocks until a key or mouse button is pressed before the figure
# is closed - confirm how matplotlib treats a timeout of 0.
plt.waitforbuttonpress(0)
plt.close()
# Plot 2: annotated heatmap of the pairwise correlations returned by
# wine_df.corr(). Data2 is not used again in this listing.
Data2 = sns.heatmap(wine_df.corr(), annot=True)
plt.draw()
plt.waitforbuttonpress(0)
plt.close()

Binary file not shown.

View File

@ -1,9 +1,87 @@
\documentclass{article}
\documentclass[12pt]{article}
\usepackage{graphicx} % Required for inserting images
\usepackage{listings}
\usepackage{hyperref}
\usepackage{tabularx}
\usepackage{float}
\usepackage{subfig}
\usepackage[a4paper, total={6in, 8in}]{geometry}
\title{EARIN}
\author{Krzysztof Rudnicki}
\title{EARIN Lab 3 Report}
\author{Krzysztof Rudnicki, 307585 \\ Jakub Kliszko, 303866 }
\date{\today}
\begin{document}
\maketitle
dupa
\section{Exercise Variant 2 - Predicting wine quality}
Our task was to write a program that predicts wine quality based on data containing: \\
fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality
\section{Implementation}
The program can be run by installing Python, changing to the project directory and issuing the command:
\begin{lstlisting}[language=bash]
python main.py
\end{lstlisting}
We have decided to implement the linear and logistic regression methods, as we found them the easiest to implement \\
There will be 3 types of output \\
\begin{enumerate}
\item Number of wines with given quality (Graphical)
\item How a given parameter impacts quality (Graphical)
\item How well linear and logistic regression performed (Textual)
\end{enumerate}
Upon clicking any button the next plot will be shown
\section{Results}
We have successfully implemented a program to predict wine quality \\
\subsection{Data investigation}
There are 11 features in total and 1599 instances of those features \\
It is clear that there is an imbalance in the quality of wines, with the majority of wines being rated either '5' or '6':
\begin{figure}[H]
\caption{Plot showing imbalance in quality of wine}
\includegraphics[width=\textwidth]{inbalance.png}
\centering
\end{figure}
More importantly we checked correlation of parameters:
\begin{figure}[H]
\caption{Plot showing correlation between parameters; bright squares are positive correlation, dark squares are negative correlation}
\includegraphics[width=\textwidth]{corr.png}
\centering
\end{figure}
Bright squares mean that the parameters have positive correlation to each other \\
Darker squares mean that the parameters have negative correlation to each other \\
\newpage
We are most interested in the correlation of individual parameters with the quality value \\
Alcohol has by far the biggest positive impact on quality, with a correlation value of 0.48 (where a value of 1 means a perfect positive linear relationship between the two parameters); then we have sulphates and citric acid with roughly the same values (0.25 and 0.23 respectively) \\
The strongest negative impact on quality comes from volatile acidity (-0.39)
\subsection{Methods comparison}
For Linear regression we checked values of:
\begin{itemize}
\item Training Mean squared error - the average squared difference between predicted and true values, the lower the better
\item Training $R^2$ - for given data, The higher the better
\item Testing $R^2$ - for new data, The higher the better
\end{itemize}
For Logistic regression we checked values of:
\begin{itemize}
\item Training Accuracy - how many instances we correctly classified, the higher the better
\item Training F1 Score - for given data, The higher the better
\item Testing F1 Score - for new data, The higher the better
\end{itemize}
For Linear regression we received values:
\begin{lstlisting}[language=bash]
Training MSE: 0.4258083784387746
Training R^2: 0.36545196162068627
Testing R^2: 0.3283887639580225
\end{lstlisting}
For Logistic regression we received values:
\begin{lstlisting}[language=bash]
Training Accuracy: 0.596559812353401
Training F1 Score: 0.5806169210603433
Testing F1 Score: 0.6166756344362352
\end{lstlisting}
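We can see that logistic regression outperforms linear regression: its test score, which should be as high as possible, is roughly twice as high as that of linear regression (testing F1 score of about 0.62 versus testing $R^2$ of about 0.33). Note that the F1 score and $R^2$ are different metrics, so this comparison is only indicative.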
\end{document}

BIN
lab4/report/corr.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 143 KiB

BIN
lab4/report/inbalance.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB