Skip to content
Snippets Groups Projects
Commit f6ee9107 authored by OleBrumm's avatar OleBrumm
Browse files

done

parent 10b18cc6
No related branches found
No related tags found
No related merge requests found
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- IntelliJ/PyCharm module file (.iml), IDE-generated.
     Declares this directory as a Python module backed by the "INF161" SDK. -->
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<!-- Content root is the directory containing this .iml file. -->
<content url="file://$MODULE_DIR$" />
<!-- Module interpreter: the Python SDK named "INF161". -->
<orderEntry type="jdk" jdkName="INF161" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<!-- IDE inspection profile (.idea/inspectionProfiles/Project_Default.xml).
     Disables grammar/spelling/language inspections and suppresses selected
     PEP 8 checks project-wide. -->
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<!-- Grammar and natural-language inspections turned off. -->
<inspection_tool class="GrazieInspection" enabled="false" level="GRAMMAR_ERROR" enabled_by_default="false" />
<inspection_tool class="IdentifierGrammar" enabled="false" level="TYPO" enabled_by_default="false" />
<inspection_tool class="LanguageDetectionInspection" enabled="false" level="WARNING" enabled_by_default="false" />
<!-- PEP 8 style checks on, but E722 (bare "except:") is ignored. -->
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="E722" />
</list>
</option>
</inspection_tool>
<!-- PEP 8 naming checks on, but N806 (non-lowercase local variable) and
     N802 (non-lowercase function name) are ignored. -->
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N806" />
<option value="N802" />
</list>
</option>
</inspection_tool>
<!-- Spell checking disabled entirely (the process* options are moot while
     enabled="false"). -->
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
<option name="processCode" value="true" />
<option name="processLiterals" value="true" />
<option name="processComments" value="true" />
</inspection_tool>
<inspection_tool class="StructuralWrap" enabled="false" level="TYPO" enabled_by_default="false" />
<inspection_tool class="Style" enabled="false" level="TYPO" enabled_by_default="false" />
<inspection_tool class="ValeProblem" enabled="false" level="WARNING" enabled_by_default="false" />
</profile>
</component>
\ No newline at end of file
<!-- Inspection-profile settings (.idea/inspectionProfiles/profiles_settings.xml).
     USE_PROJECT_PROFILE=false means the IDE-wide (application-level) profile
     is used instead of the project profile defined above. -->
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- Project-level IDE settings (.idea/misc.xml): selects the "INF161"
     Python SDK for the project and for the Black formatter integration. -->
<project version="4">
<component name="Black">
<option name="sdkName" value="INF161" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="INF161" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- Module registry (.idea/modules.xml): lists the single module of this
     project, defined in INF161_Lab4.iml at the project root. -->
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/INF161_Lab4.iml" filepath="$PROJECT_DIR$/.idea/INF161_Lab4.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- VCS mappings (.idea/vcs.xml): registers Git roots for the project.
     NOTE(review): directory="" and directory="$PROJECT_DIR$" appear to map
     the same root twice (the empty value is the legacy "project directory"
     form) - presumably redundant; confirm in the IDE before removing one. -->
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Titanic overlevelse ## Titanic overlevelse
Her har vi data om passasjerer på skipet Titanic og informasjon om hvorvidt de overlevde. I denne oppgaven skal du bruke passasjerdata for å teste om det er forskjeller i overlevelsen mellom passasjergrupper og predikere om passasjeren overlevde. Her har vi data om passasjerer på skipet Titanic og informasjon om hvorvidt de overlevde. I denne oppgaven skal du bruke passasjerdata for å teste om det er forskjeller i overlevelsen mellom passasjergrupper og predikere om passasjeren overlevde.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# imports # imports
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from scipy import stats from scipy import stats
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss from sklearn.metrics import accuracy_score, log_loss
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# les inn Titanic-data # les inn Titanic-data
titanic_df = pd.read_csv('data/titanic.csv') titanic_df = pd.read_csv('data/titanic.csv')
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Først endrer vi litt på data for å gjøre oppgaven enklere. Dette er ikke en god idé å gjøre når vi vil lage best mulige modeller. Først endrer vi litt på data for å gjøre oppgaven enklere. Dette er ikke en god idé å gjøre når vi vil lage best mulige modeller.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# del data i mål-, og prediktorvariabler # del data i mål-, og prediktorvariabler
X_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'] X_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
y_col = 'Survived' y_col = 'Survived'
reduced_df = titanic_df.loc[:, X_cols + [y_col]].dropna() reduced_df = titanic_df.loc[:, X_cols + [y_col]].dropna()
X_df = pd.get_dummies(reduced_df.loc[:, X_cols], dtype='int') X_df = pd.get_dummies(reduced_df.loc[:, X_cols], dtype='int')
X = X_df.values X = X_df.values
y = reduced_df[y_col].values y = reduced_df[y_col].values
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Del data i trenings-, validerings-, og testdata med størrelser 70%, 15%, 15% av data. Del data i trenings-, validerings-, og testdata med størrelser 70%, 15%, 15% av data.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# dele data i trenings, validerings og testdata # dele data i trenings, validerings og testdata
# generer X_train, X_val, X_test, y_train, y_val, y_test # generer X_train, X_val, X_test, y_train, y_val, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5) X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
På treningsdata, test om menn og kvinner hadde forskjellige sannsynligheter for å overleve. På treningsdata, test om menn og kvinner hadde forskjellige sannsynligheter for å overleve.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# test om menn og kvinner hadde forskjellig overlevelse # test om menn og kvinner hadde forskjellig overlevelse
df = pd.DataFrame(X_train, columns=X_df.columns) df = pd.DataFrame(X_train, columns=X_df.columns)
df.loc[:, 'Survived'] = y_train df.loc[:, 'Survived'] = y_train
womenalive = df[df['Sex_female'] == 1]['Survived'].sum() womenalive = df[df['Sex_female'] == 1]['Survived'].sum()
womendead = df[df['Sex_female'] == 1]['Survived'].count() - womenalive womendead = df[df['Sex_female'] == 1]['Survived'].count() - womenalive
menalive = df[df['Sex_male'] == 1]['Survived'].sum() menalive = df[df['Sex_male'] == 1]['Survived'].sum()
mendead = df[df['Sex_male'] == 1]['Survived'].count() - menalive mendead = df[df['Sex_male'] == 1]['Survived'].count() - menalive
print((womenalive/womendead)/(menalive/mendead)) print((womenalive/womendead)/(menalive/mendead))
nrs = pd.crosstab(df['Sex_female'], df['Survived']) nrs = pd.crosstab(df['Sex_female'], df['Survived'])
kjonn_test = stats.fisher_exact(nrs) kjonn_test = stats.fisher_exact(nrs)
print(str(kjonn_test)) print(str(kjonn_test))
``` ```
%% Output %% Output
12.35820895522388 11.545454545454547
SignificanceResult(statistic=12.35820895522388, pvalue=1.6667183280926595e-34) SignificanceResult(statistic=11.545454545454545, pvalue=4.443912734227653e-33)
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
På treningsdata, test om de som overlevde hadde forskjellig mean alder enn de som ikke overlevde. På treningsdata, test om de som overlevde hadde forskjellig mean alder enn de som ikke overlevde.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Get the ages and survival labels from training data # Get the ages and survival labels from training data
df = pd.DataFrame(X_train, columns=X_df.columns) df = pd.DataFrame(X_train, columns=X_df.columns)
df.loc[:, 'Survived'] = y_train df.loc[:, 'Survived'] = y_train
# Test if the difference is significant # Test if the difference is significant
alder_test = stats.ttest_ind(titanic_df[titanic_df['Survived']==1]['Age'].dropna(), alder_test = stats.ttest_ind(titanic_df[titanic_df['Survived']==0]['Age'].dropna(),
titanic_df[titanic_df['Survived']==0]['Age'].dropna()) titanic_df[titanic_df['Survived']==1]['Age'].dropna())
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Tren en kNN-modell med k=1, k=10 og k=50 på treningsdata. Tren også en logistisk regresjon, naive Bayes modell, tilfeldig skog og supportvektormaskin på treningsdata. Tren en kNN-modell med k=1, k=10 og k=50 på treningsdata. Tren også en logistisk regresjon, naive Bayes modell, tilfeldig skog og supportvektormaskin på treningsdata.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# tren forskjellige modeller # tren forskjellige modeller
models = {'kNN-1': KNeighborsClassifier(n_neighbors=1), models = {'kNN-1': KNeighborsClassifier(n_neighbors=1),
'kNN-10': KNeighborsClassifier(n_neighbors=10), 'kNN-10': KNeighborsClassifier(n_neighbors=10),
'kNN-50': KNeighborsClassifier(n_neighbors=50), 'kNN-50': KNeighborsClassifier(n_neighbors=50),
'Logistisk regresjon': LogisticRegression(), 'Logistisk regresjon': LogisticRegression(),
'Naive Bayes': MultinomialNB(), 'Naive Bayes': MultinomialNB(),
'Tilfeldig skog': RandomForestClassifier(), 'Tilfeldig skog': RandomForestClassifier(),
'Supportvektormaskin': SVC()} 'Supportvektormaskin': SVC(probability=True)}
for _, model in models.items(): for _, model in models.items():
model.fit(X_train, y_train) model.fit(X_train, y_train)
``` ```
%% Output %% Output
C:\Users\Ole\miniconda3\envs\INF161\lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1): C:\Users\Ole\miniconda3\envs\INF161\lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in: Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options: Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result( n_iter_i = _check_optimize_result(
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Sorter de ulike modellene etter nøyaktighet på valideringsdata (`sklearn.metrics.accuracy_score`). Sorter de ulike modellene etter nøyaktighet på valideringsdata (`sklearn.metrics.accuracy_score`).
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# sjekk nøyaktighet for valideringsdata # sjekk nøyaktighet for valideringsdata
accuracies = {name: accuracy_score(y_val, model.predict(X_val)) for name, model in models.items()} accuracies = {name: accuracy_score(y_val, model.predict(X_val)) for name, model in models.items()}
accuracies_df = pd.DataFrame(accuracies, index=accuracies.keys(), columns=['accuracy']) accuracies_df = pd.DataFrame(accuracies, index=accuracies.keys(), columns=['accuracy'])
accuracies_df.sort_values(by='accuracy', ascending=False, inplace=True) accuracies_df.sort_values(by='accuracy', ascending=False, inplace=True)
print(accuracies_df) print(accuracies_df)
``` ```
%% Output %% Output
accuracy accuracy
kNN-1 NaN kNN-1 NaN
kNN-10 NaN kNN-10 NaN
kNN-50 NaN kNN-50 NaN
Logistisk regresjon NaN Logistisk regresjon NaN
Naive Bayes NaN Naive Bayes NaN
Tilfeldig skog NaN Tilfeldig skog NaN
Supportvektormaskin NaN Supportvektormaskin NaN
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
I stedet for nøyaktighet er det vanlig å bruke log-loss, som tar hensyn til en probabilistisk prediksjon. Sorter de ulike modellene etter log-loss (`sklearn.metrics.log_loss`). I stedet for nøyaktighet er det vanlig å bruke log-loss, som tar hensyn til en probabilistisk prediksjon. Sorter de ulike modellene etter log-loss (`sklearn.metrics.log_loss`).
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Calculate log loss for each model on validation data # Calculate log loss for each model on validation data
losses = {name: log_loss(y_val, model.predict_proba(X_val)) losses = {name: log_loss(y_val, model.predict_proba(X_val))
for name, model in models.items() for name, model in models.items()
if hasattr(model, 'predict_proba')} if hasattr(model, 'predict_proba')}
# Convert dictionary to DataFrame and sort by loss # Convert dictionary to DataFrame and sort by loss
losses_df = pd.DataFrame(list(losses.items()), columns=['Model', 'Log-Loss']) losses_df = pd.DataFrame(list(losses.items()), columns=['Model', 'Log-Loss'])
losses_df.sort_values(by='Log-Loss', inplace=True) losses_df.sort_values(by='Log-Loss', inplace=True)
``` ```
%% Output %% Output
C:\Users\Ole\miniconda3\envs\INF161\lib\site-packages\sklearn\metrics\_classification.py:2916: UserWarning: The y_pred values do not sum to one. Starting from 1.5 thiswill result in an error. C:\Users\Ole\miniconda3\envs\INF161\lib\site-packages\sklearn\metrics\_classification.py:2916: UserWarning: The y_pred values do not sum to one. Starting from 1.5 thiswill result in an error.
warnings.warn( warnings.warn(
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Velg ut den beste modellen (basert på log-loss) og sjekk hvor godt den generaliserer ved å regne ut nøyaktighet og log-loss på testdata. Velg ut den beste modellen (basert på log-loss) og sjekk hvor godt den generaliserer ved å regne ut nøyaktighet og log-loss på testdata.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Choose the best model based on log-loss # Choose the best model based on log-loss
best_model = models[losses_df.iloc[0]['Model']] best_model = models[losses_df.iloc[0]['Model']]
# Evaluate its performance on test data # Evaluate its performance on test data
generalization_accuracy = accuracy_score(y_test, best_model.predict(X_test)) generalization_accuracy = accuracy_score(y_test, best_model.predict(X_test))
generalization_logloss = log_loss(y_test, best_model.predict_proba(X_test)[:, 1]) generalization_logloss = log_loss(y_test, best_model.predict_proba(X_test)[:, 1])
print('Nøyaktighet:', generalization_accuracy) print('Nøyaktighet:', generalization_accuracy)
print('Log-loss:', generalization_logloss) print('Log-loss:', generalization_logloss)
``` ```
%% Output %% Output
Nøyaktighet: 0.7407407407407407 Nøyaktighet: 0.8055555555555556
Log-loss: 1.1793389109115633 Log-loss: 0.4015737539957328
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Gi en oppsummering over hva du har gjort og hva resultatet var. Gi en oppsummering over hva du har gjort og hva resultatet var.
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
... ...
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment