691 lines
32 KiB
Plaintext
691 lines
32 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 92,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
"from sklearn.impute import SimpleImputer\n",
|
|
"from sklearn.pipeline import Pipeline\n",
|
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|
"from sklearn.compose import ColumnTransformer\n",
|
|
"from sklearn.model_selection import train_test_split, cross_val_score\n",
|
|
"from sklearn.metrics import accuracy_score\n",
|
|
"from xgboost import XGBClassifier\n",
|
|
"import pandas as pd\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"from sklearn.feature_selection import mutual_info_regression\n",
|
|
"from sklearn.discriminant_analysis import StandardScaler\n",
|
|
"import numpy as np\n",
|
|
"from sklearn.metrics import classification_report\n",
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|
"from sklearn.model_selection import train_test_split, ParameterGrid\n",
|
|
"from keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler\n",
|
|
"from keras.models import Model\n",
|
|
"from keras.layers import Activation, Dense, LSTM, Input\n",
|
|
"from keras.optimizers import Adam, RMSprop, SGD\n",
|
|
"from scikeras.wrappers import KerasClassifier\n",
|
|
"import tensorflow as tf\n",
|
|
"from os import path"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to load the dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 93,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def load_dataset(columns_drop) -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
|
|
" train = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\train.csv\")\n",
|
|
" greeks = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\greeks.csv\")\n",
|
|
" test = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\test.csv\")\n",
|
|
" columns_drop = ['Id'] + columns_drop\n",
|
|
" id_list = test[\"Id\"]\n",
|
|
" train.drop(columns_drop, inplace=True, axis=1)\n",
|
|
" test.drop(columns_drop, inplace=True, axis=1)\n",
|
|
" print(len(train.columns))\n",
|
|
" return (train, greeks, test, id_list)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to split the data in validation and train set randomly"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 94,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
|
|
" X = df.loc[:, df.columns != \"Class\"]\n",
|
|
" y = df.loc[:, \"Class\"]\n",
|
|
" return train_test_split(X, y, test_size=split, random_state=42)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to build a Tensorflow model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 95,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def build_tensorflow_model(input_shape:int, output_shape:int, units1: int, units2: int, units3: int, activation1: str, \n",
|
|
" activation2: str, activation3: str, optimizer: tf.keras.optimizers.Optimizer, learning_rate: float) -> Model:\n",
|
|
" input = Input(shape=input_shape)\n",
|
|
" x = Dense(units=units1, activation=activation1)(input)\n",
|
|
" x = Dense(units=units2, activation=activation2)(x)\n",
|
|
" x = Dense(units=units3, activation=activation3)(x)\n",
|
|
" output = Dense(units=output_shape, activation=\"softmax\")(x)\n",
|
|
" model = Model(inputs=[input], outputs=[output])\n",
|
|
" \n",
|
|
" model.compile(loss=\"categorical_crossentropy\",\n",
|
|
" optimizer=optimizer(learning_rate=learning_rate),\n",
|
|
" metrics=[\"accuracy\"]) \n",
|
|
" return model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to plot the accuracy of the model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 96,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def plot_acc_tf_model(history:Model):\n",
|
|
" # summarize history for accuracy\n",
|
|
" plt.plot(history.history['accuracy'])\n",
|
|
" plt.plot(history.history['val_accuracy'])\n",
|
|
" plt.title('model accuracy')\n",
|
|
" plt.ylabel('accuracy')\n",
|
|
" plt.xlabel('epoch')\n",
|
|
" plt.legend(['Train', 'Validation'], loc='upper left')\n",
|
|
" plt.show()\n",
|
|
" \n",
|
|
" # summarize history for loss\n",
|
|
" plt.plot(history.history['loss'])\n",
|
|
" plt.plot(history.history['val_loss'])\n",
|
|
" plt.title('model loss')\n",
|
|
" plt.ylabel('loss')\n",
|
|
" plt.xlabel('epoch')\n",
|
|
" plt.legend(['Train', 'Validation'], loc='upper left')\n",
|
|
" plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to fit the Tensorflow model with ES Callback"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 97,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"es_callback = EarlyStopping(\n",
|
|
" monitor=\"val_accuracy\",\n",
|
|
" patience=5,\n",
|
|
" verbose=1,\n",
|
|
" restore_best_weights=True,\n",
|
|
" min_delta=0.005\n",
|
|
" )\n",
|
|
" \n",
|
|
"def fit_model(model: Model, x: np.ndarray, y: np.ndarray, epochs: int, split: float) -> Model:\n",
|
|
" #split train and validation\n",
|
|
" x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=split, random_state=42)\n",
|
|
" #fit the model\n",
|
|
" history = model.fit(x_train, y_train, epochs=epochs, validation_data=(x_val,y_val), callbacks=[es_callback])\n",
|
|
" return history "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method for GridSearch of Tensorflow model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 98,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"grid_params = {\"units1\": [8], \"units2\": [16,32], \"units3\": [64,128], \"activation1\": [\"relu\"], \"activation2\": [\"relu\"], \n",
|
|
" \"activation3\": [\"relu\"], \"optimizer\": [Adam, RMSprop, SGD], \"learning_rate\": [0.001]}\n",
|
|
"\n",
|
|
"#GridSearch\n",
|
|
"def grid_search_tf_model(X_train: pd.DataFrame, y_train: pd.DataFrame)->Model:\n",
|
|
" grid = ParameterGrid(param_grid = grid_params)\n",
|
|
" results = []\n",
|
|
" input_shape = len(X_train[1])\n",
|
|
" output_shape = 2\n",
|
|
" for idx,params in enumerate(grid):\n",
|
|
" model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **params)\n",
|
|
" history = fit_model(model, X_train, y_train, 100, 0.2)\n",
|
|
" val_loss = history.history['val_loss'][-1] \n",
|
|
" val_acc = history.history['val_accuracy'][-1]\n",
|
|
" results.append([val_loss, val_acc])\n",
|
|
" \n",
|
|
" val_accuracies = [i[1] for i in results]\n",
|
|
" val_losses= [i[0] for i in results]\n",
|
|
" best_acc = val_accuracies.index(max(val_accuracies))\n",
|
|
" best_loss = val_losses.index(min(val_losses))\n",
|
|
" print(f\"best acc at index {best_acc}: {max(val_accuracies)}\")\n",
|
|
" print(f\"best loss at index {best_loss}: {min(val_losses)}\")\n",
|
|
" print(grid[best_acc])\n",
|
|
" \n",
|
|
" model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **grid[best_acc])\n",
|
|
" history = fit_model(model, X_train, y_train, 100, 0.2)\n",
|
|
" #plot_acc_tf_model(history)\n",
|
|
" return model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to build a preprocessing pipeline"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 99,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def build_preprocessing_pipeline(df: pd.DataFrame) -> ColumnTransformer:\n",
|
|
" # Preprocessing for numerical data \n",
|
|
" numerical_transformer = Pipeline(steps=[\n",
|
|
" ('imputer',SimpleImputer(strategy='constant')),\n",
|
|
" ('scaler', StandardScaler())])\n",
|
|
" \n",
|
|
" # Preprocessing for categorical data\n",
|
|
" categorical_transformer = Pipeline(steps=[\n",
|
|
" ('imputer', SimpleImputer(strategy='most_frequent')),\n",
|
|
" ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
|
|
" ])\n",
|
|
"\n",
|
|
" # Bundle preprocessing for numerical and categorical data\n",
|
|
" numerical_cols = [cname for cname in df.columns if df[cname].dtype in [\"int64\", \"float64\"]]\n",
|
|
" categorical_cols = [cname for cname in df.columns if df[cname].nunique() < 10]\n",
|
|
" preprocessor = ColumnTransformer(\n",
|
|
" transformers=[\n",
|
|
" ('num', numerical_transformer, numerical_cols),\n",
|
|
" ('cat', categorical_transformer, categorical_cols)\n",
|
|
" ])\n",
|
|
" print(f\"Number of columns: {len(df.columns)}\")\n",
|
|
" return preprocessor"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to generate the Mutual Info scores and plot them"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 100,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"def make_mi_scores(X: pd.DataFrame, y: pd.DataFrame, discrete_features: list):\n",
|
|
" mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)\n",
|
|
" mi_scores = pd.Series(mi_scores, name=\"MI Scores\", index=X.columns)\n",
|
|
" mi_scores = mi_scores.sort_values(ascending=False)\n",
|
|
" return mi_scores\n",
|
|
"\n",
|
|
"def plot_mi_scores(scores):\n",
|
|
" scores = scores.sort_values(ascending=True)\n",
|
|
" width = np.arange(len(scores))\n",
|
|
" ticks = list(scores.index)\n",
|
|
" plt.figure(dpi=100, figsize=(16, 16))\n",
|
|
" plt.barh(width, scores)\n",
|
|
" plt.yticks(width, ticks)\n",
|
|
" plt.title(\"Mutual Information Scores\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Display MI scores that are beneath 0.01"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 101,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"57\n",
|
|
"dataset shape: (617, 57)\n",
|
|
"X shape: (431, 56) and y shape: (431,)\n",
|
|
" Columns with MI equal zero: ['DH', 'CC', 'DN', 'BR', 'CL', 'EG', 'CD ', 'AZ', 'BD ', 'CB', 'GB', 'CF', 'EJ', 'CU', 'CW ', 'DE', 'DF', 'DY', 'AB'] --> total length: 19\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"#get the mutual information of features\n",
|
|
"train, greeks, test, id_list = load_dataset(columns_drop=[])\n",
|
|
"print(f\"dataset shape: {train.shape}\")\n",
|
|
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
|
|
"print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")\n",
|
|
"\n",
|
|
"cols = list(X_train.columns)\n",
|
|
"cols.append(\"1\")\n",
|
|
"X_transformed = X_train.fillna(0)\n",
|
|
"X_transformed[\"EJ\"].replace(['A', 'B'], [0, 1], inplace=True)\n",
|
|
"discrete_features = X_transformed.dtypes == int\n",
|
|
"mi_scores = make_mi_scores(X_transformed, y_train, discrete_features)\n",
|
|
"bad_scores = list(mi_scores.index[i] for i, score in zip(range(len(mi_scores)),mi_scores) if score < 0.01)\n",
|
|
"#plot_mi_scores(mi_scores)\n",
|
|
"print(f\" Columns with MI equal zero: {bad_scores} --> total length: {len(bad_scores)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Create a model and preprocessor"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 102,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"38\n",
|
|
"dataset shape: (617, 38)\n",
|
|
"Number of columns: 37\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 16ms/step - loss: 0.6856 - accuracy: 0.5901 - val_loss: 0.6137 - val_accuracy: 0.7816\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5953 - accuracy: 0.8227 - val_loss: 0.5401 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5448 - accuracy: 0.8372 - val_loss: 0.4961 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5096 - accuracy: 0.8372 - val_loss: 0.4673 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4853 - accuracy: 0.8372 - val_loss: 0.4464 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4643 - accuracy: 0.8372 - val_loss: 0.4334 - val_accuracy: 0.8391\n",
|
|
"Epoch 7/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.4852 - accuracy: 0.8125Restoring model weights from the end of the best epoch: 2.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4456 - accuracy: 0.8372 - val_loss: 0.4228 - val_accuracy: 0.8391\n",
|
|
"Epoch 7: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 15ms/step - loss: 0.6956 - accuracy: 0.5959 - val_loss: 0.6155 - val_accuracy: 0.8391\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.6004 - accuracy: 0.8256 - val_loss: 0.5349 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5545 - accuracy: 0.8401 - val_loss: 0.4919 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5145 - accuracy: 0.8401 - val_loss: 0.4642 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4748 - accuracy: 0.8459 - val_loss: 0.4483 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.4367 - accuracy: 0.8750Restoring model weights from the end of the best epoch: 1.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4351 - accuracy: 0.8459 - val_loss: 0.4269 - val_accuracy: 0.8391\n",
|
|
"Epoch 6: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 16ms/step - loss: 0.7086 - accuracy: 0.4419 - val_loss: 0.6189 - val_accuracy: 0.8276\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5895 - accuracy: 0.8459 - val_loss: 0.5260 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5449 - accuracy: 0.8372 - val_loss: 0.4829 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5045 - accuracy: 0.8401 - val_loss: 0.4571 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4679 - accuracy: 0.8430 - val_loss: 0.4363 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4300 - accuracy: 0.8430 - val_loss: 0.4170 - val_accuracy: 0.8391\n",
|
|
"Epoch 7/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.3338 - accuracy: 0.9062Restoring model weights from the end of the best epoch: 2.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3985 - accuracy: 0.8605 - val_loss: 0.3954 - val_accuracy: 0.8391\n",
|
|
"Epoch 7: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 16ms/step - loss: 0.6396 - accuracy: 0.8081 - val_loss: 0.5624 - val_accuracy: 0.8391\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5573 - accuracy: 0.8372 - val_loss: 0.5018 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4990 - accuracy: 0.8372 - val_loss: 0.4580 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4496 - accuracy: 0.8343 - val_loss: 0.4293 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4051 - accuracy: 0.8459 - val_loss: 0.4132 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3665 - accuracy: 0.8634 - val_loss: 0.4049 - val_accuracy: 0.8506\n",
|
|
"Epoch 7/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.3397 - accuracy: 0.8837 - val_loss: 0.3967 - val_accuracy: 0.8506\n",
|
|
"Epoch 8/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.3178 - accuracy: 0.9041 - val_loss: 0.3961 - val_accuracy: 0.8506\n",
|
|
"Epoch 9/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.3018 - accuracy: 0.9099 - val_loss: 0.3948 - val_accuracy: 0.8506\n",
|
|
"Epoch 10/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.2875 - accuracy: 0.9099 - val_loss: 0.3997 - val_accuracy: 0.8506\n",
|
|
"Epoch 11/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.3437 - accuracy: 0.9062Restoring model weights from the end of the best epoch: 6.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.2751 - accuracy: 0.9099 - val_loss: 0.4019 - val_accuracy: 0.8506\n",
|
|
"Epoch 11: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 16ms/step - loss: 0.8969 - accuracy: 0.2297 - val_loss: 0.7597 - val_accuracy: 0.4713\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.7185 - accuracy: 0.5436 - val_loss: 0.6301 - val_accuracy: 0.7471\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.6123 - accuracy: 0.7762 - val_loss: 0.5465 - val_accuracy: 0.7586\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5408 - accuracy: 0.8401 - val_loss: 0.4930 - val_accuracy: 0.8276\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4907 - accuracy: 0.8459 - val_loss: 0.4590 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4501 - accuracy: 0.8430 - val_loss: 0.4371 - val_accuracy: 0.8391\n",
|
|
"Epoch 7/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4202 - accuracy: 0.8459 - val_loss: 0.4190 - val_accuracy: 0.8391\n",
|
|
"Epoch 8/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.3892 - accuracy: 0.8488 - val_loss: 0.4064 - val_accuracy: 0.8391\n",
|
|
"Epoch 9/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.3649 - accuracy: 0.8576 - val_loss: 0.3965 - val_accuracy: 0.8391\n",
|
|
"Epoch 10/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.2900 - accuracy: 0.9375Restoring model weights from the end of the best epoch: 5.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3420 - accuracy: 0.8692 - val_loss: 0.3873 - val_accuracy: 0.8391\n",
|
|
"Epoch 10: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 16ms/step - loss: 0.6335 - accuracy: 0.7849 - val_loss: 0.5528 - val_accuracy: 0.8391\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5477 - accuracy: 0.8372 - val_loss: 0.4920 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4883 - accuracy: 0.8401 - val_loss: 0.4514 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4435 - accuracy: 0.8430 - val_loss: 0.4220 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4014 - accuracy: 0.8459 - val_loss: 0.3975 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3657 - accuracy: 0.8488 - val_loss: 0.3775 - val_accuracy: 0.8506\n",
|
|
"Epoch 7/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.3343 - accuracy: 0.8721 - val_loss: 0.3609 - val_accuracy: 0.8506\n",
|
|
"Epoch 8/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.3069 - accuracy: 0.8837 - val_loss: 0.3547 - val_accuracy: 0.8506\n",
|
|
"Epoch 9/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.2838 - accuracy: 0.8837 - val_loss: 0.3453 - val_accuracy: 0.8506\n",
|
|
"Epoch 10/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.2615 - accuracy: 0.8924 - val_loss: 0.3440 - val_accuracy: 0.8621\n",
|
|
"Epoch 11/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.2444 - accuracy: 0.9070 - val_loss: 0.3438 - val_accuracy: 0.8621\n",
|
|
"Epoch 12/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.2285 - accuracy: 0.9128 - val_loss: 0.3447 - val_accuracy: 0.8621\n",
|
|
"Epoch 13/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.2165 - accuracy: 0.9186 - val_loss: 0.3427 - val_accuracy: 0.8621\n",
|
|
"Epoch 14/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.2012 - accuracy: 0.9331 - val_loss: 0.3522 - val_accuracy: 0.8621\n",
|
|
"Epoch 15/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.1392 - accuracy: 0.9688Restoring model weights from the end of the best epoch: 10.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.1896 - accuracy: 0.9360 - val_loss: 0.3620 - val_accuracy: 0.8621\n",
|
|
"Epoch 15: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 16ms/step - loss: 0.7918 - accuracy: 0.3140 - val_loss: 0.6742 - val_accuracy: 0.6437\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.6316 - accuracy: 0.7297 - val_loss: 0.5706 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 5ms/step - loss: 0.5542 - accuracy: 0.8256 - val_loss: 0.5122 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4967 - accuracy: 0.8401 - val_loss: 0.4750 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4508 - accuracy: 0.8401 - val_loss: 0.4540 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4100 - accuracy: 0.8430 - val_loss: 0.4411 - val_accuracy: 0.8391\n",
|
|
"Epoch 7/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.4157 - accuracy: 0.8125Restoring model weights from the end of the best epoch: 2.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3695 - accuracy: 0.8517 - val_loss: 0.4333 - val_accuracy: 0.8391\n",
|
|
"Epoch 7: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 15ms/step - loss: 0.8143 - accuracy: 0.2994 - val_loss: 0.6333 - val_accuracy: 0.7586\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5870 - accuracy: 0.8227 - val_loss: 0.5004 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5142 - accuracy: 0.8372 - val_loss: 0.4455 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4769 - accuracy: 0.8372 - val_loss: 0.4203 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4411 - accuracy: 0.8372 - val_loss: 0.4046 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4000 - accuracy: 0.8372 - val_loss: 0.3902 - val_accuracy: 0.8391\n",
|
|
"Epoch 7/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.2600 - accuracy: 0.9375Restoring model weights from the end of the best epoch: 2.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3656 - accuracy: 0.8430 - val_loss: 0.3776 - val_accuracy: 0.8391\n",
|
|
"Epoch 7: early stopping\n",
|
|
"best acc at index 5: 0.8620689511299133\n",
|
|
"best loss at index 5: 0.3620067536830902\n",
|
|
"{'units3': 64, 'units2': 16, 'units1': 16, 'optimizer': <class 'keras.optimizers.legacy.adam.Adam'>, 'learning_rate': 0.001, 'activation3': 'relu', 'activation2': 'relu', 'activation1': 'relu'}\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 15ms/step - loss: 0.6786 - accuracy: 0.6453 - val_loss: 0.5829 - val_accuracy: 0.8276\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5685 - accuracy: 0.8488 - val_loss: 0.5142 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 5ms/step - loss: 0.5102 - accuracy: 0.8372 - val_loss: 0.4789 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4728 - accuracy: 0.8401 - val_loss: 0.4561 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4380 - accuracy: 0.8488 - val_loss: 0.4398 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4107 - accuracy: 0.8547 - val_loss: 0.4282 - val_accuracy: 0.8391\n",
|
|
"Epoch 7/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.3892 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 2.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3847 - accuracy: 0.8576 - val_loss: 0.4165 - val_accuracy: 0.8391\n",
|
|
"Epoch 7: early stopping\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"#Newly load the dataset with columns drop and create preprocessing pipeline\n",
|
|
"train, greeks, test, id_list = load_dataset(columns_drop=bad_scores)\n",
|
|
"print(f\"dataset shape: {train.shape}\")\n",
|
|
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
|
|
"preprocessor = build_preprocessing_pipeline(X_train)\n",
|
|
"\n",
|
|
"\n",
|
|
"# Define models for the pipeline\n",
|
|
"preprocessor.fit(X_train)\n",
|
|
"X_preprocessed = preprocessor.transform(X_train)\n",
|
|
"y_train_ohe = pd.get_dummies(y_train, columns = ['Class'])\n",
|
|
"y_valid_ohe = pd.get_dummies(y_valid, columns = ['Class'])\n",
|
|
"model_keras = KerasClassifier(model=grid_search_tf_model(X_train=X_preprocessed, y_train=y_train_ohe), epochs=0)\n",
|
|
"model_rf = RandomForestClassifier(n_estimators=100, random_state=22)\n",
|
|
"model_xgb = XGBClassifier(n_estimators=500)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Build the final pipeline with preprocessor and model, fit it and display accuracy score"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 103,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Building Pipeline\n",
|
|
"def fit_pipeline(X_train: pd.DataFrame, y_train: pd.DataFrame, preprocessor, model) -> Pipeline:\n",
|
|
" pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n",
|
|
" ('model', model)\n",
|
|
" ])\n",
|
|
" #Fit the Model and make preds\n",
|
|
" pipeline.fit(X_train, y_train)\n",
|
|
" preds = pipeline.predict(X_valid)\n",
|
|
" \"\"\" score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\n",
|
|
" print(f\"Accuracy of {score}\") \"\"\"\n",
|
|
" correct_answers = 0\n",
|
|
" for y_pred,y_true in zip(preds,y_valid_ohe.to_numpy()):\n",
|
|
" if(y_pred[0] == y_true[0]):correct_answers+=1\n",
|
|
" print(correct_answers/len(preds))\n",
|
|
" return pipeline"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Print results of the model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 104,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"6/6 [==============================] - 0s 1ms/step\n",
|
|
"0.7849462365591398\n",
|
|
"0.9354838709677419\n",
|
|
"0.946236559139785\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"pipeline_keras = fit_pipeline(X_train, y_train_ohe, preprocessor, model_keras)\n",
|
|
"pipeline_rf = fit_pipeline(X_train, y_train_ohe, preprocessor, model_rf)\n",
|
|
"pipeline_xgb = fit_pipeline(X_train, y_train_ohe, preprocessor, model_xgb)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Combine the fitted models to look if the accuracy improves"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 108,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"6/6 [==============================] - 0s 799us/step\n",
|
|
"0.9247311827956989\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"preds1 = pipeline_keras.predict(X_valid)\n",
|
|
"preds2 = pipeline_rf.predict(X_valid)\n",
|
|
"preds3 = pipeline_xgb.predict(X_valid)\n",
|
|
"correct_answers = 0\n",
|
|
"preds = [[0,0] for _ in range(len(preds1))]\n",
|
|
"for y_pred1,y_pred2,y_pred3, i in zip(preds1,preds2,preds3, range(len(preds1))):\n",
|
|
" count_class1 = y_pred1[0] + y_pred2[0] + y_pred3[0]\n",
|
|
" count_class2 = y_pred1[1] + y_pred2[1] + y_pred3[1]\n",
|
|
" if(count_class1 > count_class2):\n",
|
|
" preds[i][0] = 1\n",
|
|
" preds[i][1] = 0\n",
|
|
" else:\n",
|
|
" preds[i][0] = 0\n",
|
|
" preds[i][1] = 1\n",
|
|
"for y_pred,y_true in zip(preds,y_valid_ohe.to_numpy()):\n",
|
|
" if(y_pred[0] == y_true[0]):correct_answers+=1\n",
|
|
"print(correct_answers/len(preds))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Submission"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"' submission = pd.DataFrame()\\nprediction = model.predict(x_test)\\nsubmission.insert(0, \"Id\", id_number, False)\\nsubmission.insert(1, \"class_0\", [round(1-i[0],2) for i in prediction], True)\\nsubmission.insert(2, \"class_1\", [round(i[0],2) for i in prediction], True)\\nsubmission.to_csv(\"/kaggle/working/submission.csv\",index = False) '"
|
|
]
|
|
},
|
|
"execution_count": 71,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"\"\"\" submission = pd.DataFrame()\n",
|
|
"prediction = model.predict(x_test)\n",
|
|
"submission.insert(0, \"Id\", id_number, False)\n",
|
|
"submission.insert(1, \"class_0\", [round(1-i[0],2) for i in prediction], True)\n",
|
|
"submission.insert(2, \"class_1\", [round(i[0],2) for i in prediction], True)\n",
|
|
"submission.to_csv(\"/kaggle/working/submission.csv\",index = False) \"\"\""
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "base",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.9"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|