From aa96d3d9831484555b61cf750a8bfaa505a5c910 Mon Sep 17 00:00:00 2001 From: yann22ahlgrim Date: Thu, 13 Jul 2023 16:39:37 +0200 Subject: [PATCH] random forest model --- .../train_models/RF_model.ipynb | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 competitions/identify-age-related-conditions/train_models/RF_model.ipynb diff --git a/competitions/identify-age-related-conditions/train_models/RF_model.ipynb b/competitions/identify-age-related-conditions/train_models/RF_model.ipynb new file mode 100644 index 0000000..1f303b0 --- /dev/null +++ b/competitions/identify-age-related-conditions/train_models/RF_model.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "def load_dataset() -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n", + " train = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\train.csv\")\n", + " greeks = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\greeks.csv\")\n", + " test = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\test.csv\")\n", + " train.head()\n", + " return (train, greeks, test)" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [], + "source": [ + "def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n", + " X = df.loc[:, :\"Class\"]\n", + " y = df.loc[:, \"Class\"]\n", + " return train_test_split(X, y, test_size=split, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [], + "source": [ + "def build_pipeline(df: pd.DataFrame) -> ColumnTransformer:\n", + " # Preprocessing for numerical data \n", + " numerical_transformer = SimpleImputer(strategy='constant')\n", + " \n", + " # Preprocessing for categorical data\n", + " categorical_transformer = Pipeline(steps=[\n", + " ('imputer', SimpleImputer(strategy='most_frequent')),\n", + " ('onehot', OneHotEncoder(handle_unknown='ignore'))\n", + " ])\n", + "\n", + " # Bundle preprocessing for numerical and categorical data\n", + " numerical_cols = [cname for cname in df.columns if df[cname].dtype in [\"int64\", \"float64\"]]\n", + " categorical_cols = [cname for cname in df.columns if df[cname].nunique() < 10]\n", + " preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('num', numerical_transformer, numerical_cols),\n", + " ('cat', categorical_transformer, categorical_cols)\n", + " ])\n", + " return preprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X shape: (61, 58) and y shape: (61,)\n" + ] + } + ], + "source": [ + "# Define model\n", + "model = RandomForestClassifier(n_estimators=100, random_state=22)\n", + "\n", + "# Building Pipeline\n", + "train, greeks, test = load_dataset()\n", + "preprocessor = build_pipeline(train)\n", + "pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + "\n", + "#Preprocessing data\n", + "X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n", + "print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "556 out of 556 are right\n" + ] + } + ], + "source": [ + "pipeline.fit(X_train, y_train)\n", + "preds = pipeline.predict(X_valid)\n", + "score = accuracy_score(y_valid, preds, normalize=False)\n", + "print(f\"{score} out of {len(y_valid)} are right\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}