random forest model

2023-07-13 16:39:37 +02:00
parent 2ac083ed2f
commit aa96d3d983
1 changed files with 144 additions and 0 deletions
@@ -0,0 +1,144 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 116,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.preprocessing import OneHotEncoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 117,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_dataset(\n",
+    "    data_dir: str | Path = \"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\",\n",
+    ") -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:\n",
+    "    \"\"\"Read the competition CSV files into DataFrames.\n",
+    "\n",
+    "    Args:\n",
+    "        data_dir: Directory containing train.csv, greeks.csv and test.csv.\n",
+    "            Defaults to the original local checkout so existing calls keep working.\n",
+    "\n",
+    "    Returns:\n",
+    "        (train, greeks, test) in that order.\n",
+    "\n",
+    "    Raises:\n",
+    "        FileNotFoundError: If one of the CSV files is missing from data_dir.\n",
+    "    \"\"\"\n",
+    "    data_dir = Path(data_dir)\n",
+    "    train, greeks, test = (\n",
+    "        pd.read_csv(data_dir / f\"{name}.csv\") for name in (\"train\", \"greeks\", \"test\")\n",
+    "    )\n",
+    "    return (train, greeks, test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
+    "    X = df.loc[:, :\"Class\"]\n",
+    "    y = df.loc[:, \"Class\"]\n",
+    "    return train_test_split(X, y, test_size=split, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_pipeline(df: pd.DataFrame) -> ColumnTransformer:\n",
+    "    # Preprocessing for numerical data    \n",
+    "    numerical_transformer = SimpleImputer(strategy='constant')\n",
+    "    \n",
+    "    # Preprocessing for categorical data\n",
+    "    categorical_transformer = Pipeline(steps=[\n",
+    "        ('imputer', SimpleImputer(strategy='most_frequent')),\n",
+    "        ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
+    "    ])\n",
+    "\n",
+    "    # Bundle preprocessing for numerical and categorical data\n",
+    "    numerical_cols = [cname for cname in df.columns if df[cname].dtype in [\"int64\", \"float64\"]]\n",
+    "    categorical_cols = [cname for cname in df.columns if df[cname].nunique() < 10]\n",
+    "    preprocessor = ColumnTransformer(\n",
+    "        transformers=[\n",
+    "            ('num', numerical_transformer, numerical_cols),\n",
+    "            ('cat', categorical_transformer, categorical_cols)\n",
+    "        ])\n",
+    "    return preprocessor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "X shape: (61, 58) and y shape: (61,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the raw competition tables and carve out a 30% validation split.\n",
+    "train, greeks, test = load_dataset()\n",
+    "X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
+    "\n",
+    "# Preprocessing followed by a seeded random forest, chained as one estimator.\n",
+    "preprocessor = build_pipeline(train)\n",
+    "model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
+    "pipeline = Pipeline([\n",
+    "    ('preprocessor', preprocessor),\n",
+    "    ('model', model),\n",
+    "])\n",
+    "\n",
+    "# Sanity check on the training split dimensions.\n",
+    "print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "556 out of 556 are right\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Fit on the training split, then predict the held-out validation rows.\n",
+    "preds = pipeline.fit(X_train, y_train).predict(X_valid)\n",
+    "\n",
+    "# normalize=False yields the number of correct predictions rather than a ratio.\n",
+    "score = accuracy_score(y_true=y_valid, y_pred=preds, normalize=False)\n",
+    "print(f\"{score} out of {len(y_valid)} are right\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}