{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.model_selection import train_test_split, cross_val_score\n",
    "from sklearn.metrics import accuracy_score\n",
    "from xgboost import XGBClassifier\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset() -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
    "    train = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\train.csv\")\n",
    "    greeks = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\greeks.csv\")\n",
    "    test = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\test.csv\")\n",
    "    train.head()\n",
    "    return (train, greeks, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
    "    X = df.loc[:, df.columns != \"Class\"]\n",
    "    y = df.loc[:, \"Class\"]\n",
    "    return train_test_split(X, y, test_size=split, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_pipeline(df: pd.DataFrame) -> ColumnTransformer:\n",
    "    # Preprocessing for numerical data    \n",
    "    numerical_transformer = SimpleImputer(strategy='constant')\n",
    "    \n",
    "    # Preprocessing for categorical data\n",
    "    categorical_transformer = Pipeline(steps=[\n",
    "        ('imputer', SimpleImputer(strategy='most_frequent')),\n",
    "        ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
    "    ])\n",
    "\n",
    "    # Bundle preprocessing for numerical and categorical data\n",
    "    numerical_cols = [cname for cname in df.columns if df[cname].dtype in [\"int64\", \"float64\"]]\n",
    "    categorical_cols = [cname for cname in df.columns if df[cname].nunique() < 10]\n",
    "    preprocessor = ColumnTransformer(\n",
    "        transformers=[\n",
    "            ('num', numerical_transformer, numerical_cols),\n",
    "            ('cat', categorical_transformer, categorical_cols)\n",
    "        ])\n",
    "    return preprocessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset shape: (617, 58)\n",
      "X shape: (431, 57) and y shape: (431,)\n"
     ]
    }
   ],
   "source": [
    "# Define model\n",
    "#model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
    "model = XGBClassifier(n_estimators=500)\n",
    "\n",
    "#Splitting\n",
    "train, greeks, test = load_dataset()\n",
    "print(f\"dataset shape: {train.shape}\")\n",
    "X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
    "print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")\n",
    "\n",
    "# Building Pipeline\n",
    "preprocessor = build_pipeline(X_train)\n",
    "pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n",
    "                        ('model', model)\n",
    "                        ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of [0.94736842 0.89189189 0.91891892 0.97297297 0.89189189]\n"
     ]
    }
   ],
   "source": [
    "#Fit the Model and make preds\n",
    "pipeline.fit(X_train, y_train)\n",
    "preds = pipeline.predict(X_valid)\n",
    "score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\n",
    "print(f\"Accuracy of {score}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}