{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Random-forest baseline: identify-age-related-conditions\n",
    "\n",
    "Loads the competition CSVs, holds out part of `train.csv` for validation, and fits a preprocessing + `RandomForestClassifier` pipeline.\n",
    "\n",
    "Run with *Restart Kernel, then Run All*. Set `DATA_DIR` in the configuration cell to the folder that contains `train.csv`, `greeks.csv` and `test.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might need to tune lives here.\n",
    "# Folder holding train.csv, greeks.csv and test.csv; change it to match your machine.\n",
    "DATA_DIR = Path(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\")\n",
    "\n",
    "# Fraction of train.csv held out for validation.\n",
    "VALID_FRACTION = 0.3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset(data_dir=DATA_DIR) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:\n",
    "    \"\"\"Read the three competition CSV files.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_dir : str or Path, default DATA_DIR\n",
    "        Folder that contains train.csv, greeks.csv and test.csv.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of DataFrame\n",
    "        ``(train, greeks, test)``, in that order.\n",
    "    \"\"\"\n",
    "    data_dir = Path(data_dir)\n",
    "    train = pd.read_csv(data_dir / \"train.csv\")\n",
    "    greeks = pd.read_csv(data_dir / \"greeks.csv\")\n",
    "    test = pd.read_csv(data_dir / \"test.csv\")\n",
    "    return train, greeks, test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_data(df: pd.DataFrame, split, target: str = \"Class\") -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:\n",
    "    \"\"\"Separate features from the target and make a train/validation split.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame\n",
    "        Frame that contains the feature columns and the ``target`` column.\n",
    "    split : float or int\n",
    "        Passed to ``train_test_split`` as ``test_size``.\n",
    "    target : str, default \"Class\"\n",
    "        Name of the label column.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        ``(X_train, X_valid, y_train, y_valid)``.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    KeyError\n",
    "        If ``target`` is not a column of ``df``.\n",
    "    \"\"\"\n",
    "    # Use drop() instead of df.loc[:, :target]: label slices in .loc include the\n",
    "    # end label, which would leave the target inside X and leak it to the model.\n",
    "    X = df.drop(columns=[target])\n",
    "    y = df[target]\n",
    "    X_train, X_valid, y_train, y_valid = train_test_split(\n",
    "        X, y, test_size=split, random_state=42\n",
    "    )\n",
    "    return X_train, X_valid, y_train, y_valid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_pipeline(df: pd.DataFrame, max_cardinality: int = 10) -> ColumnTransformer:\n",
    "    \"\"\"Build the preprocessing step for the feature columns of ``df``.\n",
    "\n",
    "    Numeric columns are imputed with a constant. Non-numeric columns with fewer\n",
    "    than ``max_cardinality`` distinct values are imputed with the most frequent\n",
    "    value and one-hot encoded. Columns in neither group are not listed in any\n",
    "    transformer.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame\n",
    "        Feature frame only; it must not contain the target column, because\n",
    "        every selected column is expected to exist at fit time.\n",
    "    max_cardinality : int, default 10\n",
    "        Exclusive upper bound on distinct values for a categorical column.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    ColumnTransformer\n",
    "        Unfitted transformer with a ``num`` and a ``cat`` branch.\n",
    "    \"\"\"\n",
    "    # Preprocessing for numerical data\n",
    "    numerical_transformer = SimpleImputer(strategy='constant')\n",
    "\n",
    "    # Preprocessing for categorical data\n",
    "    categorical_transformer = Pipeline(steps=[\n",
    "        ('imputer', SimpleImputer(strategy='most_frequent')),\n",
    "        ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
    "    ])\n",
    "\n",
    "    numerical_cols = df.select_dtypes(include=\"number\").columns.tolist()\n",
    "    # Only non-numeric columns can be categorical, so the two lists stay disjoint;\n",
    "    # a cardinality test alone would also pick up low-cardinality numeric columns.\n",
    "    categorical_cols = [\n",
    "        cname for cname in df.columns\n",
    "        if cname not in numerical_cols and df[cname].nunique() < max_cardinality\n",
    "    ]\n",
    "\n",
    "    # Bundle preprocessing for numerical and categorical data\n",
    "    preprocessor = ColumnTransformer(\n",
    "        transformers=[\n",
    "            ('num', numerical_transformer, numerical_cols),\n",
    "            ('cat', categorical_transformer, categorical_cols)\n",
    "        ])\n",
    "    return preprocessor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train and validate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define model\n",
    "model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
    "\n",
    "# Load and split first so the preprocessor is built from feature columns only\n",
    "train, greeks, test = load_dataset()\n",
    "X_train, X_valid, y_train, y_valid = split_data(train, VALID_FRACTION)\n",
    "assert \"Class\" not in X_train.columns, \"target column leaked into the features\"\n",
    "\n",
    "# Building Pipeline\n",
    "preprocessor = build_pipeline(X_train)\n",
    "pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n",
    "                           ('model', model)\n",
    "                           ])\n",
    "\n",
    "print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline.fit(X_train, y_train)\n",
    "preds = pipeline.predict(X_valid)\n",
    "# normalize=False returns the number of correct predictions instead of a fraction\n",
    "score = accuracy_score(y_valid, preds, normalize=False)\n",
    "print(f\"{score} out of {len(y_valid)} are right\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}