random forest model

This commit is contained in:
yann22ahlgrim
2023-07-13 16:39:37 +02:00
parent 2ac083ed2f
commit aa96d3d983
@@ -0,0 +1,144 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"# Default folder of the competition files; pass `data_dir` to load_dataset() to read from elsewhere.\n",
"DATA_DIR = \"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\"\n",
"\n",
"\n",
"def load_dataset(data_dir: str = DATA_DIR) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:\n",
"    \"\"\"Read the three competition CSV files into DataFrames.\n",
"\n",
"    Args:\n",
"        data_dir: Folder containing train.csv, greeks.csv and test.csv.\n",
"            Defaults to DATA_DIR, so a plain load_dataset() call still works.\n",
"\n",
"    Returns:\n",
"        The tuple (train, greeks, test), one DataFrame per file.\n",
"    \"\"\"\n",
"    from pathlib import Path  # local import: only this function needs it\n",
"\n",
"    folder = Path(data_dir)\n",
"    train = pd.read_csv(folder / \"train.csv\")\n",
"    greeks = pd.read_csv(folder / \"greeks.csv\")\n",
"    test = pd.read_csv(folder / \"test.csv\")\n",
"    return (train, greeks, test)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"def split_data(df: pd.DataFrame, split: float | int) -> list[pd.DataFrame | pd.Series]:\n",
"    \"\"\"Split `df` into a training part and a validation part.\n",
"\n",
"    Args:\n",
"        df: Frame containing a `Class` column, which is used as the target.\n",
"        split: Forwarded to `train_test_split` as `test_size`.\n",
"\n",
"    Returns:\n",
"        The list `[X_train, X_valid, y_train, y_valid]` produced by\n",
"        `train_test_split`: the X parts are DataFrames, the y parts are Series.\n",
"    \"\"\"\n",
"    # NOTE(review): `.loc` label slices include the end label, so X still\n",
"    # contains the `Class` target column. It is kept so the returned frames\n",
"    # keep the columns existing callers expect; whatever selects the model\n",
"    # features must leave `Class` out, otherwise the target leaks into training.\n",
"    X = df.loc[:, :\"Class\"]\n",
"    y = df[\"Class\"]\n",
"    # Fixed seed: the same rows end up in each part on every run.\n",
"    return train_test_split(X, y, test_size=split, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"def build_pipeline(df: pd.DataFrame, target: str = \"Class\") -> ColumnTransformer:\n",
"    \"\"\"Build the preprocessing step for the feature columns of `df`.\n",
"\n",
"    Args:\n",
"        df: Frame whose column names and dtypes decide how each column is treated.\n",
"        target: Name of the label column. It is never selected as a feature, so\n",
"            the label cannot reach the model even when `df` still contains it.\n",
"\n",
"    Returns:\n",
"        An unfitted ColumnTransformer with a `num` branch and a `cat` branch.\n",
"        Columns listed in neither branch are dropped by the transformer.\n",
"    \"\"\"\n",
"    # Numerical data: missing values are replaced by a constant\n",
"    numerical_transformer = SimpleImputer(strategy='constant')\n",
"\n",
"    # Categorical data: fill gaps with the most frequent value, then one-hot\n",
"    # encode; categories not seen during fit are ignored when transforming\n",
"    categorical_transformer = Pipeline(steps=[\n",
"        ('imputer', SimpleImputer(strategy='most_frequent')),\n",
"        ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
"    ])\n",
"\n",
"    # The label is excluded before any column is assigned to a branch\n",
"    feature_cols = [cname for cname in df.columns if cname != target]\n",
"    numeric_dtypes = [\"int64\", \"float64\"]\n",
"    numerical_cols = [cname for cname in feature_cols if df[cname].dtype in numeric_dtypes]\n",
"    # Only non-numerical columns with few distinct values are one-hot encoded,\n",
"    # so no column is routed through both branches\n",
"    categorical_cols = [\n",
"        cname for cname in feature_cols\n",
"        if df[cname].dtype not in numeric_dtypes and df[cname].nunique() < 10\n",
"    ]\n",
"\n",
"    # Bundle preprocessing for numerical and categorical data\n",
"    preprocessor = ColumnTransformer(\n",
"        transformers=[\n",
"            ('num', numerical_transformer, numerical_cols),\n",
"            ('cat', categorical_transformer, categorical_cols)\n",
"        ])\n",
"    return preprocessor"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X shape: (61, 58) and y shape: (61,)\n"
]
}
],
"source": [
"# Define model\n",
"model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
"\n",
"# Building Pipeline\n",
"train, greeks, test = load_dataset()\n",
"# The preprocessor is built from the feature columns only. Columns it does not\n",
"# list are dropped before the model sees them, so the `Class` label can no\n",
"# longer be used as an input even if it is still present in X.\n",
"preprocessor = build_pipeline(train.drop(columns=[\"Class\"]))\n",
"pipeline = Pipeline(steps=[\n",
"    ('preprocessor', preprocessor),\n",
"    ('model', model),\n",
"])\n",
"\n",
"# Splitting data: 30% of the rows are held out for validation\n",
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
"print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"556 out of 556 are right\n"
]
}
],
"source": [
"# Fit preprocessing and model on the training part, then predict the held-out rows\n",
"preds = pipeline.fit(X_train, y_train).predict(X_valid)\n",
"\n",
"# normalize=False returns the number of correct predictions instead of a fraction\n",
"score = accuracy_score(y_true=y_valid, y_pred=preds, normalize=False)\n",
"print(f\"{score} out of {len(y_valid)} are right\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}