From 535bd043bf00c41b3ec2dd7cea41bbb423d5399e Mon Sep 17 00:00:00 2001 From: davherdel Date: Thu, 12 Feb 2026 01:10:17 +0000 Subject: [PATCH] Finished lab --- lab-hyper-tuning.ipynb | 1944 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 1921 insertions(+), 23 deletions(-) diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..94cb22e 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -35,19 +35,26 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#Libraries\n", + "# Running this after several \"not defined\" error messages: \n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", "import pandas as pd\n", "import numpy as np\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "from sklearn.impute import SimpleImputer\n", + "\n", + "# First things first" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -200,7 +207,7 @@ "4 True " ] }, - "execution_count": 2, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -221,11 +228,46 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "# Keeping these in just one block for simplicity\n", + "# Target\n", + "y = spaceship[\"Transported\"].astype(int)\n", + "\n", + "# Features\n", + "X = spaceship.drop(columns=[\"Transported\", \"PassengerId\", \"Name\"])\n", + "\n", + "# Feature engineering\n", + "X[\"Deck\"] = X[\"Cabin\"].astype(str).str[0]\n", + "X = X.drop(columns=[\"Cabin\"])\n", + "\n", + "# Column types\n", + "num_cols = X.select_dtypes(include=[\"int64\", \"float64\"]).columns\n", + "cat_cols = 
X.select_dtypes(include=[\"object\", \"bool\"]).columns\n", + "\n", + "# Pipelines\n", + "num_pipeline = Pipeline([\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", StandardScaler())\n", + "])\n", + "\n", + "cat_pipeline = Pipeline([\n", + " (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n", + " (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\"))\n", + "])\n", + "\n", + "# Preprocessor\n", + "preprocessor = ColumnTransformer([\n", + " (\"num\", num_pipeline, num_cols),\n", + " (\"cat\", cat_pipeline, cat_cols)\n", + "])\n", + "\n", + "# Split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42, stratify=y\n", + ")\n" ] }, { @@ -237,11 +279,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "#your code here\n", + "param_grid = {\n", + " \"model__n_estimators\": [100, 200],\n", + " \"model__learning_rate\": [0.05, 0.1],\n", + " \"model__max_depth\": [3, 5]\n", + "}" ] }, { @@ -253,11 +300,35 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Baseline accuracy: 0.8039102932719954\n", + " precision recall f1-score support\n", + "\n", + " 0 0.82 0.77 0.80 863\n", + " 1 0.79 0.83 0.81 876\n", + "\n", + " accuracy 0.80 1739\n", + " macro avg 0.80 0.80 0.80 1739\n", + "weighted avg 0.80 0.80 0.80 1739\n", + "\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "\n", + "gb_model.fit(X_train, y_train)\n", + "\n", + "y_pred = gb_model.predict(X_test)\n", + "\n", + "print(\"Baseline accuracy:\", accuracy_score(y_test, y_pred))\n", + "print(classification_report(y_test, y_pred))" ] }, { @@ -283,11 +354,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Baseline accuracy: 0.8039102932719954\n", + " precision recall f1-score support\n", + "\n", + " 0 0.82 0.77 0.80 863\n", + " 1 0.79 0.83 0.81 876\n", + "\n", + " accuracy 0.80 1739\n", + " macro avg 0.80 0.80 0.80 1739\n", + "weighted avg 0.80 0.80 0.80 1739\n", + "\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "\n", + "gb_model.fit(X_train, y_train)\n", + "\n", + "y_pred = gb_model.predict(X_test)\n", + "\n", + "print(\"Baseline accuracy:\", accuracy_score(y_test, y_pred))\n", + "print(classification_report(y_test, y_pred))\n" ] }, { @@ -299,10 +394,1787 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('prep',\n",
+       "                                        ColumnTransformer(transformers=[('num',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='median')),\n",
+       "                                                                                         ('scaler',\n",
+       "                                                                                          StandardScaler())]),\n",
+       "                                                                         Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')),\n",
+       "                                                                        ('cat',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='most_frequent')),\n",
+       "                                                                                         ('onehot',\n",
+       "                                                                                          OneHotEncoder(handle_unknown='ignore'))]),\n",
+       "                                                                         Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck'], dtype='object'))])),\n",
+       "                                       ('model',\n",
+       "                                        GradientBoostingClassifier(random_state=42))]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'model__learning_rate': [0.05, 0.1],\n",
+       "                         'model__max_depth': [3, 5],\n",
+       "                         'model__n_estimators': [100, 200]},\n",
+       "             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('prep',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='median')),\n", + " ('scaler',\n", + " StandardScaler())]),\n", + " Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')),\n", + " ('cat',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='most_frequent')),\n", + " ('onehot',\n", + " OneHotEncoder(handle_unknown='ignore'))]),\n", + " Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck'], dtype='object'))])),\n", + " ('model',\n", + " GradientBoostingClassifier(random_state=42))]),\n", + " n_jobs=-1,\n", + " param_grid={'model__learning_rate': [0.05, 0.1],\n", + " 'model__max_depth': [3, 5],\n", + " 'model__n_estimators': [100, 200]},\n", + " scoring='accuracy')" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "grid_search = GridSearchCV(\n", + " gb_model,\n", + " param_grid,\n", + " cv=5,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search.fit(X_train, y_train)\n" + ] }, { "cell_type": "markdown", @@ -313,15 +2185,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best model accuracy: 0.8113858539390454\n", + " precision recall f1-score support\n", + "\n", + " 0 0.82 0.79 0.81 863\n", + " 1 0.80 0.83 0.82 876\n", + "\n", + " accuracy 0.81 1739\n", + " macro avg 0.81 0.81 0.81 1739\n", + "weighted avg 0.81 0.81 0.81 1739\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, classification_report\n", + "\n", + "best_model = grid_search.best_estimator_\n", + "\n", + "y_pred = best_model.predict(X_test)\n", 
+ "\n", + "print(\"Best model accuracy:\", accuracy_score(y_test, y_pred))\n", + "print(classification_report(y_test, y_pred))\n" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -335,7 +2233,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.12.12" } }, "nbformat": 4,