From 29386aae99912fb30a58e0cd5f6c10782de67d69 Mon Sep 17 00:00:00 2001 From: bperezlovisolo Date: Thu, 26 Feb 2026 12:26:13 +0100 Subject: [PATCH] lab-hyper-tuning --- lab-hyper-tuning.ipynb | 3229 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 3204 insertions(+), 25 deletions(-) diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..77c1e61 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -219,90 +219,3253 @@ "- Feature Selection\n" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PassengerId 0\n", + "HomePlanet 0\n", + "CryoSleep 0\n", + "Cabin 0\n", + "Destination 0\n", + "Age 0\n", + "VIP 0\n", + "RoomService 0\n", + "FoodCourt 0\n", + "ShoppingMall 0\n", + "Spa 0\n", + "VRDeck 0\n", + "Name 0\n", + "Transported 0\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#your code here\n", + "spaceship = spaceship.dropna()\n", + "spaceship.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['B', 'F', 'A', 'G', 'E', 'C', 'D', 'T'], dtype=object)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spaceship[\"Cabin\"] = spaceship[\"Cabin\"].str.split(\"/\").str[0]\n", + "spaceship[\"Cabin\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "spaceship.drop(columns=[\"PassengerId\", \"Name\"], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Transported\n", + "True 0.503633\n", + "False 0.496367\n", + "Name: proportion, dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"spaceship[\"Transported\"].value_counts(normalize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',\n", + " 'Transported', 'HomePlanet_Earth', 'HomePlanet_Europa',\n", + " 'HomePlanet_Mars', 'CryoSleep_False', 'CryoSleep_True', 'Cabin_A',\n", + " 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G',\n", + " 'Cabin_T', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22',\n", + " 'Destination_TRAPPIST-1e', 'VIP_False', 'VIP_True'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "columns_to_encode = spaceship.select_dtypes(include=\"object\").columns\n", + "spaceship_encoded = pd.get_dummies(spaceship, columns=columns_to_encode, dtype=int)\n", + "print(spaceship_encoded.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Now let's use the best model we got so far in order to see how it can improve when we fine-tune its hyperparameters."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "#your code here\n", + "features = spaceship_encoded.drop(columns=[\"Transported\"])\n", + "target = spaceship_encoded[\"Transported\"].astype(int)" + ] + }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import MinMaxScaler" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "normalizer = MinMaxScaler()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
MinMaxScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "MinMaxScaler()" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "normalizer.fit(X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_scaled = normalizer.transform(X_train) \n", + "X_test_scaled = normalizer.transform(X_test) " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "forest = RandomForestClassifier(n_estimators=100,\n", + " max_depth=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
RandomForestClassifier(max_depth=20)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestClassifier(max_depth=20)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "forest.fit(X_train_scaled, y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters." + "- Evaluate your model" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7851739788199698\n", + "Precision: 0.7853019634781613\n", + "Recall: 0.7851739788199698\n", + "F1: 0.7851498837188009\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", + "\n", + "pred_rf = forest.predict(X_test_scaled)\n", + "\n", + "print(\"Accuracy:\", accuracy_score(y_test, pred_rf))\n", + "print(\"Precision:\", precision_score(y_test, pred_rf, average=\"macro\"))\n", + "print(\"Recall:\", recall_score(y_test, pred_rf, average=\"macro\"))\n", + "print(\"F1:\", f1_score(y_test, pred_rf, average=\"macro\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Evaluate your model" + "**Grid/Random Search**" ] }, { - "cell_type": "code", - "execution_count": 1, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "#your code here" + "For this lab we will use Grid Search." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Grid/Random Search**" + "- Define hyperparameters to fine tune." ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 17, "metadata": {}, + "outputs": [], "source": [ - "For this lab we will use Grid Search." 
+ "#your code here\n", + "param_grid = {\n", + " 'n_estimators': [100, 200, 300],\n", + " 'max_depth': [None, 10, 20],\n", + " 'min_samples_split': [2, 5],\n", + " 'min_samples_leaf': [1, 2],\n", + " 'max_features': ['sqrt', 'log2']\n", + "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Define hyperparameters to fine tune." + "- Run Grid Search" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 19, "metadata": {}, + "outputs": [], "source": [ - "- Run Grid Search" + "rf = RandomForestClassifier(random_state=0)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 72 candidates, totalling 360 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\HP\\AppData\\Local\\Python\\pythoncore-3.14-64\\Lib\\site-packages\\joblib\\externals\\loky\\process_executor.py:782: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,\n",
+       "             param_grid={'max_depth': [None, 10, 20],\n",
+       "                         'max_features': ['sqrt', 'log2'],\n",
+       "                         'min_samples_leaf': [1, 2],\n",
+       "                         'min_samples_split': [2, 5],\n",
+       "                         'n_estimators': [100, 200, 300]},\n",
+       "             scoring='accuracy', verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,\n", + " param_grid={'max_depth': [None, 10, 20],\n", + " 'max_features': ['sqrt', 'log2'],\n", + " 'min_samples_leaf': [1, 2],\n", + " 'min_samples_split': [2, 5],\n", + " 'n_estimators': [100, 200, 300]},\n", + " scoring='accuracy', verbose=1)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_search = GridSearchCV(estimator= rf, param_grid= param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)\n", + "\n", + "grid_search.fit(X_train_scaled, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}\n", + "Best CV Score: 0.8024218055675009\n" + ] + } + ], + "source": [ + "print(\"Best Parameters:\", grid_search.best_params_)\n", + "print(\"Best CV Score:\", grid_search.best_score_)" + ] }, { "cell_type": "markdown", @@ -313,10 +3476,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Accuracy: 0.7874432677760969\n" + ] + } + ], + "source": [ + "best_rf = grid_search.best_estimator_\n", + "\n", + "pred_best = best_rf.predict(X_test_scaled)\n", + "\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "print(\"Test Accuracy:\", accuracy_score(y_test, pred_best))" + ] } ], "metadata": { @@ -335,7 +3514,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.14.2" } }, "nbformat": 4,