From 312bda7fe639acbb32178e8c5a339bb8caaf3566 Mon Sep 17 00:00:00 2001 From: soy juan Date: Thu, 26 Feb 2026 17:43:11 +0100 Subject: [PATCH] lab hyper terminado --- README.md | 1 + lab-hyper-tuning.ipynb | 1809 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 1792 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 6e62e8a..4cb7f45 100644 --- a/README.md +++ b/README.md @@ -87,3 +87,4 @@ If the link shown is the same as the main Ironhack repository, you will need to +lab terminado! diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..206873d 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -221,11 +221,67 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "((6954, 11), (1739, 11))" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#your code here" + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.feature_selection import SelectKBest, f_classif\n", + "\n", + "# Target\n", + "y = spaceship[\"Transported\"].astype(int)\n", + "\n", + "# Features (quitamos columnas típicamente inútiles / ID)\n", + "X = spaceship.drop(columns=[\"Transported\"])\n", + "for col in [\"PassengerId\", \"Name\"]:\n", + " if col in X.columns:\n", + " X = X.drop(columns=[col])\n", + "\n", + "# Identificar columnas numéricas y categóricas\n", + "num_cols = X.select_dtypes(include=[\"number\"]).columns.tolist()\n", + "cat_cols = X.select_dtypes(exclude=[\"number\"]).columns.tolist()\n", + "\n", + "# Pipelines de preprocesamiento\n", + "numeric_transformer = Pipeline(steps=[\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", StandardScaler())\n", + "])\n", + "\n", + "categorical_transformer = Pipeline(steps=[\n", + " (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n", + " (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\"))\n", + "])\n", + "\n", + "preprocess = ColumnTransformer(\n", + " transformers=[\n", + " (\"num\", numeric_transformer, num_cols),\n", + " (\"cat\", categorical_transformer, cat_cols)\n", + " ]\n", + ")\n", + "\n", + "# Feature selection (después del preproceso)\n", + "selector = SelectKBest(score_func=f_classif, k=50)\n", + "\n", + "# Train/Test split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42, stratify=y\n", + ")\n", + "\n", + "X_train.shape, X_test.shape" ] }, { @@ -237,11 +293,1632 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocess',\n",
+       "                 ColumnTransformer(transformers=[('num',\n",
+       "                                                  Pipeline(steps=[('imputer',\n",
+       "                                                                   SimpleImputer(strategy='median')),\n",
+       "                                                                  ('scaler',\n",
+       "                                                                   StandardScaler())]),\n",
+       "                                                  ['Age', 'RoomService',\n",
+       "                                                   'FoodCourt', 'ShoppingMall',\n",
+       "                                                   'Spa', 'VRDeck']),\n",
+       "                                                 ('cat',\n",
+       "                                                  Pipeline(steps=[('imputer',\n",
+       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
+       "                                                                  ('onehot',\n",
+       "                                                                   OneHotEncoder(handle_unknown='ignore'))]),\n",
+       "                                                  ['HomePlanet', 'CryoSleep',\n",
+       "                                                   'Cabin', 'Destination',\n",
+       "                                                   'VIP'])])),\n",
+       "                ('select', SelectKBest(k=50)),\n",
+       "                ('model', LogisticRegression(max_iter=1000))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('preprocess',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='median')),\n", + " ('scaler',\n", + " StandardScaler())]),\n", + " ['Age', 'RoomService',\n", + " 'FoodCourt', 'ShoppingMall',\n", + " 'Spa', 'VRDeck']),\n", + " ('cat',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='most_frequent')),\n", + " ('onehot',\n", + " OneHotEncoder(handle_unknown='ignore'))]),\n", + " ['HomePlanet', 'CryoSleep',\n", + " 'Cabin', 'Destination',\n", + " 'VIP'])])),\n", + " ('select', SelectKBest(k=50)),\n", + " ('model', LogisticRegression(max_iter=1000))])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#your code here" + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "log_reg = LogisticRegression(max_iter=1000)\n", + "\n", + "baseline_pipe = Pipeline(steps=[\n", + " (\"preprocess\", preprocess),\n", + " (\"select\", selector),\n", + " (\"model\", log_reg)\n", + "])\n", + "\n", + "baseline_pipe.fit(X_train, y_train)" ] }, { @@ -253,11 +1930,40 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy (baseline): 0.7791834387579069\n", + "\n", + "Confusion Matrix:\n", + " [[686 177]\n", + " [207 669]]\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.77 0.79 0.78 863\n", + " 1 0.79 0.76 0.78 876\n", + "\n", + " accuracy 0.78 1739\n", + " macro avg 0.78 0.78 0.78 1739\n", + "weighted avg 0.78 0.78 0.78 1739\n", + "\n" + ] + } + ], "source": [ - "#your code here" + "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", + "\n", + "y_pred = baseline_pipe.predict(X_test)\n", + "\n", + "print(\"Accuracy (baseline):\", accuracy_score(y_test, y_pred))\n", + "print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_test, y_pred))\n", + "print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))" ] }, { @@ -283,11 +1989,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "param_grid = {\n", + " \"model__C\": [0.1, 1, 10]\n", + "}\n", + "\n", + "grid = GridSearchCV(\n", + " estimator=baseline_pipe,\n", + " param_grid=param_grid,\n", + " scoring=\"accuracy\",\n", + " cv=3,\n", + " n_jobs=1,\n", + " verbose=2\n", + ")" ] }, { @@ -299,10 +2018,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 3 folds for each of 3 candidates, totalling 9 fits\n", + "[CV] END .......................................model__C=0.1; total time= 0.0s\n", + "[CV] END .......................................model__C=0.1; total time= 0.0s\n", + "[CV] END .......................................model__C=0.1; total time= 0.0s\n", + "[CV] END .........................................model__C=1; total time= 0.0s\n", + "[CV] END .........................................model__C=1; total time= 0.0s\n", + "[CV] END .........................................model__C=1; total time= 0.0s\n", + "[CV] END ........................................model__C=10; total time= 0.0s\n", + "[CV] END ........................................model__C=10; total time= 0.0s\n", + "[CV] END ........................................model__C=10; total time= 0.0s\n", + "Best CV accuracy: 0.7791199309749784\n", + "Best params: {'model__C': 10}\n" + ] + } + ], + "source": [ + "grid.fit(X_train, y_train)\n", + "\n", + "print(\"Best CV accuracy:\", grid.best_score_)\n", + "print(\"Best params:\", grid.best_params_)" + ] }, { "cell_type": "markdown", @@ -313,10 +2056,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy (tuned): 0.7786083956296722\n", + "\n", + "Confusion Matrix:\n", + " [[686 177]\n", + " [208 668]]\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.77 0.79 0.78 863\n", + " 1 0.79 0.76 0.78 876\n", + "\n", + " accuracy 0.78 1739\n", + " macro avg 0.78 0.78 0.78 1739\n", + "weighted avg 0.78 0.78 0.78 1739\n", + "\n" + ] + } + ], + "source": [ + "best_model = grid.best_estimator_\n", + "y_pred_best = best_model.predict(X_test)\n", + "\n", + "print(\"Accuracy (tuned):\", accuracy_score(y_test, y_pred_best))\n", + "print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_test, y_pred_best))\n", + "print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred_best))" + ] } ], "metadata": { @@ -335,7 +2108,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.14.0" } }, "nbformat": 4,