diff --git a/README.md b/README.md index 6e62e8a..4cb7f45 100644 --- a/README.md +++ b/README.md @@ -87,3 +87,4 @@ If the link shown is the same as the main Ironhack repository, you will need to +lab terminado! diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..206873d 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -221,11 +221,67 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "((6954, 11), (1739, 11))" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#your code here" + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.feature_selection import SelectKBest, f_classif\n", + "\n", + "# Target\n", + "y = spaceship[\"Transported\"].astype(int)\n", + "\n", + "# Features (quitamos columnas típicamente inútiles / ID)\n", + "X = spaceship.drop(columns=[\"Transported\"])\n", + "for col in [\"PassengerId\", \"Name\"]:\n", + " if col in X.columns:\n", + " X = X.drop(columns=[col])\n", + "\n", + "# Identificar columnas numéricas y categóricas\n", + "num_cols = X.select_dtypes(include=[\"number\"]).columns.tolist()\n", + "cat_cols = X.select_dtypes(exclude=[\"number\"]).columns.tolist()\n", + "\n", + "# Pipelines de preprocesamiento\n", + "numeric_transformer = Pipeline(steps=[\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", StandardScaler())\n", + "])\n", + "\n", + "categorical_transformer = Pipeline(steps=[\n", + " (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n", + " (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\"))\n", + "])\n", + "\n", + "preprocess = ColumnTransformer(\n", + " transformers=[\n", + " (\"num\", numeric_transformer, num_cols),\n", + " (\"cat\", categorical_transformer, cat_cols)\n", + " ]\n", + ")\n", + "\n", + "# Feature selection (después del preproceso)\n", + "selector = SelectKBest(score_func=f_classif, k=50)\n", + "\n", + "# Train/Test split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42, stratify=y\n", + ")\n", + "\n", + "X_train.shape, X_test.shape" ] }, { @@ -237,11 +293,1632 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocess',\n",
+ " ColumnTransformer(transformers=[('num',\n",
+ " Pipeline(steps=[('imputer',\n",
+ " SimpleImputer(strategy='median')),\n",
+ " ('scaler',\n",
+ " StandardScaler())]),\n",
+ " ['Age', 'RoomService',\n",
+ " 'FoodCourt', 'ShoppingMall',\n",
+ " 'Spa', 'VRDeck']),\n",
+ " ('cat',\n",
+ " Pipeline(steps=[('imputer',\n",
+ " SimpleImputer(strategy='most_frequent')),\n",
+ " ('onehot',\n",
+ " OneHotEncoder(handle_unknown='ignore'))]),\n",
+ " ['HomePlanet', 'CryoSleep',\n",
+ " 'Cabin', 'Destination',\n",
+ " 'VIP'])])),\n",
+ " ('select', SelectKBest(k=50)),\n",
+ " ('model', LogisticRegression(max_iter=1000))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
| \n", + " | \n",
+ " \n",
+ " copy\n",
+ " copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_mean\n",
+ " with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_std\n",
+ " with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).\n", + " \n", + " | \n",
+ " True | \n", + "
['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
| \n", + " | \n",
+ " \n",
+ " score_func\n",
+ " score_func: callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores. Default is f_classif (see below \"See Also\"). The default function only works with classification tasks. .. versionadded:: 0.18\n", + " \n", + " | \n",
+ " <function f_c...001C824C55430> | \n", + "
| \n", + " | \n",
+ " \n",
+ " k\n",
+ " k: int or \"all\", default=10 Number of top features to select. The \"all\" option bypasses selection, for use in a parameter search.\n", + " \n", + " | \n",
+ " 50 | \n", + "