diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ad6f926 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +lab-hyper-tuning_trad.ipynb diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..a18d5b8 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -221,11 +221,55 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/1x/604g3jcd6s54pz1_9_q0yygr0000gn/T/ipykernel_59692/2818035475.py:12: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " clean_spaceship[\"Cabin\"] = clean_spaceship[\"Cabin\"].apply(apply_cabin_map)\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "\n", + "clean_spaceship = spaceship.dropna()\n", + "\n", + "clean_spaceship.isna().sum()\n", + "\n", + "def apply_cabin_map(value):\n", + " return value[0]\n", + "\n", + "clean_spaceship[\"Cabin\"] = clean_spaceship[\"Cabin\"].apply(apply_cabin_map)\n", + "\n", + "clean_spaceship = clean_spaceship.drop(columns=[\"PassengerId\", \"Name\"])\n", + "\n", + "clean_spaceship = pd.get_dummies(clean_spaceship, columns=[\"HomePlanet\", \"Cabin\", \"Destination\"])\n", + "\n", + "y = clean_spaceship[\"Transported\"]\n", + "X = clean_spaceship.drop(columns=[\"Transported\"])\n", + "\n", + "#your code here\n", + "\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.20, random_state=0\n", + ")\n", + "\n", + "scaler = MinMaxScaler()\n", + "scaler.fit(X_train)\n", + "\n", + "X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)\n", + "X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)" ] }, { @@ -237,11 +281,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "#your code here\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "rf = RandomForestClassifier(\n", + " n_estimators=100,\n", + " random_state=0\n", + ")\n", + "\n", + "rf.fit(X_train, y_train)\n", + "\n", + "pred = rf.predict(X_test)" ] }, { @@ -253,11 +309,25 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.7957639939485628" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#your code here" + "#your code here\n", + "\n", + "accuracy_score(y_test, pred)\n", + "rf.score(X_test, y_test)" ] }, { @@ -283,11 +353,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "#your code here\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "rf = RandomForestClassifier(random_state=0)\n", + "\n", + "param_grid = {\n", + " \"n_estimators\": [50, 100, 200, 500],\n", + " \"max_leaf_nodes\": [250, 500, 1000, None],\n", + " \"max_depth\": [10, 30, 50]\n", + "}\n" ] }, { @@ -299,10 +380,454 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0),\n",
+ " param_grid={'max_depth': [10, 30, 50],\n",
+ " 'max_leaf_nodes': [250, 500, 1000, None],\n",
+ " 'n_estimators': [50, 100, 200, 500]},\n",
+ " scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0),\n",
+ " param_grid={'max_depth': [10, 30, 50],\n",
+ " 'max_leaf_nodes': [250, 500, 1000, None],\n",
+ " 'n_estimators': [50, 100, 200, 500]},\n",
+ " scoring='accuracy')RandomForestClassifier(max_depth=10, n_estimators=500, random_state=0)
RandomForestClassifier(max_depth=10, n_estimators=500, random_state=0)