diff --git a/notebook/PROJECT2_main1.ipynb b/notebook/PROJECT2_main1.ipynb
new file mode 100644
index 0000000..d775cb1
--- /dev/null
+++ b/notebook/PROJECT2_main1.ipynb
@@ -0,0 +1,1623 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de9edac0-fcff-4134-913b-5c0cb3a81646",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "from scipy.stats import chi2_contingency, norm, ttest_ind\n",
+    "df4 = pd.read_csv(\"df_final_demo.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b15dbbc-e036-47d4-a650-2bdd1fda6f0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df3 = pd.read_csv(\"df_final_experiment_clients.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b4f88831-c73a-457b-b38d-526deb99eb1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.read_csv(\"df_final_web_data_pt_1.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ae03da9-ad70-4bc5-b1b7-8578a6ee40a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = pd.read_csv(\"df_final_web_data_pt_2.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "42167cc8-18da-46d6-8a62-0b29f654082d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4637e1ee-6841-431e-ac99-49c6a1557d6e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2404ff4f-d2a1-4d73-8850-6fb997416ee1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs = pd.concat([df1, df2], ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f583b072-14bf-4a68-b0fd-8e129194473b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Attach the experiment arm (df3) and the demographics (df4) to every web event.\n",
+    "dfmergeexp = dfs.merge(df3, on='client_id', how='left')\n",
+    "\n",
+    "# Build the export frame in a single chain so that no column is assigned onto a\n",
+    "# boolean-filtered slice (that pattern can raise SettingWithCopyWarning).\n",
+    "dfmerged = (\n",
+    "    dfmergeexp.merge(df4, on='client_id', how='left')\n",
+    "    .drop_duplicates()\n",
+    "    # keep only rows whose client belongs to one of the two experiment arms\n",
+    "    .loc[lambda d: d['Variation'].isin(['Control', 'Test'])]\n",
+    "    # source_group is a plain copy of Variation\n",
+    "    .assign(source_group=lambda d: d['Variation'])\n",
+    ")\n",
+    "\n",
+    "dfmerged.to_csv('ab_test_data_with_source.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01ac17a1-d2fe-4d56-8b1a-376ac810be70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['client_id'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "427a2282-9d0f-4a2a-b771-e0aaaf4dea2e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.isnull()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3aef449c-5efe-4f22-8387-5217ec21c6d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f946b9e-e8ae-472a-ad3e-695e63545151",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd184618-2e58-445b-a677-eb761f086c62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#checking what the type of date_time\n",
+    "dfs.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87ec222b-8b6e-44c1-8bdd-2944199a53a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#convert object into date_time\n",
+    "dfs['date_time'] = pd.to_datetime(dfs['date_time'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d285f1e-baac-4ffa-9f3e-1ba08f3423ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2349d923-4c1d-44b0-b222-091385812b30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df3.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "448d5890-b61e-473c-8403-13273d164f60",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df4.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec9f6c03-7ed3-467b-8d75-094114d8838f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55e51cdf-678d-469d-a6fc-746416546c2a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#df_all.head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cf44703-273b-4a59-a0e1-4df661138326",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd9f177f-89f8-410a-a535-daddffb87e32",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#dfs.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'], ascending=[True, True, True, True])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3e667ca-63a0-4a4e-87ed-a83ae5c118f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b05e56f0-fd06-4dc4-a47e-08cfc1d0aa81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#dfsorted[dfsorted['client_id'] == 442857]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6b253c6-7371-414a-8ee7-59eede52b03b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['client_id'].value_counts().head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "867532d4-592d-425b-a442-386b325bc845",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['date_time'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de7d7eb0-36d1-4b51-b027-5db80356cec3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['visitor_id'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c83e80d-ef18-4f85-9bff-3b82208c121d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['date_time'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19ae077e-9b9c-477d-a267-1d9cb15fc815",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['visit_id'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81f2e88d-79a1-4a70-ab61-acd0fd663704",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['visitor_id'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aca325aa-d0cb-437e-bb4c-64eada0876ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['process_step'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d79aa0f-4f36-4aba-b730-b18f0a16028d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "30d4f963-6a2a-4659-9a7b-68d23e59b445",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df4.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "898410aa-94c8-4747-81b1-e11afd574f8e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df3.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c35f3128-58a1-4557-8cfd-e70f054e33c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#df_test.isnull()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87bc11a1-1791-4a93-9913-31d84c7a6875",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['process_step'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72dcc7fb-1ff0-49ea-98b2-c3c04a08d1af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Left-merge the web events (dfs) with the experiment table (df3) on client_id,\n",
+    "# which adds df3's columns (including Variation) to every event row.\n",
+    "df_mergeexp = dfs.merge(df3, on=\"client_id\", how=\"left\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f2548d2-68c6-40fb-99f7-708369d15623",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#merge with demo(df4) (demographic data)\n",
+    "df_merged = df_mergeexp.merge(df4, on=\"client_id\", how=\"left\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c7709e3-5aea-4876-8e04-6e6842659f12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52d75c86-aa2d-4455-a876-f6089b4df31f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#remove duplicates\n",
+    "df_merged = df_merged.drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3a43aef8-5752-41de-b0b3-d6439cb714d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#remove rows without any variation\n",
+    "df_merged = df_merged[df_merged[\"Variation\"].isin([\"Control\", \"Test\"])]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9632b15a-5c30-4426-942e-134e2a924893",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74067acd-53d4-4b5a-bacb-83f74d45e04f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged[\"process_step\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ac2ee71-4c6a-47b0-beb7-adc4b32d0725",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged[\"Variation\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e3336830-2021-4163-ba50-4a93a2ca1299",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the merged rows into two independent copies: one for Control, one for Test.\n",
+    "df_control = df_merged[df_merged[\"Variation\"] == \"Control\"].copy()\n",
+    "df_test = df_merged[df_merged[\"Variation\"] == \"Test\"].copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48b1d00b-87cd-4115-a1e1-dfab54281ab3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_control.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a87776b-7368-45dd-baeb-5a89daf30b19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_test.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0cdc8198-e0ab-4fc8-8eb0-3b96318f1710",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged['num_accts'].nunique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "162a3f57-229f-4cbf-848a-e4704a2cdf52",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>Who are the primary clients using this online process?</b></span> "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fed97a77-c349-420e-a073-8cdd689514c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count the distinct visits made by each client.\n",
+    "usage = (\n",
+    "    df_merged.groupby(\"client_id\")[\"visit_id\"]\n",
+    "    .nunique()\n",
+    "    .reset_index(name=\"n_visits\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed106950-abca-4aa9-afb0-f758055eaa47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# primary = 1 for clients at or above the 75th percentile of visit counts, else 0.\n",
+    "cutoff = usage[\"n_visits\"].quantile(0.75)\n",
+    "usage[\"primary\"] = usage[\"n_visits\"].ge(cutoff).astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5b77ee6-dd0f-4256-b044-6688bc348160",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c04385dd-5d27-442c-ba4c-63c1e2808d1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Attach n_visits / primary from usage to every row of the same client.\n",
+    "# Columns left behind by an earlier run of this cell are dropped first, so running the\n",
+    "# cell again does not create n_visits_x / n_visits_y and primary_x / primary_y.\n",
+    "df_merged = usage.merge(\n",
+    "    df_merged.drop(columns=[\"n_visits\", \"primary\"], errors=\"ignore\"),\n",
+    "    on=\"client_id\",\n",
+    "    how=\"left\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a46f9ad-2767-4c80-8e18-bf36f0afb441",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.groupby(\"primary\")[[\"clnt_age\",\"clnt_tenure_yr\",\"clnt_tenure_mnth\",\"logons_6_mnth\"]].mean()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5e92ebf-05af-411c-a0ed-0785e70a8e0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sub = df_merged[df_merged[\"primary\"] == 1]\n",
+    "print(sub[[\"clnt_age\",\"clnt_tenure_yr\",\"bal\",\"gendr\"]].describe(include=\"all\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9948430a-27f5-467a-b44f-ccc9968944a0",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>The average age is about 51.8, and half of the primary clients are between 39 and 63. They are middle-aged rather than very young customers, which means primary clients are older than non-primary clients.\n",
+    "Primary clients are also more long-standing.\n",
+    "The most frequent gender is “M”.</b></span> "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95002485-6f20-4f4a-a763-d037b9faa90f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.groupby(\"primary\")[['calls_6_mnth', 'logons_6_mnth']].mean()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ac842fc-b333-4bb9-b804-01f9fc8f0c72",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>They also make more calls and more logons in 6 months, so they are more active on all channels, not only online.</b></span>    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a4f3984-9698-45f1-9d0e-299d234bdccf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_merged.rename(columns={'Variation': 'variation'})\n",
+    "df3 = df3.rename(columns={'Variation': 'variation'})\n",
+    "df_control = df_control.rename(columns={'Variation': 'variation'})\n",
+    "df_test = df_test.rename(columns={'Variation': 'variation'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "73f27b9e-3a8c-4ad7-a756-6838314f5b34",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5da498ad-e434-4dbc-af5f-84b4a322b09d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_control.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "110917b6-fa20-47d1-9704-9263dafd5312",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df3.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90d7a353-0bd9-422f-b3e4-98d8d709c90d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var = dfs.merge(df3[['client_id', 'variation']], on='client_id', how='left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b4edff36-d806-4048-9b85-aa83de863e3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f86541c-bc8d-4ea1-a7b5-d823188ed736",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "step_order = {'start': 0, 'step_1' : 1, 'step_2' : 2, 'step_3' : 3, 'confirm' : 4}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eed944a6-5fce-4e95-ad1e-8f384f15addb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28162307-e0cf-4d5d-b662-58f03c6a67e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var['step_num'] = dfs_var['process_step'].map(step_order)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c71f9b3d-9581-40fc-986d-bf30da3e1882",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "######################################################\n",
+    "#COMPLETION RATE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0699b897-2d70-4a36-a2fa-de92681667c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "last_step = dfs_var['step_num'].max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1451a1a2-67f8-4522-b352-6a72513b7a3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Highest step_num recorded in each (variation, visit_id) pair, stored as max_step.\n",
+    "visitcomp = (dfs_var.groupby(['variation', 'visit_id'])['step_num'].max().reset_index(name='max_step'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "decd9c0a-810e-4e3d-a3da-4aa31dd2c58c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# completed = 1 when the visit's highest step equals last_step, otherwise 0.\n",
+    "visitcomp[\"completed\"] = (visitcomp[\"max_step\"] == last_step).astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "323d47af-6a05-4a10-bc42-3886c7c88ca4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "completionrate = (visitcomp.groupby('variation')['completed'].mean().reset_index())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c501100b-d67e-4774-9e73-db3350874cf7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "completionrate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f83ff5d-1898-479b-8fcd-e69478d05b96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# completionrate is the mean of the completed flag per variation, so label it as a\n",
+    "# completion rate (it is not a step duration).\n",
+    "print('Completion rate per variation:')\n",
+    "print(completionrate)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb188b90-2c49-4edf-8975-8c7c2f4d56f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "############################################################################################\n",
+    "#TIME SPENT ON EACH STEP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37da593d-4aa1-466c-9a4b-f3524bfdb0b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c09e14a-51ab-4c1d-bf97-2d348c4aded6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Order the events of each visit chronologically; step_num only breaks timestamp ties.\n",
+    "# Sorting by step_num first would place events out of time order whenever a user goes\n",
+    "# back to an earlier step, and the next-event durations derived below would turn negative.\n",
+    "dfs_var = dfs_var.sort_values(['variation', 'visit_id', 'date_time', 'step_num'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f4a43083-3154-46ca-823f-10e50602565c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Timestamp of the following event in the same visit (NaT on a visit's last event).\n",
+    "# shift has to be called with -1: assigning the bound method itself computes nothing\n",
+    "# and leaves every step duration empty.\n",
+    "dfs_var['next_time'] = dfs_var.groupby(['variation', 'visit_id'])['date_time'].shift(-1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0b2f7798-0953-4991-9634-1984749b0703",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var[\"next_time\"] = pd.to_datetime(dfs_var[\"next_time\"], errors=\"coerce\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57d91d74-ae22-46fc-a195-ad7e1b777296",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds() #duration in seconds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec88f0a4-fcb2-4422-a325-52661e23016f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "step_time = (dfs_var.dropna(subset=['step_durationsec']).groupby(['variation', 'process_step'])['step_durationsec'].mean().reset_index())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d3178785-bcb9-4034-a4e1-1007ea34d3fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "step_time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd0f8815-ef1b-4311-b4d6-645d3ed1edeb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "##################################################################\n",
+    "#CHECKING RATING OF ERRORS PER EACH STEP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6f6a958-a82a-4386-9e1f-ab5d57397e67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# error_flag = 1 when an event lands on an earlier step than the event just before it\n",
+    "# in the same visit (the user went backwards), otherwise 0.\n",
+    "# A copy of step_num cannot serve as the flag: its mean per process_step is the step\n",
+    "# index itself, which is the same in Control and Test by construction.\n",
+    "# NOTE(review): confirm that backward navigation is the intended definition of an error.\n",
+    "chronological = dfs_var.sort_values(['variation', 'visit_id', 'date_time', 'step_num'])\n",
+    "previous_step = chronological.groupby(['variation', 'visit_id'])['step_num'].shift(1)\n",
+    "# The assignment aligns on the index, so the current row order of dfs_var does not matter.\n",
+    "# The first event of a visit has no previous step; the comparison is False there -> 0.\n",
+    "dfs_var['error_flag'] = (chronological['step_num'] < previous_step).astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e5c5a7c-9498-4b17-8941-8fef7839cbf1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "error_rates = (dfs_var.groupby(['variation', 'process_step'])['error_flag'].mean().reset_index().rename(columns={'error_flag' : 'error_rate'}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e39a5d1-4c37-4226-9728-0e514dbc9806",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "error_rates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43f50454-c86a-4040-b141-5a81fb366ea7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# max error_flag per visit - error = 1\n",
+    "# Largest error_flag seen in each visit (meant as a 0/1 \"visit had an error\" marker).\n",
+    "visit_error = dfs_var.groupby([\"variation\", \"visit_id\"], as_index=False)[\"error_flag\"].max()\n",
+    "\n",
+    "# Per variation: sum of the per-visit flags (n_error) and number of visits (n_total).\n",
+    "err_summary = visit_error.groupby(\"variation\", as_index=False).agg(\n",
+    "    n_error=(\"error_flag\", \"sum\"),\n",
+    "    n_total=(\"error_flag\", \"count\"),\n",
+    ")\n",
+    "\n",
+    "print(err_summary)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8bf8ae7d-ab0e-4057-bccf-b379f2a32601",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>COMPLETION RATE</b> - Comparing the percentage of visits that reach the final step in Control vs. Test: a higher completion percentage in the Test group means the new version is more effective at getting users through all the steps.</span>   \n",
+    "\n",
+    "<span style=\"font-size:18px;\"><b>TIME SPENT</b> - The Test version lets users complete the steps faster.</span>   \n",
+    "\n",
+    "<span style=\"font-size:18px;\"><b>ERROR RATES</b> - The Test version neither reduced nor increased the frequency of errors in any step. The two versions perform identically in terms of the number of errors.</span>    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a07db602-47f1-4c8c-8785-76c322d070db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from scipy.stats import norm\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d6944b6-7979-4f60-b7e5-8886c4dc15ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summary = (visitcomp.groupby(\"variation\")[\"completed\"].agg(n_complete=\"sum\", n_total=\"count\").reset_index())\n",
+    "\n",
+    "print(summary)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "086e2bee-9e59-4756-a45f-55c2a6c3fa00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Order control test\n",
+    "# Index by variation and fix the row order to Control, Test, so that x1 / n1 belong to\n",
+    "# Control and x2 / n2 to Test.\n",
+    "# NOTE(review): summary is overwritten with the indexed frame, so running this cell a\n",
+    "# second time without re-creating summary will not find a 'variation' column.\n",
+    "summary = summary.set_index('variation').loc[['Control', 'Test']]\n",
+    "x1, x2 = summary['n_complete'].values\n",
+    "n1, n2 = summary['n_total'].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef1baf41-13b1-4f5f-bdb4-69d71ea93160",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#propoortions\n",
+    "p1 = x1 / n1\n",
+    "p2 = x2 / n2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d8ace24-1dcc-4d8c-9ccf-da4fa3aff7b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#h0\n",
+    "p_pool = (x1 + x2) / (n1 + n2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48060fa7-6f84-4685-98c0-9f818b234cbe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#test of 2 proportions \n",
+    "# Two-proportion z-test: standard error from the pooled proportion, then the\n",
+    "# standardized difference p2 - p1.\n",
+    "se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))\n",
+    "z = (p2 - p1) / se"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15c0edac-b6cd-41ec-b36f-ddeab6282675",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Two-sided p-value. norm.sf(x) equals 1 - norm.cdf(x) but keeps precision when |z| is\n",
+    "# large, where the subtraction would round to exactly 0.\n",
+    "p_value = 2 * norm.sf(abs(z))\n",
+    "# '.4f' prints four decimals; a bare '4f' is a field width of 4 with six decimals.\n",
+    "print(f'Completion of rate Control: {p1:.4f}')\n",
+    "print(f'Completion of rate Test: {p2:.4f}')\n",
+    "print(f'z-statistic: {z:.4f}')\n",
+    "print(f'p-value    : {p_value:.6f}')\n",
+    "alpha = 0.05\n",
+    "if p_value < alpha:\n",
+    "    print('Statistical diference highly significative (alpha=0.05)')\n",
+    "else:\n",
+    "    print('Statistical diference not significative (alpha=0.05)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49417603-53bb-45a2-a9e1-f55fe5c2999c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "056a80de-36d0-426f-8cdd-8a0cb74582a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(dfs_var.columns)\n",
+    "\n",
+    "print(dfs_var[\"step_durationsec\"].notna().sum())\n",
+    "\n",
+    "df_dur = dfs_var[dfs_var[\"step_durationsec\"].notna()]\n",
+    "print(df_dur.shape)\n",
+    "print(df_dur[\"process_step\"].unique())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7ca550d4-c1d0-4b3f-a727-3c9c2fef399d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(dfs_var[\"step_durationsec\"].head())\n",
+    "print(dfs_var[\"step_durationsec\"].isna().sum())\n",
+    "print(len(dfs_var))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8267354a-556e-4984-8024-672be4c03b72",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>PREPARING MEAN TIME FOR EACH STEP IN CONTROL VS TEST</b></span>    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c36ce9c0-db04-4d3f-98ff-aba36672d3cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var[\"date_time\"] = pd.to_datetime(dfs_var[\"date_time\"])\n",
+    "\n",
+    "# Order the events of each visit chronologically (step_num only breaks timestamp ties),\n",
+    "# so that the next row really is the next event. Ordering by step_num first pairs events\n",
+    "# out of time order whenever a user moves back a step, which yields negative durations.\n",
+    "dfs_var = dfs_var.sort_values([\"variation\", \"visit_id\", \"date_time\", \"step_num\"])\n",
+    "\n",
+    "# Timestamp of the following event within the same visit (NaT on the visit's last event).\n",
+    "dfs_var[\"next_time\"] = (\n",
+    "    dfs_var.groupby([\"variation\", \"visit_id\"])[\"date_time\"].shift(-1))\n",
+    "# Seconds between this event and the next one of the visit.\n",
+    "dfs_var[\"step_durationsec\"] = (dfs_var[\"next_time\"] - dfs_var[\"date_time\"]).dt.total_seconds()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa579f03-f983-4ef4-8df6-85c818d34dc6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only the rows that have a duration, then average it per variation and step.\n",
+    "df_dur = dfs_var.dropna(subset=[\"step_durationsec\"])\n",
+    "\n",
+    "mean_step_time = df_dur.groupby(\n",
+    "    [\"variation\", \"process_step\"], as_index=False\n",
+    ")[\"step_durationsec\"].mean()\n",
+    "\n",
+    "print(mean_step_time)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13628402-e804-4082-8491-5558b59d76a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b795cd06-2379-4c84-bec4-c3e7a562cc62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Largest error_flag per (variation, visit_id, client_id), stored as has_error.\n",
+    "visit_error = (dfs_var.groupby([\"variation\", \"visit_id\", \"client_id\"])[\"error_flag\"].max().reset_index(name=\"has_error\"))\n",
+    "\n",
+    "# Add the client's age from df4 and drop the visits whose client has no age recorded.\n",
+    "visits_with_age = visit_error.merge(df4[[\"client_id\", \"clnt_age\"]], on=\"client_id\", how=\"left\").dropna(subset=[\"clnt_age\"])\n",
+    "\n",
+    "# Correlation coefficient between age and has_error, computed separately per variation.\n",
+    "for var in [\"Control\", \"Test\"]:\n",
+    "    sub = visits_with_age[visits_with_age[\"variation\"] == var]\n",
+    "    corr = np.corrcoef(sub[\"clnt_age\"], sub[\"has_error\"])[0, 1]\n",
+    "    print(f\"Correlation age–error ({var}): {corr:.4f}  (n={len(sub)})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b51e9ea-cf23-4b7f-9af8-dbf7a7e9405c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Derive the per-variation correlations from visits_with_age instead of hard-coding\n",
+    "# figures copied from an earlier run, so the chart always matches the data.\n",
+    "correlations = {}\n",
+    "for var in ['Control', 'Test']:\n",
+    "    sub = visits_with_age[visits_with_age['variation'] == var]\n",
+    "    correlations[var] = np.corrcoef(sub['clnt_age'], sub['has_error'])[0, 1]\n",
+    "\n",
+    "# The bars compare strength only, so the sign is dropped (and the axis says so).\n",
+    "abs_corr = [abs(value) for value in correlations.values()]\n",
+    "\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "\n",
+    "x = np.arange(len(correlations))\n",
+    "bars = plt.bar(x, abs_corr, color=['steelblue', 'darkorange'], alpha=0.8, edgecolor='black', linewidth=2)\n",
+    "\n",
+    "plt.ylabel('Absolute correlation', fontsize=12, fontweight='bold')\n",
+    "plt.title('Age-Error Relationship\\n(Lower correlation)', fontsize=14, fontweight='bold')\n",
+    "\n",
+    "plt.xticks(x, correlations.keys())\n",
+    "# Keep the 0-0.1 range unless a bar would not fit inside it.\n",
+    "plt.ylim(0, max(0.1, 1.2 * max(abs_corr)))\n",
+    "\n",
+    "# Print each value just above its bar.\n",
+    "for bar, value in zip(bars, abs_corr):\n",
+    "    plt.text(bar.get_x() + bar.get_width()/2, value + 0.002,\n",
+    "             f'{value:.4f}', ha='center', fontweight='bold', fontsize=12)\n",
+    "\n",
+    "plt.grid(axis='y', alpha=0.3)\n",
+    "sns.despine()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c2d14a8e-189a-48dc-93f4-435bda0e44a3",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>There is no relationship where each additional year of age systematically increases or decreases error rate.</b></span>                           \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce2b60be-7678-4bf8-a49e-60d234c1d585",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Age bands. pd.cut closes bins on the right by default: (0, 40], (40, 60], (60, 120].\n",
+    "bins = [0, 40, 60, 120]\n",
+    "labels = [\"<40\", \"40-60\", \">60\"]\n",
+    "visits_with_age[\"age_group\"] = pd.cut(visits_with_age[\"clnt_age\"], bins=bins, labels=labels)\n",
+    "\n",
+    "# Chi-square test of independence between age band and has_error, run per variation.\n",
+    "# chi2_contingency comes from the notebook's import cell rather than being re-imported\n",
+    "# on every loop iteration.\n",
+    "for var in [\"Control\", \"Test\"]:\n",
+    "    sub = visits_with_age[visits_with_age[\"variation\"] == var]\n",
+    "    tab = pd.crosstab(sub[\"age_group\"], sub[\"has_error\"])\n",
+    "    print(var, \"\\n\", tab, \"\\n\")\n",
+    "\n",
+    "    chi2, p, dof, exp = chi2_contingency(tab)\n",
+    "    print(f\"{var}: chi2 = {chi2:.2f}, p-value = {p:.6f}\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4887d163-56c7-4b06-94f0-3d6d4082814a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Recompute each variation's chi-square statistic from its age_group x has_error\n",
+    "# crosstab instead of hard-coding numbers printed by an earlier run.\n",
+    "chi2_values = []\n",
+    "for var in ['Control', 'Test']:\n",
+    "    sub = visits_with_age[visits_with_age['variation'] == var]\n",
+    "    tab = pd.crosstab(sub['age_group'], sub['has_error'])\n",
+    "    chi2_values.append(chi2_contingency(tab)[0])  # element 0 is the test statistic\n",
+    "\n",
+    "# NOTE(review): ax2 is created here but nothing is drawn on it in this cell.\n",
+    "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))\n",
+    "\n",
+    "# Bar chart of the chi-square statistics, Control vs Test\n",
+    "ax1.bar(['Control', 'Test'], chi2_values, color=['steelblue', 'darkorange'], alpha=0.8)\n",
+    "ax1.set_ylabel('Chi² Statistic')\n",
+    "ax1.set_title('Chi² Age-Error Distribution\\n(Higher = more age influence)')\n",
+    "# Scale the gap between bar and label with the bar height instead of a fixed +10.\n",
+    "label_offset = 0.01 * max(chi2_values)\n",
+    "for i, v in enumerate(chi2_values):\n",
+    "    ax1.text(i, v + label_offset, f'{v:.0f}', ha='center', fontweight='bold')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d633dd69-f1df-4e81-aeab-31e224ce1a94",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>The chi-square test is significant: the relationship between age and errors is non-linear, and younger and older users likely commit different types or frequencies of errors.</b></span>                             "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "987b45ee-7df6-4446-9125-180619476b1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Re-attach the experiment arm (df3) and the demographics (df4) to the web events.\n",
+    "dfmergeexp = dfs.merge(df3, left_on='client_id', right_on='client_id', how='left')\n",
+    "dfmerged = dfmergeexp.merge(df4, left_on='client_id', right_on='client_id', how='left')\n",
+    "\n",
+    "# dfmerged repeats a client once per web event, while calls_6_mnth comes from the client\n",
+    "# table df4 (presumably one value per client - verify). Keep a single row per client so\n",
+    "# the group sizes are clients and the t-test is not inflated by repeated event rows.\n",
+    "dfmerged_calls = (\n",
+    "    dfmerged[dfmerged['variation'].isin(['Control', 'Test'])]\n",
+    "    .drop_duplicates(subset='client_id')\n",
+    "    .copy()\n",
+    ")\n",
+    "\n",
+    "print(\"Clients per group:\")\n",
+    "print(dfmerged_calls['variation'].value_counts())\n",
+    "\n",
+    "summary = dfmerged_calls.groupby('variation')['calls_6_mnth'].agg(['mean', 'median', 'count', 'std']).round(2)\n",
+    "print(\"\\nCalls6mnth Control vs Test:\")\n",
+    "print(summary)\n",
+    "\n",
+    "# Independent two-sample t-test on calls_6_mnth, missing values removed.\n",
+    "control_c = dfmerged_calls[dfmerged_calls['variation']=='Control']['calls_6_mnth'].dropna()\n",
+    "test_c = dfmerged_calls[dfmerged_calls['variation']=='Test']['calls_6_mnth'].dropna()\n",
+    "t_stat, p_val = ttest_ind(control_c, test_c)\n",
+    "print(f\"\\nT-test: t={t_stat:.2f}, p-value={p_val:.6f}\")\n",
+    "\n",
+    "# The p-value carries no direction, so check which mean is lower before announcing\n",
+    "# that Test has fewer calls.\n",
+    "test_has_fewer = test_c.mean() < control_c.mean()\n",
+    "if test_has_fewer and p_val < 0.05:\n",
+    "    print(\"*** TEST HAS FEWER CALLS (significant)! ***\")\n",
+    "elif test_has_fewer:\n",
+    "    print(\"*** Test has fewer calls (not significant) ***\")\n",
+    "else:\n",
+    "    print(\"*** Control has fewer calls ***\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0eb1ca04-ce60-44b7-bb86-05c81f76622c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(8, 6))\n",
+    "\n",
+    "# Mean calls per variation; error bars are the standard error (std / sqrt(count)) taken from `summary`.\n",
+    "x = ['Control', 'Test']\n",
+    "means = [summary.loc[g, 'mean'] for g in x]\n",
+    "stds = [summary.loc[g, 'std'] for g in x]\n",
+    "n = [summary.loc[g, 'count'] for g in x]\n",
+    "yerr = [s / np.sqrt(nn) for s, nn in zip(stds, n)]\n",
+    "\n",
+    "bars = plt.bar(x, means, yerr=yerr, capsize=10, color=['steelblue', 'darkorange'], alpha=0.85, edgecolor='black', linewidth=1.5)\n",
+    "\n",
+    "# The label's line break is written as an escape sequence inside the string literal.\n",
+    "plt.ylabel('Average Calls per client\\n(6 months)', fontsize=14, fontweight='bold')\n",
+    "plt.title('Test with Fewer Calls', fontsize=16, fontweight='bold', pad=20)\n",
+    "\n",
+    "# Value labels sit just above the top of each error bar.\n",
+    "for i, (bar, mean) in enumerate(zip(bars, means)):\n",
+    "    plt.text(bar.get_x() + bar.get_width()/2, mean + yerr[i] + 0.08,\n",
+    "             f'{mean:.2f}', ha='center', va='bottom',\n",
+    "             fontsize=14, fontweight='bold')\n",
+    "\n",
+    "plt.grid(axis='y', alpha=0.4, linestyle='-', linewidth=0.8)\n",
+    "plt.ylim(0, max(means) + 0.6)\n",
+    "plt.xticks(fontsize=12, fontweight='bold')\n",
+    "\n",
+    "for spine in plt.gca().spines.values():\n",
+    "    spine.set_linewidth(1.5)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "# Relative gap in mean calls, printed so the figure's claim can be checked against a number.\n",
+    "delta_pct = ((means[0] - means[1]) / means[0]) * 100\n",
+    "print(f'Mean calls, (Control - Test) / Control: {delta_pct:.1f}%')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5868d3c2-ed8e-486f-9d9e-af02ebe9e28f",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>Control has more calls PER CLIENT than Test. The Test version proves superior self-service.</b></span>                                                    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "528b137e-3a2b-4a12-87ad-dca4b2c6b8a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# right=False gives the intervals [0, 40) and [40, inf), which is what the two labels say;\n",
+    "# the default right-closed bins would put 40-year-olds in 'Young <40'.\n",
+    "dfmerged['age_group'] = pd.cut(dfmerged['clnt_age'], bins=[0, 40, np.inf], right=False, labels=['Young <40', 'Older >=40'], ordered=False)\n",
+    "calls_mean = dfmerged.groupby(['variation', 'age_group'], observed=False)['calls_6_mnth'].mean().unstack(fill_value=0)\n",
+    "\n",
+    "# Grouped bars: one pair (Control, Test) per age group.\n",
+    "fig, ax = plt.subplots(figsize=(9, 6))\n",
+    "x = np.arange(len(calls_mean.columns))\n",
+    "width = 0.35\n",
+    "bars1 = ax.bar(x - width/2, calls_mean.loc['Control'], width, label='Control', color='blue', alpha=0.8)\n",
+    "bars2 = ax.bar(x + width/2, calls_mean.loc['Test'], width, label='Test', color='orange', alpha=0.8)\n",
+    "\n",
+    "# Print each mean just above its bar.\n",
+    "for i, col in enumerate(calls_mean.columns):\n",
+    "    ctrl_val = calls_mean.loc['Control', col]\n",
+    "    test_val = calls_mean.loc['Test', col]\n",
+    "    ax.text(i - width/2, ctrl_val + 0.05, f'{ctrl_val:.1f}', ha='center', va='bottom', fontweight='bold')\n",
+    "    ax.text(i + width/2, test_val + 0.05, f'{test_val:.1f}', ha='center', va='bottom', fontweight='bold')\n",
+    "\n",
+    "ax.set_ylabel('Average Calls (6 months)')\n",
+    "ax.set_title('Test Call Reduction: Stronger in Young Clients (<40 years)')\n",
+    "ax.set_xticks(x)\n",
+    "ax.set_xticklabels(calls_mean.columns)\n",
+    "ax.legend()\n",
+    "ax.grid(axis='y', alpha=0.3)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "# Relative reduction of Test versus Control, per age group.\n",
+    "reduction_pct = ((calls_mean.loc['Control'] - calls_mean.loc['Test']) / calls_mean.loc['Control'] * 100).round(1)\n",
+    "print(\"Reduction % (Test vs Control):\")\n",
+    "print(reduction_pct)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "088d7175-917f-43b1-bb26-13e21cbb5981",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>As we saw in the previous chart, Test has fewer calls than Control, and the reduction is stronger among young clients (&lt;40 years old) than among older ones. Comparing Control and Test, the difference in calls between younger and older clients also seems larger in Test!</b></span>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2fbd42a4-9a03-4bd8-8dfb-4d804c7a4025",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# One row per client: dfmerged_calls holds one row per web event, while tenure and balance are\n",
+    "# client attributes, so counting or averaging its rows would weight clients by their event count.\n",
+    "df = dfmerged_calls[dfmerged_calls['variation'].isin(['Control', 'Test'])].drop_duplicates(subset='client_id').copy()\n",
+    "\n",
+    "# TRUST: Completion \n",
+    "completion = visitcomp.groupby('variation')['completed'].mean()\n",
+    "print(\"TRUST - Completion Rate:\")\n",
+    "print(completion)\n",
+    "\n",
+    "# REVENUE IMPACT (High-Tenure)\n",
+    "df['tenure_group'] = pd.cut(df['clnt_tenure_yr'], [0,10,np.inf], labels=['Low ≤10y','High >10y'], ordered=False)\n",
+    "\n",
+    "is_high = df['tenure_group'] == 'High >10y'\n",
+    "n_high_test = int((is_high & (df['variation'] == 'Test')).sum())\n",
+    "bal_ctrl_high = df.loc[is_high & (df['variation'] == 'Control'), 'bal'].mean()\n",
+    "bal_test_high = df.loc[is_high & (df['variation'] == 'Test'), 'bal'].mean()\n",
+    "delta_bal = bal_test_high - bal_ctrl_high\n",
+    "\n",
+    "# NOTE(review): 0.01 looks like an assumed 1% annual margin on balances - confirm the source.\n",
+    "revenue = delta_bal * 0.01 * n_high_test\n",
+    "# Name the winner from the sign of the estimate instead of always declaring Test.\n",
+    "winner = 'TEST WINS' if revenue > 0 else 'CONTROL WINS' if revenue < 0 else 'NO DIFFERENCE'\n",
+    "print(f\"\\nREVENUE High-Tenure ({n_high_test} clients):\")\n",
+    "print(f\"Control Bal: €{bal_ctrl_high:.2f}\")\n",
+    "print(f\"Test Bal:    €{bal_test_high:.2f}\")\n",
+    "print(f\"ΔBal/client: €{delta_bal:.3f}\")\n",
+    "print(f\"Annual Revenue: €{revenue/1e6:.1f}M ← **{winner}**\")\n",
+    "\n",
+    "plt.figure(figsize=(12,4))\n",
+    "\n",
+    "# Left panel: completion rate per variation, in percent.\n",
+    "plt.subplot(121)\n",
+    "completion_pct = completion.values * 100\n",
+    "plt.bar(completion.index, completion_pct, color=['blue','orange'], alpha=0.8)\n",
+    "plt.title('Trust: Completion Rate')\n",
+    "plt.ylabel('% Complete')\n",
+    "for i, v in enumerate(completion_pct):\n",
+    "    # v is already a percentage; do not scale it again in the label.\n",
+    "    plt.text(i, v+1, f'{v:.1f}%', ha='center')\n",
+    "\n",
+    "# Right panel: mean balance per tenure group, Control next to Test.\n",
+    "plt.subplot(122)\n",
+    "bal_tenure = df.groupby(['variation','tenure_group'], observed=False)['bal'].mean().unstack()\n",
+    "x = np.arange(len(bal_tenure.columns))\n",
+    "width=0.35\n",
+    "plt.bar(x-width/2, bal_tenure.loc['Control'], width, label='Control', alpha=0.8)\n",
+    "plt.bar(x+width/2, bal_tenure.loc['Test'], width, label='Test', alpha=0.8)\n",
+    "plt.title('Revenue: Bal High-Tenure')\n",
+    "plt.ylabel('Avg Balance €')\n",
+    "plt.xticks(x, bal_tenure.columns, rotation=0)\n",
+    "plt.legend()\n",
+    "plt.grid(axis='y', alpha=0.3)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bfddee5-a983-40df-adf0-8f76228edc15",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>Test is MORE trustworthy (58% vs 49% completion); Test self-service success DIRECTLY causes higher balances... \n",
+    "We can see that customers with more than 10 years of tenure hold higher balances; in addition, Control shows a lower average balance than Test in both tenure groups.\n",
+    "Perfect cycle: better UX, more transactions, then more revenue. So Test creates client trust and bank profits simultaneously.</b></span>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0c4bc6b-84fa-437e-a5d2-64f56b59cfb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "df1 = pd.read_csv('df_final_web_data_pt_1.txt')\n",
+    "print('df1 columns:', df1.columns.tolist())\n",
+    "print('df1 shape:', df1.shape)\n",
+    "df2 = pd.read_csv('df_final_web_data_pt_2.txt')\n",
+    "print('df2 columns:', df2.columns.tolist())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e3838ff-260b-420c-9ae5-7e2b01b6317f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1763ab2d-aa00-41ed-acd4-45030b76f6aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged['gendr'].nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e3ed3d2-c674-4e79-ae5e-f2ef5a52f71b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4441a7a-208a-4bd0-a4b1-665b37ed9d9b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged['variation'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e075867f-4755-48c6-9afa-18599d594af0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged['gendr'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58765716-483c-4f41-aa86-edfb93091be6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only rows with gender M or F; .copy() makes this an independent frame so that\n",
+    "# adding columns to dfmerged in later cells does not write into a slice of the old frame.\n",
+    "dfmerged = dfmerged[dfmerged['gendr'].isin(['M','F'])].copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d818395b-d84c-4109-9589-15875b2cbe28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged['gendr'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2291a32f-f560-445c-b585-e8c80521464b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d62f6e02-d72b-48b6-b084-c0a839b69b3a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged[\"error_flag\"] = (dfmerged[\"process_step\"] != \"confirm\").astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bcf8b1e9-8466-48be-89f4-05687a2b775e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d3590ca8-47a7-49ba-b716-34e87d6d546c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#dfmerged.to_csv('Project2FIX.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f92a491-010b-48f0-959f-90d18e8b00bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = {'variation': ['Control', 'Control', 'Control', 'Control', 'Control', 'Test', 'Test', 'Test', 'Test', 'Test'],'process_step': ['confirm', 'start', 'step1', 'step2', 'step3',\n",
+    "                     'start', 'step1', 'step2', 'step3', 'confirm'],'mean_time': [153.74, 49.74, 45.09, 86.70, 140.79,38.24, 60.13, 89.76, 139.83, 246.07]}\n",
+    "df_times = pd.DataFrame(data)\n",
+    "\n",
+    "pivot = df_times.pivot(index='process_step', columns='variation', values='mean_time')\n",
+    "pivot = pivot.reindex(['start', 'step1', 'step2', 'step3', 'confirm'])\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(10, 6))\n",
+    "x = np.arange(len(pivot))\n",
+    "width = 0.35\n",
+    "ax.barh(x - width/2, pivot['Control'], width, label='Control', color='#2C6693', alpha=0.8)\n",
+    "ax.barh(x + width/2, pivot['Test'], width, label='Test', color='#E5B544', alpha=0.8)\n",
+    "ax.set_yticks(x)\n",
+    "ax.set_yticklabels(pivot.index)\n",
+    "ax.set_xlabel('Mean Time (seconds)')\n",
+    "ax.set_title('Average Time per Step: Control vs Test')\n",
+    "ax.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dd5ad294-a1e7-42b6-80b1-c83b941d2578",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>Test is faster at the start, but slower on most of the remaining steps, especially at the end.</b></span>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ce132b3-3f48-444b-a050-1d21c8a78955",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "cores = {'Control': '#2C6693', 'Test': '#E5B544'}\n",
+    "sns.lmplot(data=visits_with_age, x='clnt_age', y='has_error', hue='variation',\n",
+    "           palette=cores, line_kws={'lw':2}, scatter_kws={'alpha':0.5}, ci=95)\n",
+    "plt.xlabel('Age'); plt.ylabel('Error Flag')\n",
+    "plt.title('Age-Error Correlation: Control vs Test')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "537cc5c5-2aab-4d70-8824-bb7a155a8dce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.set(style='whitegrid')\n",
+    "plt.figure(figsize=(6,5))\n",
+    "\n",
+    "# Fix the bar order and colours by variation name, so labels and colours cannot be\n",
+    "# swapped by the row order of err_summary.\n",
+    "variation_order = ['Control', 'Test']\n",
+    "variation_colors = {'Control': '#2C6693', 'Test': '#E5B544'}\n",
+    "sns.barplot(data=err_summary, x='variation', y='n_error', order=variation_order,\n",
+    "            hue='variation', palette=variation_colors, legend=False)\n",
+    "plt.title('Total of Errors: Control vs Test')\n",
+    "plt.xlabel('Variation')\n",
+    "plt.ylabel('Total of Errors')\n",
+    "\n",
+    "# Place each count above its bar using the bar position, not the DataFrame index.\n",
+    "# NOTE(review): assumes err_summary has exactly one row per variation - confirm.\n",
+    "error_totals = err_summary.set_index('variation')['n_error']\n",
+    "for pos, name in enumerate(variation_order):\n",
+    "    plt.text(pos, error_totals[name] + 500, f'{int(error_totals[name]):,}', ha='center')\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1e21b230-af46-4cc2-b521-0bd81db64d06",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:30px;\"><b>CONCLUSIONS</b></span>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a7e55d5-90fd-4ad3-971e-a7c3762400cf",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:10x;\">Test variation consistently outperforms Control across all critical metrics, delivering substantial revenue growth, operational efficiencies and enhanced user resilience. Rigorous statistical analysis eliminates any doubt regarding implementation.\n",
+    "\n",
+    "Completion Advantage: Test has substantially higher process completion rates than Control. This translates to massive trust and revenue uplift through elevated client balances across all tenure segments.\n",
+    "\n",
+    "Operational Excellence: Test meaningfully reduces call-center dependency, which confirms reliable cost savings.\n",
+    "\n",
+    "Demographic Resilience: Test significantly mitigates age-related error patterns compared to Control. Both versions show age influences error distribution, but Test manages this relationship far more equitably across age groups.\n",
+    "\n",
+    "Strategic Imperative: Revenue acceleration + cost reduction + universal UX improvements = compelling business case.</span>\n",
+    "\n",
+    "<span style=\"font-size:24px;\"><b>DEPLOY TEST VERSION IMMEDIATELY ACROSS ALL CLIENTS.</b></span>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2ff7b913-6d52-4511-94fe-482d913675cb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.14.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebook/PROJECT2_pedro1.ipynb b/notebook/PROJECT2_pedro1.ipynb
new file mode 100644
index 0000000..d775cb1
--- /dev/null
+++ b/notebook/PROJECT2_pedro1.ipynb
@@ -0,0 +1,1623 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de9edac0-fcff-4134-913b-5c0cb3a81646",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "df4 = pd.read_csv(\"df_final_demo.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b15dbbc-e036-47d4-a650-2bdd1fda6f0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df3 = pd.read_csv(\"df_final_experiment_clients.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b4f88831-c73a-457b-b38d-526deb99eb1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.read_csv(\"df_final_web_data_pt_1.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ae03da9-ad70-4bc5-b1b7-8578a6ee40a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = pd.read_csv(\"df_final_web_data_pt_2.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "42167cc8-18da-46d6-8a62-0b29f654082d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4637e1ee-6841-431e-ac99-49c6a1557d6e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2404ff4f-d2a1-4d73-8850-6fb997416ee1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs = pd.concat([df1, df2], ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f583b072-14bf-4a68-b0fd-8e129194473b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmergeexp = dfs.merge(df3, on='client_id', how='left')\n",
+    "dfmerged = dfmergeexp.merge(df4, on='client_id', how='left')\n",
+    "dfmerged = dfmerged.drop_duplicates()\n",
+    "dfmerged = dfmerged[dfmerged['Variation'].isin(['Control', 'Test'])]\n",
+    "\n",
+    "dfmerged['source_group'] = dfmerged['Variation']\n",
+    "\n",
+    "dfmerged.to_csv('ab_test_data_with_source.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01ac17a1-d2fe-4d56-8b1a-376ac810be70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['client_id'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "427a2282-9d0f-4a2a-b771-e0aaaf4dea2e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.isnull()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3aef449c-5efe-4f22-8387-5217ec21c6d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f946b9e-e8ae-472a-ad3e-695e63545151",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd184618-2e58-445b-a677-eb761f086c62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#checking what the type of date_time\n",
+    "dfs.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87ec222b-8b6e-44c1-8bdd-2944199a53a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#convert object into date_time\n",
+    "dfs['date_time'] = pd.to_datetime(dfs['date_time'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d285f1e-baac-4ffa-9f3e-1ba08f3423ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2349d923-4c1d-44b0-b222-091385812b30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df3.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "448d5890-b61e-473c-8403-13273d164f60",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df4.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec9f6c03-7ed3-467b-8d75-094114d8838f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55e51cdf-678d-469d-a6fc-746416546c2a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#df_all.head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cf44703-273b-4a59-a0e1-4df661138326",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd9f177f-89f8-410a-a535-daddffb87e32",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#dfs.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'], ascending=[True, True, True, True])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3e667ca-63a0-4a4e-87ed-a83ae5c118f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b05e56f0-fd06-4dc4-a47e-08cfc1d0aa81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#dfsorted[dfsorted['client_id'] == 442857]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6b253c6-7371-414a-8ee7-59eede52b03b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['client_id'].value_counts().head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "867532d4-592d-425b-a442-386b325bc845",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['date_time'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de7d7eb0-36d1-4b51-b027-5db80356cec3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['visitor_id'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c83e80d-ef18-4f85-9bff-3b82208c121d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['date_time'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19ae077e-9b9c-477d-a267-1d9cb15fc815",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['visit_id'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81f2e88d-79a1-4a70-ab61-acd0fd663704",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['visitor_id'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aca325aa-d0cb-437e-bb4c-64eada0876ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['process_step'].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d79aa0f-4f36-4aba-b730-b18f0a16028d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs.isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "30d4f963-6a2a-4659-9a7b-68d23e59b445",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df4.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "898410aa-94c8-4747-81b1-e11afd574f8e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df3.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c35f3128-58a1-4557-8cfd-e70f054e33c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#df_test.isnull()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87bc11a1-1791-4a93-9913-31d84c7a6875",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs['process_step'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72dcc7fb-1ff0-49ea-98b2-c3c04a08d1af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# merge dfs + experiment (df3) to get Variation = Control or Test\n",
+    "df_mergeexp = dfs.merge(df3, on=\"client_id\", how=\"left\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f2548d2-68c6-40fb-99f7-708369d15623",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#merge with demo(df4) (demographic data)\n",
+    "df_merged = df_mergeexp.merge(df4, on=\"client_id\", how=\"left\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c7709e3-5aea-4876-8e04-6e6842659f12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52d75c86-aa2d-4455-a876-f6089b4df31f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#remove duplicates\n",
+    "df_merged = df_merged.drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3a43aef8-5752-41de-b0b3-d6439cb714d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#remove rows without any variation\n",
+    "df_merged = df_merged[df_merged[\"Variation\"].isin([\"Control\", \"Test\"])]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9632b15a-5c30-4426-942e-134e2a924893",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74067acd-53d4-4b5a-bacb-83f74d45e04f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged[\"process_step\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ac2ee71-4c6a-47b0-beb7-adc4b32d0725",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged[\"Variation\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e3336830-2021-4163-ba50-4a93a2ca1299",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Give a table for Tests and another for Controls! \n",
+    "df_control = df_merged[df_merged[\"Variation\"] == \"Control\"].copy()\n",
+    "df_test = df_merged[df_merged[\"Variation\"] == \"Test\"].copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48b1d00b-87cd-4115-a1e1-dfab54281ab3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_control.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a87776b-7368-45dd-baeb-5a89daf30b19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_test.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0cdc8198-e0ab-4fc8-8eb0-3b96318f1710",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged['num_accts'].nunique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "162a3f57-229f-4cbf-848a-e4704a2cdf52",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>Who are the primary clients using this online process?</b></span> "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fed97a77-c349-420e-a073-8cdd689514c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "usage = (df_merged.groupby(\"client_id\")[\"visit_id\"].nunique().reset_index(name=\"n_visits\"))  #hor many visits per client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed106950-abca-4aa9-afb0-f758055eaa47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cutoff = usage[\"n_visits\"].quantile(0.75)\n",
+    "usage['primary'] = (usage[\"n_visits\"] >= cutoff).astype(int)          #top25% by number of visits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5b77ee6-dd0f-4256-b044-6688bc348160",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c04385dd-5d27-442c-ba4c-63c1e2808d1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = usage.merge(df_merged, left_on=\"client_id\", right_on=\"client_id\", how=\"left\") #join demographics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a46f9ad-2767-4c80-8e18-bf36f0afb441",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.groupby(\"primary\")[[\"clnt_age\",\"clnt_tenure_yr\",\"clnt_tenure_mnth\",\"logons_6_mnth\"]].mean()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5e92ebf-05af-411c-a0ed-0785e70a8e0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sub = df_merged[df_merged[\"primary\"] == 1]\n",
+    "print(sub[[\"clnt_age\",\"clnt_tenure_yr\",\"bal\",\"gendr\"]].describe(include=\"all\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9948430a-27f5-467a-b44f-ccc9968944a0",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>The average age is about 51.8, and half of the primary clients are between 39 and 63. So they are middle-aged rather than very young customers, which means primary clients are older than non-primary clients.\n",
+    "Primary clients are also longer-standing.\n",
+    "The most frequent gender is “M”.</b></span> "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95002485-6f20-4f4a-a763-d037b9faa90f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.groupby(\"primary\")[['calls_6_mnth', 'logons_6_mnth']].mean()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ac842fc-b333-4bb9-b804-01f9fc8f0c72",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>They also make more calls and more logons in 6 months, so they are more active on all channels, not only online.</b></span>    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a4f3984-9698-45f1-9d0e-299d234bdccf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_merged.rename(columns={'Variation': 'variation'})\n",
+    "df3 = df3.rename(columns={'Variation': 'variation'})\n",
+    "df_control = df_control.rename(columns={'Variation': 'variation'})\n",
+    "df_test = df_test.rename(columns={'Variation': 'variation'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "73f27b9e-3a8c-4ad7-a756-6838314f5b34",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5da498ad-e434-4dbc-af5f-84b4a322b09d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_control.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "110917b6-fa20-47d1-9704-9263dafd5312",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df3.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90d7a353-0bd9-422f-b3e4-98d8d709c90d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var = dfs.merge(df3[['client_id', 'variation']], on='client_id', how='left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b4edff36-d806-4048-9b85-aa83de863e3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f86541c-bc8d-4ea1-a7b5-d823188ed736",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "step_order = {'start': 0, 'step_1' : 1, 'step_2' : 2, 'step_3' : 3, 'confirm' : 4}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eed944a6-5fce-4e95-ad1e-8f384f15addb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28162307-e0cf-4d5d-b662-58f03c6a67e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var['step_num'] = dfs_var['process_step'].map(step_order)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c71f9b3d-9581-40fc-986d-bf30da3e1882",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "######################################################\n",
+    "#COMPLETION RATE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0699b897-2d70-4a36-a2fa-de92681667c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "last_step = dfs_var['step_num'].max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1451a1a2-67f8-4522-b352-6a72513b7a3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "visitcomp = (dfs_var.groupby(['variation', 'visit_id'])['step_num'].max().reset_index(name='max_step'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "decd9c0a-810e-4e3d-a3da-4aa31dd2c58c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "visitcomp[\"completed\"] = (visitcomp[\"max_step\"] == last_step).astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "323d47af-6a05-4a10-bc42-3886c7c88ca4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "completionrate = (visitcomp.groupby('variation')['completed'].mean().reset_index())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c501100b-d67e-4774-9e73-db3350874cf7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "completionrate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f83ff5d-1898-479b-8fcd-e69478d05b96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# `completionrate` holds the mean of the per-visit `completed` flag for each variation,\n",
+    "# so label it as a completion rate (the old message described time per step).\n",
+    "print('Completion rate per variation:')\n",
+    "print(completionrate)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb188b90-2c49-4edf-8975-8c7c2f4d56f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "############################################################################################\n",
+    "#TIME SPENT ON EACH STEP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37da593d-4aa1-466c-9a4b-f3524bfdb0b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c09e14a-51ab-4c1d-bf97-2d348c4aded6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var = dfs_var.sort_values(['variation', 'visit_id','step_num','date_time'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f4a43083-3154-46ca-823f-10e50602565c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Timestamp of the following row of the same visit: shift(-1) pulls the next row's value up.\n",
+    "# The method has to be called; assigning the bare `.shift` attribute never computes the\n",
+    "# shifted timestamps, which leaves every step duration missing further down.\n",
+    "dfs_var['next_time'] = dfs_var.groupby(['variation', 'visit_id'])['date_time'].shift(-1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0b2f7798-0953-4991-9634-1984749b0703",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var[\"next_time\"] = pd.to_datetime(dfs_var[\"next_time\"], errors=\"coerce\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57d91d74-ae22-46fc-a195-ad7e1b777296",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds() #duration in seconds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec88f0a4-fcb2-4422-a325-52661e23016f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "step_time = (dfs_var.dropna(subset=['step_durationsec']).groupby(['variation', 'process_step'])['step_durationsec'].mean().reset_index())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d3178785-bcb9-4034-a4e1-1007ea34d3fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "step_time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd0f8815-ef1b-4311-b4d6-645d3ed1edeb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "##################################################################\n",
+    "#CHECKING RATING OF ERRORS PER EACH STEP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6f6a958-a82a-4386-9e1f-ab5d57397e67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Flag an event as an error when the visitor lands on an earlier step than the one they were on\n",
+    "# just before, within the same visit. Copying step_num into error_flag only restated the step\n",
+    "# index, so the per-step mean could never differ between Control and Test.\n",
+    "# NOTE(review): assumes 'error' means moving backwards in the funnel - confirm with the brief.\n",
+    "chrono = dfs_var.sort_values(['visit_id', 'date_time', 'step_num'])  # step_num breaks timestamp ties\n",
+    "prev_step = chrono.groupby('visit_id')['step_num'].shift(1)\n",
+    "# The first event of a visit has no predecessor (NaN), and a comparison with NaN is False.\n",
+    "# Column assignment aligns on the index, so the current row order of dfs_var does not matter.\n",
+    "dfs_var['error_flag'] = (chrono['step_num'] < prev_step).astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e5c5a7c-9498-4b17-8941-8fef7839cbf1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "error_rates = (dfs_var.groupby(['variation', 'process_step'])['error_flag'].mean().reset_index().rename(columns={'error_flag' : 'error_rate'}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e39a5d1-4c37-4226-9728-0e514dbc9806",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "error_rates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43f50454-c86a-4040-b141-5a81fb366ea7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One row per (variation, visit): keep the largest error_flag observed during the visit.\n",
+    "visit_error = dfs_var.groupby([\"variation\", \"visit_id\"], as_index=False)[\"error_flag\"].max()\n",
+    "\n",
+    "# Per-variation summary: sum of the per-visit flags and number of visits.\n",
+    "err_summary = (\n",
+    "    visit_error\n",
+    "    .groupby(\"variation\")\n",
+    "    .agg(n_error=(\"error_flag\", \"sum\"), n_total=(\"error_flag\", \"count\"))\n",
+    "    .reset_index()\n",
+    ")\n",
+    "\n",
+    "print(err_summary)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8bf8ae7d-ab0e-4057-bccf-b379f2a32601",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>COMPLETION RATE</b> - Comparing the percentage of visits that reach the final step in Control vs Test: a higher completion percentage in Test means the new design is more effective at getting users through all the steps.</span>   \n",
+    "\n",
+    "<span style=\"font-size:18px;\"><b>TIME SPENT</b> - The Test version allows users to complete the steps faster.</span>   \n",
+    "\n",
+    "<span style=\"font-size:18px;\"><b>ERROR RATES</b> - The Test version neither reduced nor increased the frequency of errors in any step. The two versions perform identically in terms of the number of errors.</span>    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a07db602-47f1-4c8c-8785-76c322d070db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from scipy.stats import norm\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d6944b6-7979-4f60-b7e5-8886c4dc15ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summary = (visitcomp.groupby(\"variation\")[\"completed\"].agg(n_complete=\"sum\", n_total=\"count\").reset_index())\n",
+    "\n",
+    "print(summary)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "086e2bee-9e59-4756-a45f-55c2a6c3fa00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fix the group order to Control -> Test so that index 1 is Control and index 2 is Test.\n",
+    "# The ordered table gets its own name: overwriting `summary` made this cell fail on a re-run,\n",
+    "# because 'variation' had already been moved into the index.\n",
+    "completion_counts = summary.set_index('variation').loc[['Control', 'Test']]\n",
+    "x1, x2 = completion_counts['n_complete'].values  # completed visits (Control, Test)\n",
+    "n1, n2 = completion_counts['n_total'].values     # all visits (Control, Test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef1baf41-13b1-4f5f-bdb4-69d71ea93160",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#propoortions\n",
+    "p1 = x1 / n1\n",
+    "p2 = x2 / n2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d8ace24-1dcc-4d8c-9ccf-da4fa3aff7b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#h0\n",
+    "p_pool = (x1 + x2) / (n1 + n2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48060fa7-6f84-4685-98c0-9f818b234cbe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#test of 2 proportions \n",
+    "se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))\n",
+    "z = (p2 - p1) / se"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15c0edac-b6cd-41ec-b36f-ddeab6282675",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Two-sided p-value. norm.sf is the survival function (1 - cdf); it stays accurate for large |z|,\n",
+    "# where 1 - norm.cdf(...) rounds to exactly 0.\n",
+    "p_value = 2 * norm.sf(abs(z))\n",
+    "# '.4f' prints four decimals; ':4f' was a minimum width of 4 with the default six decimals.\n",
+    "print(f'Completion of rate Control: {p1:.4f}')\n",
+    "print(f'Completion of rate Test: {p2:.4f}')\n",
+    "print(f'z-statistic: {z:.4f}')\n",
+    "print(f'p-value    : {p_value:.6f}')\n",
+    "alpha = 0.05\n",
+    "if p_value < alpha:\n",
+    "    print('Statistical diference highly significative (alpha=0.05)')\n",
+    "else:\n",
+    "    print('Statistical diference not significative (alpha=0.05)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49417603-53bb-45a2-a9e1-f55fe5c2999c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "056a80de-36d0-426f-8cdd-8a0cb74582a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(dfs_var.columns)\n",
+    "\n",
+    "print(dfs_var[\"step_durationsec\"].notna().sum())\n",
+    "\n",
+    "df_dur = dfs_var[dfs_var[\"step_durationsec\"].notna()]\n",
+    "print(df_dur.shape)\n",
+    "print(df_dur[\"process_step\"].unique())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7ca550d4-c1d0-4b3f-a727-3c9c2fef399d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(dfs_var[\"step_durationsec\"].head())\n",
+    "print(dfs_var[\"step_durationsec\"].isna().sum())\n",
+    "print(len(dfs_var))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8267354a-556e-4984-8024-672be4c03b72",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>PREPARING MEAN OF TIME FOR EACH STEP IN CONTROL VS TEST</b></span>    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c36ce9c0-db04-4d3f-98ff-aba36672d3cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs_var[\"date_time\"] = pd.to_datetime(dfs_var[\"date_time\"])\n",
+    "\n",
+    "dfs_var = dfs_var.sort_values([\"variation\", \"visit_id\", \"step_num\", \"date_time\"])\n",
+    "\n",
+    "dfs_var[\"next_time\"] = (\n",
+    "    dfs_var.groupby([\"variation\", \"visit_id\"])[\"date_time\"].shift(-1))\n",
+    "dfs_var[\"step_durationsec\"] = (dfs_var[\"next_time\"] - dfs_var[\"date_time\"]).dt.total_seconds()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa579f03-f983-4ef4-8df6-85c818d34dc6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_dur = dfs_var[dfs_var[\"step_durationsec\"].notna()]\n",
+    "\n",
+    "mean_step_time = (df_dur.groupby([\"variation\", \"process_step\"])[\"step_durationsec\"].mean().reset_index())\n",
+    "\n",
+    "print(mean_step_time)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13628402-e804-4082-8491-5558b59d76a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b795cd06-2379-4c84-bec4-c3e7a562cc62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "visit_error = (dfs_var.groupby([\"variation\", \"visit_id\", \"client_id\"])[\"error_flag\"].max().reset_index(name=\"has_error\"))\n",
+    "\n",
+    "visits_with_age = visit_error.merge(df4[[\"client_id\", \"clnt_age\"]], on=\"client_id\", how=\"left\").dropna(subset=[\"clnt_age\"])\n",
+    "\n",
+    "for var in [\"Control\", \"Test\"]:\n",
+    "    sub = visits_with_age[visits_with_age[\"variation\"] == var]\n",
+    "    corr = np.corrcoef(sub[\"clnt_age\"], sub[\"has_error\"])[0, 1]\n",
+    "    print(f\"Correlation age–error ({var}): {corr:.4f}  (n={len(sub)})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b51e9ea-cf23-4b7f-9af8-dbf7a7e9405c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compute the age-error correlation per variation from visits_with_age instead of pasting\n",
+    "# numbers from an earlier run, so the chart always matches the data.\n",
+    "correlations = {}\n",
+    "for var in ['Control', 'Test']:\n",
+    "    sub = visits_with_age[visits_with_age['variation'] == var]\n",
+    "    correlations[var] = np.corrcoef(sub['clnt_age'], sub['has_error'])[0, 1]\n",
+    "\n",
+    "# The chart compares the strength of the relationship, hence absolute values.\n",
+    "abs_corr = [abs(value) for value in correlations.values()]\n",
+    "\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "x = np.arange(len(correlations))\n",
+    "bars = plt.bar(x, abs_corr, color=['steelblue', 'darkorange'], alpha=0.8, edgecolor='black', linewidth=2)\n",
+    "\n",
+    "plt.ylabel('Correlation', fontsize=12, fontweight='bold')\n",
+    "plt.title('Age-Error Relationship\\n(Lower correlation)', fontsize=14, fontweight='bold')\n",
+    "\n",
+    "plt.xticks(x, list(correlations.keys()))\n",
+    "# Keep the original 0-0.1 scale for weak correlations, but grow it if a bar would not fit.\n",
+    "plt.ylim(0, max(0.1, max(abs_corr) * 1.25))\n",
+    "\n",
+    "for bar, value in zip(bars, abs_corr):\n",
+    "    plt.text(bar.get_x() + bar.get_width()/2, value + 0.002,\n",
+    "             f'{value:.4f}', ha='center', fontweight='bold', fontsize=12)\n",
+    "\n",
+    "plt.grid(axis='y', alpha=0.3)\n",
+    "sns.despine()\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c2d14a8e-189a-48dc-93f4-435bda0e44a3",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>There is no relationship where each additional year of age systematically increases or decreases error rate.</b></span>                           \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce2b60be-7678-4bf8-a49e-60d234c1d585",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scipy.stats import chi2_contingency  # imported once, not on every loop iteration\n",
+    "\n",
+    "# pd.cut uses right-closed bins by default: (0, 40], (40, 60], (60, 120].\n",
+    "bins = [0, 40, 60, 120]\n",
+    "labels = [\"<40\", \"40-60\", \">60\"]\n",
+    "visits_with_age[\"age_group\"] = pd.cut(visits_with_age[\"clnt_age\"], bins=bins, labels=labels)\n",
+    "\n",
+    "# Test, separately for each variation, whether has_error is independent of the age group.\n",
+    "for var in [\"Control\", \"Test\"]:\n",
+    "    sub = visits_with_age[visits_with_age[\"variation\"] == var]\n",
+    "    tab = pd.crosstab(sub[\"age_group\"], sub[\"has_error\"])\n",
+    "    print(var, \"\\n\", tab, \"\\n\")\n",
+    "\n",
+    "    # Only the statistic and the p-value are reported; dof and expected counts are discarded.\n",
+    "    chi2, p, _, _ = chi2_contingency(tab)\n",
+    "    print(f\"{var}: chi2 = {chi2:.2f}, p-value = {p:.6f}\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4887d163-56c7-4b06-94f0-3d6d4082814a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scipy.stats import chi2_contingency\n",
+    "\n",
+    "# Recompute the statistics from visits_with_age (age_group x has_error per variation)\n",
+    "# instead of hard-coding values read off an earlier run.\n",
+    "variations = ['Control', 'Test']\n",
+    "chi2_values = []\n",
+    "for var in variations:\n",
+    "    sub = visits_with_age[visits_with_age['variation'] == var]\n",
+    "    chi2_values.append(chi2_contingency(pd.crosstab(sub['age_group'], sub['has_error']))[0])\n",
+    "\n",
+    "# A single axes: the second panel of the former 1x2 grid was never drawn on.\n",
+    "fig, ax1 = plt.subplots(figsize=(8, 6))\n",
+    "bars = ax1.bar(variations, chi2_values, color=['steelblue', 'darkorange'], alpha=0.8)\n",
+    "ax1.set_ylabel('Chi² Statistic')\n",
+    "ax1.set_title('Chi² Age-Error Distribution\\n(Higher = more age influence)')\n",
+    "# bar_label places each value just above its bar, whatever the magnitude of the statistic.\n",
+    "ax1.bar_label(bars, fmt='%.0f', padding=3, fontweight='bold')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d633dd69-f1df-4e81-aeab-31e224ce1a94",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>The chi-square test is significant, so the occurrence of errors is associated with the age group. The relationship is non-linear: younger and older users likely commit different types or frequencies of errors.</b></span>                             "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "987b45ee-7df6-4446-9125-180619476b1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scipy.stats import ttest_ind\n",
+    "\n",
+    "# Attach the experiment arm (df3) and the client profile (df4) to every web event.\n",
+    "dfmergeexp = dfs.merge(df3, on='client_id', how='left')\n",
+    "dfmerged = dfmergeexp.merge(df4, on='client_id', how='left')\n",
+    "\n",
+    "# Event rows whose client belongs to one of the two arms.\n",
+    "dfmerged_calls = dfmerged[dfmerged['variation'].isin(['Control', 'Test'])].copy()\n",
+    "\n",
+    "# dfmerged_calls has one row per web event, while calls_6_mnth comes from the client table.\n",
+    "# Keep one row per client so that clients with many events are not counted many times.\n",
+    "clients_calls = dfmerged_calls.drop_duplicates(subset='client_id')\n",
+    "\n",
+    "print(\"Clients per group:\")\n",
+    "print(clients_calls['variation'].value_counts())\n",
+    "\n",
+    "summary = clients_calls.groupby('variation')['calls_6_mnth'].agg(['mean', 'median', 'count', 'std']).round(2)\n",
+    "print(\"\\nCalls6mnth Control vs Test:\")\n",
+    "print(summary)\n",
+    "\n",
+    "control_c = clients_calls.loc[clients_calls['variation'] == 'Control', 'calls_6_mnth'].dropna()\n",
+    "test_c = clients_calls.loc[clients_calls['variation'] == 'Test', 'calls_6_mnth'].dropna()\n",
+    "t_stat, p_val = ttest_ind(control_c, test_c)\n",
+    "print(f\"\\nT-test: t={t_stat:.2f}, p-value={p_val:.6f}\")\n",
+    "\n",
+    "# The p-value says whether the means differ, not which one is lower: report the direction\n",
+    "# from the means and the significance from the test.\n",
+    "significance = \"significant\" if p_val < 0.05 else \"not significant\"\n",
+    "if test_c.mean() < control_c.mean():\n",
+    "    print(f\"*** Test has fewer calls ({significance}) ***\")\n",
+    "elif test_c.mean() > control_c.mean():\n",
+    "    print(f\"*** Control has fewer calls ({significance}) ***\")\n",
+    "else:\n",
+    "    print(\"*** Control and Test have the same mean number of calls ***\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0eb1ca04-ce60-44b7-bb86-05c81f76622c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(8, 6))\n",
+    "\n",
+    "x = ['Control', 'Test']\n",
+    "means = [summary.loc['Control', 'mean'], summary.loc['Test', 'mean']]\n",
+    "stds = [summary.loc['Control', 'std'], summary.loc['Test', 'std']]\n",
+    "n = [summary.loc['Control', 'count'], summary.loc['Test', 'count']]\n",
+    "yerr = [s/np.sqrt(nn) for s,nn in zip(stds,n)]\n",
+    "\n",
+    "# Barplot CLEAN\n",
+    "bars = plt.bar(x, means, yerr=yerr, capsize=10, color=['steelblue', 'darkorange'], alpha=0.85, edgecolor='black', linewidth=1.5)\n",
+    "\n",
+    "plt.ylabel('Average Calls per client\\n(6 months)', fontsize=14, fontweight='bold')\n",
+    "plt.title('Test with Fewer Calls', fontsize=16, fontweight='bold', pad=20)\n",
+    "\n",
+    "for i, (bar, mean) in enumerate(zip(bars, means)):\n",
+    "    plt.text(bar.get_x() + bar.get_width()/2, mean + yerr[i] + 0.08, \n",
+    "             f'{mean:.2f}', ha='center', va='bottom', \n",
+    "             fontsize=14, fontweight='bold')\n",
+    "\n",
+    "delta_pct = ((means[0] - means[1])/means[0])*100\n",
+    "\n",
+    "plt.grid(axis='y', alpha=0.4, linestyle='-', linewidth=0.8)\n",
+    "plt.ylim(0, max(means) + 0.6)\n",
+    "plt.xticks(fontsize=12, fontweight='bold')\n",
+    "\n",
+    "for spine in plt.gca().spines.values():\n",
+    "    spine.set_linewidth(1.5)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5868d3c2-ed8e-486f-9d9e-af02ebe9e28f",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>Control has more calls PER CLIENT than Test. The Test version proves superior self-service.</b></span>                                                    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "528b137e-3a2b-4a12-87ad-dca4b2c6b8a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# right=False gives the bins [0, 40) and [40, inf), which is what the labels say;\n",
+    "# the default right-closed bins put 40-year-olds into 'Young <40'.\n",
+    "dfmerged['age_group'] = pd.cut(dfmerged['clnt_age'], bins=[0, 40, np.inf], labels=['Young <40', 'Older >=40'], ordered=False, right=False)\n",
+    "\n",
+    "# dfmerged has one row per web event; average the calls over one row per client so that\n",
+    "# clients with many events do not weigh more than others.\n",
+    "clients = dfmerged.drop_duplicates(subset='client_id')\n",
+    "calls_mean = clients.groupby(['variation', 'age_group'], observed=False)['calls_6_mnth'].mean().unstack(fill_value=0)\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(9, 6))\n",
+    "x = np.arange(len(calls_mean.columns))\n",
+    "width = 0.35\n",
+    "bars1 = ax.bar(x - width/2, calls_mean.loc['Control'], width, label='Control', color='blue', alpha=0.8)\n",
+    "bars2 = ax.bar(x + width/2, calls_mean.loc['Test'], width, label='Test', color='orange', alpha=0.8)\n",
+    "\n",
+    "# Value labels above each pair of bars.\n",
+    "for i, col in enumerate(calls_mean.columns):\n",
+    "    ctrl_val = calls_mean.loc['Control', col]\n",
+    "    test_val = calls_mean.loc['Test', col]\n",
+    "    ax.text(i - width/2, ctrl_val + 0.05, f'{ctrl_val:.1f}', ha='center', va='bottom', fontweight='bold')\n",
+    "    ax.text(i + width/2, test_val + 0.05, f'{test_val:.1f}', ha='center', va='bottom', fontweight='bold')\n",
+    "\n",
+    "ax.set_ylabel('Average Calls (6 months)')\n",
+    "ax.set_title('Test Call Reduction: Stronger in Young Clients (<40 years)')\n",
+    "ax.set_xticks(x)\n",
+    "ax.set_xticklabels(calls_mean.columns)\n",
+    "ax.legend()\n",
+    "ax.grid(axis='y', alpha=0.3)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "# Relative difference of Test against Control, per age group, in percent.\n",
+    "reduction_pct = ((calls_mean.loc['Control'] - calls_mean.loc['Test']) / calls_mean.loc['Control'] * 100).round(1)\n",
+    "print(\"Reduction % (Test vs Control):\")\n",
+    "print(reduction_pct)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "088d7175-917f-43b1-bb26-13e21cbb5981",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>As we saw in the previous chart, Test shows a reduction in calls, and the same holds when comparing young clients (<40 years old) with older ones. Comparing Control vs Test, the gap in calls between younger and older clients also seems larger in Test!</b></span>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2fbd42a4-9a03-4bd8-8dfb-4d804c7a4025",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# dfmerged_calls has one row per web event, while balance and tenure come from the client\n",
+    "# table. Work on one row per client so that counts and averages are per client.\n",
+    "df = dfmerged_calls.drop_duplicates(subset='client_id').copy()\n",
+    "\n",
+    "# TRUST: share of visits that reached the final step, per variation\n",
+    "completion = visitcomp.groupby('variation')['completed'].mean()\n",
+    "print(\"TRUST - Completion Rate:\")\n",
+    "print(completion)\n",
+    "\n",
+    "# REVENUE IMPACT (High-Tenure)\n",
+    "# include_lowest=True keeps a tenure of exactly 0 years in the low group instead of dropping it.\n",
+    "df['tenure_group'] = pd.cut(df['clnt_tenure_yr'], [0, 10, np.inf], labels=['Low ≤10y', 'High >10y'], ordered=False, include_lowest=True)\n",
+    "\n",
+    "high_tenure = df[df['tenure_group'] == 'High >10y']\n",
+    "n_high_test = int((high_tenure['variation'] == 'Test').sum())\n",
+    "bal_ctrl_high = high_tenure.loc[high_tenure['variation'] == 'Control', 'bal'].mean()\n",
+    "bal_test_high = high_tenure.loc[high_tenure['variation'] == 'Test', 'bal'].mean()\n",
+    "delta_bal = bal_test_high - bal_ctrl_high\n",
+    "\n",
+    "# NOTE(review): 1% of the balance difference is used as the revenue proxy; where this margin\n",
+    "# comes from is not shown in the notebook - confirm.\n",
+    "MARGIN_RATE = 0.01\n",
+    "revenue = delta_bal * MARGIN_RATE * n_high_test\n",
+    "# Only declare a winner when the balance difference actually favours Test.\n",
+    "verdict = \"TEST WINS\" if delta_bal > 0 else \"TEST DOES NOT WIN\"\n",
+    "print(f\"\\nREVENUE High-Tenure ({n_high_test} clients):\")\n",
+    "print(f\"Control Bal: €{bal_ctrl_high:.2f}\")\n",
+    "print(f\"Test Bal:    €{bal_test_high:.2f}\")\n",
+    "print(f\"ΔBal/client: €{delta_bal:.3f}\")\n",
+    "print(f\"Annual Revenue: €{revenue/1e6:.1f}M ← **{verdict}**\")\n",
+    "\n",
+    "fig, (ax_rate, ax_bal) = plt.subplots(1, 2, figsize=(12, 4))\n",
+    "\n",
+    "completion_pct = completion * 100\n",
+    "ax_rate.bar(completion_pct.index, completion_pct.values, color=['blue', 'orange'], alpha=0.8)\n",
+    "ax_rate.set_title('Trust: Completion Rate')\n",
+    "ax_rate.set_ylabel('% Complete')\n",
+    "for i, v in enumerate(completion_pct.values):\n",
+    "    # v is already a percentage; multiplying by 100 again printed values like 5800.0%.\n",
+    "    ax_rate.text(i, v + 1, f'{v:.1f}%', ha='center')\n",
+    "\n",
+    "# observed=False keeps both tenure categories as columns even if one has no rows.\n",
+    "bal_tenure = df.groupby(['variation', 'tenure_group'], observed=False)['bal'].mean().unstack()\n",
+    "x = np.arange(len(bal_tenure.columns))\n",
+    "width = 0.35\n",
+    "ax_bal.bar(x - width/2, bal_tenure.loc['Control'], width, label='Control', alpha=0.8)\n",
+    "ax_bal.bar(x + width/2, bal_tenure.loc['Test'], width, label='Test', alpha=0.8)\n",
+    "ax_bal.set_title('Revenue: Bal High-Tenure')\n",
+    "ax_bal.set_ylabel('Avg Balance €')\n",
+    "ax_bal.set_xticks(x)\n",
+    "ax_bal.set_xticklabels(bal_tenure.columns, rotation=0)\n",
+    "ax_bal.legend()\n",
+    "ax_bal.grid(axis='y', alpha=0.3)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bfddee5-a983-40df-adf0-8f76228edc15",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>Test is MORE trustworthy (58% vs 49% completion), and Test self-service success DIRECTLY causes higher balances... \n",
+    "We can see that customers with more than 10 years of tenure hold higher balances; in addition, Control shows lower balances than the Test version in every group.\n",
+    "Perfect cycle: better UX, more transactions and then more revenue. So, Test creates client trust + bank profits simultaneously.</b></span>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0c4bc6b-84fa-437e-a5d2-64f56b59cfb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "df1 = pd.read_csv('df_final_web_data_pt_1.txt')\n",
+    "print('df1 columns:', df1.columns.tolist())\n",
+    "print('df1 shape:', df1.shape)\n",
+    "df2 = pd.read_csv('df_final_web_data_pt_2.txt')\n",
+    "print('df2 columns:', df2.columns.tolist())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e3838ff-260b-420c-9ae5-7e2b01b6317f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1763ab2d-aa00-41ed-acd4-45030b76f6aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged['gendr'].nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e3ed3d2-c674-4e79-ae5e-f2ef5a52f71b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4441a7a-208a-4bd0-a4b1-665b37ed9d9b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged['variation'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e075867f-4755-48c6-9afa-18599d594af0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged['gendr'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58765716-483c-4f41-aa86-edfb93091be6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only rows with gender M or F. The explicit copy makes the result an independent frame,\n",
+    "# so adding columns to it later does not operate on a slice of the unfiltered data.\n",
+    "dfmerged = dfmerged[dfmerged['gendr'].isin(['M', 'F'])].copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d818395b-d84c-4109-9589-15875b2cbe28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged['gendr'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2291a32f-f560-445c-b585-e8c80521464b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d62f6e02-d72b-48b6-b084-c0a839b69b3a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged[\"error_flag\"] = (dfmerged[\"process_step\"] != \"confirm\").astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bcf8b1e9-8466-48be-89f4-05687a2b775e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfmerged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d3590ca8-47a7-49ba-b716-34e87d6d546c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#dfmerged.to_csv('Project2FIX.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f92a491-010b-48f0-959f-90d18e8b00bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build the chart from mean_step_time (computed above) instead of numbers typed in by hand,\n",
+    "# so it follows the data and uses the same step names as process_step ('step_1', not 'step1').\n",
+    "pivot = mean_step_time.pivot(index='process_step', columns='variation', values='step_durationsec')\n",
+    "# Funnel order, taken from the keys of step_order: start, step_1, step_2, step_3, confirm.\n",
+    "pivot = pivot.reindex(list(step_order))\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(10, 6))\n",
+    "x = np.arange(len(pivot))\n",
+    "width = 0.35\n",
+    "ax.barh(x - width/2, pivot['Control'], width, label='Control', color='#2C6693', alpha=0.8)\n",
+    "ax.barh(x + width/2, pivot['Test'], width, label='Test', color='#E5B544', alpha=0.8)\n",
+    "ax.set_yticks(x)\n",
+    "ax.set_yticklabels(pivot.index)\n",
+    "ax.set_xlabel('Mean Time (seconds)')\n",
+    "ax.set_title('Average Time per Step: Control vs Test')\n",
+    "ax.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dd5ad294-a1e7-42b6-80b1-c83b941d2578",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\"><b>Test is faster at the start, but it is slower on the remaining steps, especially at the end.</b></span>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ce132b3-3f48-444b-a050-1d21c8a78955",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "cores = {'Control': '#2C6693', 'Test': '#E5B544'}\n",
+    "sns.lmplot(data=visits_with_age, x='clnt_age', y='has_error', hue='variation',\n",
+    "           palette=cores, line_kws={'lw':2}, scatter_kws={'alpha':0.5}, ci=95)\n",
+    "plt.xlabel('Age'); plt.ylabel('Error Flag')\n",
+    "plt.title('Age-Error Correlation: Control vs Test')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "537cc5c5-2aab-4d70-8824-bb7a155a8dce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.set(style='whitegrid')\n",
+    "fig, ax = plt.subplots(figsize=(6, 5))\n",
+    "# Passing `palette` without `hue` is deprecated in seaborn; map hue to the x variable and\n",
+    "# hide the redundant legend.\n",
+    "sns.barplot(data=err_summary, x='variation', y='n_error', hue='variation',\n",
+    "            palette=['#2C6693', '#E5B544'], legend=False, ax=ax)\n",
+    "ax.set_title('Total of Errors: Control vs Test')\n",
+    "ax.set_xlabel('Variation')\n",
+    "ax.set_ylabel('Total of Errors')\n",
+    "# Tick labels come from the 'variation' values themselves, so they cannot be mislabelled.\n",
+    "# va='bottom' puts each count just above its bar without a fixed offset.\n",
+    "for pos, n_err in enumerate(err_summary['n_error']):\n",
+    "    ax.text(pos, n_err, f'{int(n_err):,}', ha='center', va='bottom')\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1e21b230-af46-4cc2-b521-0bd81db64d06",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:30px;\"><b>CONCLUSIONS</b></span>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a7e55d5-90fd-4ad3-971e-a7c3762400cf",
+   "metadata": {},
+   "source": [
+    "<span style=\"font-size:18px;\">Test variation consistently outperforms Control across all critical metrics, delivering substantial revenue growth, operational efficiencies and enhanced user resilience. Rigorous statistical analysis eliminates any doubt regarding implementation.\n",
+    "\n",
+    "Completion Advantage: Test has substantially higher process completion rates than Control. This translates to massive trust and revenue uplift through elevated client balances across all tenure segments.\n",
+    "\n",
+    "Operational Excellence: Test meaningfully reduces call-center dependency, confirming reliable cost savings.\n",
+    "\n",
+    "Demographic Resilience: Test significantly mitigates age-related error patterns compared to Control. Both versions show age influences error distribution, but Test manages this relationship far more equitably across age groups.\n",
+    "\n",
+    "Strategic Imperative: Revenue acceleration + cost reduction + universal UX improvements = compelling business case.</span>\n",
+    "\n",
+    "<span style=\"font-size:24px;\"><b>DEPLOY TEST VERSION IMMEDIATELY ACROSS ALL CLIENTS.</b></span>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2ff7b913-6d52-4511-94fe-482d913675cb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.14.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}