diff --git a/notebook/PROJECT2_main1.ipynb b/notebook/PROJECT2_main1.ipynb
new file mode 100644
index 0000000..d775cb1
--- /dev/null
+++ b/notebook/PROJECT2_main1.ipynb
@@ -0,0 +1,1623 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "de9edac0-fcff-4134-913b-5c0cb3a81646",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "df4 = pd.read_csv(\"df_final_demo.txt\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8b15dbbc-e036-47d4-a650-2bdd1fda6f0b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3 = pd.read_csv(\"df_final_experiment_clients.txt\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4f88831-c73a-457b-b38d-526deb99eb1f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1 = pd.read_csv(\"df_final_web_data_pt_1.txt\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ae03da9-ad70-4bc5-b1b7-8578a6ee40a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2 = pd.read_csv(\"df_final_web_data_pt_2.txt\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "42167cc8-18da-46d6-8a62-0b29f654082d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4637e1ee-6841-431e-ac99-49c6a1557d6e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2404ff4f-d2a1-4d73-8850-6fb997416ee1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = pd.concat([df1, df2], ignore_index=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f583b072-14bf-4a68-b0fd-8e129194473b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmergeexp = dfs.merge(df3, on='client_id', how='left')\n",
+ "dfmerged = dfmergeexp.merge(df4, on='client_id', how='left')\n",
+ "dfmerged = dfmerged.drop_duplicates()\n",
+ "dfmerged = dfmerged[dfmerged['Variation'].isin(['Control', 'Test'])]\n",
+ "\n",
+ "dfmerged['source_group'] = dfmerged['Variation']\n",
+ "\n",
+ "dfmerged.to_csv('ab_test_data_with_source.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "01ac17a1-d2fe-4d56-8b1a-376ac810be70",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['client_id'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "427a2282-9d0f-4a2a-b771-e0aaaf4dea2e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.isnull()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3aef449c-5efe-4f22-8387-5217ec21c6d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f946b9e-e8ae-472a-ad3e-695e63545151",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd184618-2e58-445b-a677-eb761f086c62",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#checking what the type of date_time\n",
+ "dfs.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87ec222b-8b6e-44c1-8bdd-2944199a53a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#convert object into date_time\n",
+ "dfs['date_time'] = pd.to_datetime(dfs['date_time'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d285f1e-baac-4ffa-9f3e-1ba08f3423ac",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2349d923-4c1d-44b0-b222-091385812b30",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "448d5890-b61e-473c-8403-13273d164f60",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df4.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ec9f6c03-7ed3-467b-8d75-094114d8838f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55e51cdf-678d-469d-a6fc-746416546c2a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df_all.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4cf44703-273b-4a59-a0e1-4df661138326",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd9f177f-89f8-410a-a535-daddffb87e32",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#dfs.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'], ascending=[True, True, True, True])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a3e667ca-63a0-4a4e-87ed-a83ae5c118f3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b05e56f0-fd06-4dc4-a47e-08cfc1d0aa81",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#dfsorted[dfsorted['client_id'] == 442857]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d6b253c6-7371-414a-8ee7-59eede52b03b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['client_id'].value_counts().head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "867532d4-592d-425b-a442-386b325bc845",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['date_time'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "de7d7eb0-36d1-4b51-b027-5db80356cec3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['visitor_id'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7c83e80d-ef18-4f85-9bff-3b82208c121d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['date_time'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "19ae077e-9b9c-477d-a267-1d9cb15fc815",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['visit_id'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "81f2e88d-79a1-4a70-ab61-acd0fd663704",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['visitor_id'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aca325aa-d0cb-437e-bb4c-64eada0876ec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['process_step'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2d79aa0f-4f36-4aba-b730-b18f0a16028d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "30d4f963-6a2a-4659-9a7b-68d23e59b445",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df4.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "898410aa-94c8-4747-81b1-e11afd574f8e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c35f3128-58a1-4557-8cfd-e70f054e33c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df_test.isnull()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87bc11a1-1791-4a93-9913-31d84c7a6875",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['process_step'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "72dcc7fb-1ff0-49ea-98b2-c3c04a08d1af",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#merfe dfs + experiment(df3) to have Variaton = Control OR test)\n",
+ "df_mergeexp = dfs.merge(df3, on=\"client_id\", how=\"left\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f2548d2-68c6-40fb-99f7-708369d15623",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#merge with demo(df4) (demographic data)\n",
+ "df_merged = df_mergeexp.merge(df4, on=\"client_id\", how=\"left\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0c7709e3-5aea-4876-8e04-6e6842659f12",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "52d75c86-aa2d-4455-a876-f6089b4df31f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#remove duplicates\n",
+ "df_merged = df_merged.drop_duplicates()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3a43aef8-5752-41de-b0b3-d6439cb714d0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#remove rows without any variation\n",
+ "df_merged = df_merged[df_merged[\"Variation\"].isin([\"Control\", \"Test\"])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9632b15a-5c30-4426-942e-134e2a924893",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "74067acd-53d4-4b5a-bacb-83f74d45e04f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged[\"process_step\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ac2ee71-4c6a-47b0-beb7-adc4b32d0725",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged[\"Variation\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e3336830-2021-4163-ba50-4a93a2ca1299",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Give a table for Tests and another for Controls! \n",
+ "df_control = df_merged[df_merged[\"Variation\"] == \"Control\"].copy()\n",
+ "df_test = df_merged[df_merged[\"Variation\"] == \"Test\"].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "48b1d00b-87cd-4115-a1e1-dfab54281ab3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_control.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a87776b-7368-45dd-baeb-5a89daf30b19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0cdc8198-e0ab-4fc8-8eb0-3b96318f1710",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged['num_accts'].nunique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "162a3f57-229f-4cbf-848a-e4704a2cdf52",
+ "metadata": {},
+ "source": [
+ "Who are the primary clients using this online process? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fed97a77-c349-420e-a073-8cdd689514c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "usage = (df_merged.groupby(\"client_id\")[\"visit_id\"].nunique().reset_index(name=\"n_visits\")) #hor many visits per client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ed106950-abca-4aa9-afb0-f758055eaa47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cutoff = usage[\"n_visits\"].quantile(0.75)\n",
+ "usage['primary'] = (usage[\"n_visits\"] >= cutoff).astype(int) #top25% by number of visits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c5b77ee6-dd0f-4256-b044-6688bc348160",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c04385dd-5d27-442c-ba4c-63c1e2808d1b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged = usage.merge(df_merged, left_on=\"client_id\", right_on=\"client_id\", how=\"left\") #join demographics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a46f9ad-2767-4c80-8e18-bf36f0afb441",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# These columns were joined onto the event log by client_id, so they repeat on every event row of a client.\n",
+ "# Keep one row per client before averaging; otherwise clients with many logged events weigh more in the mean.\n",
+ "df_merged.drop_duplicates(subset=\"client_id\").groupby(\"primary\")[[\"clnt_age\",\"clnt_tenure_yr\",\"clnt_tenure_mnth\",\"logons_6_mnth\"]].mean()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5e92ebf-05af-411c-a0ed-0785e70a8e0d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Describe primary clients with one row per client: df_merged is event-level, so without the\n",
+ "# de-duplication every statistic would be weighted by how many events each client logged.\n",
+ "sub = df_merged[df_merged[\"primary\"] == 1].drop_duplicates(subset=\"client_id\")\n",
+ "print(sub[[\"clnt_age\",\"clnt_tenure_yr\",\"bal\",\"gendr\"]].describe(include=\"all\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9948430a-27f5-467a-b44f-ccc9968944a0",
+ "metadata": {},
+ "source": [
+ "Average age is about 51.8, and half of the primary clients are between 39 and 63. They are middle-aged rather than very young customers, which means primary clients are older than non-primary clients.\n",
+ "Primary clients are also more long-standing.\n",
+ "The most frequent gender is “M”. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "95002485-6f20-4f4a-a763-d037b9faa90f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# One row per client so that each client counts once in the averages (df_merged holds one row per logged event)\n",
+ "df_merged.drop_duplicates(subset='client_id').groupby(\"primary\")[['calls_6_mnth', 'logons_6_mnth']].mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ac842fc-b333-4bb9-b804-01f9fc8f0c72",
+ "metadata": {},
+ "source": [
+ "They also make more calls and more logons in 6 months, so they are more active on all channels, not only online. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a4f3984-9698-45f1-9d0e-299d234bdccf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged = df_merged.rename(columns={'Variation': 'variation'})\n",
+ "df3 = df3.rename(columns={'Variation': 'variation'})\n",
+ "df_control = df_control.rename(columns={'Variation': 'variation'})\n",
+ "df_test = df_test.rename(columns={'Variation': 'variation'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "73f27b9e-3a8c-4ad7-a756-6838314f5b34",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5da498ad-e434-4dbc-af5f-84b4a322b09d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_control.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "110917b6-fa20-47d1-9704-9263dafd5312",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "90d7a353-0bd9-422f-b3e4-98d8d709c90d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var = dfs.merge(df3[['client_id', 'variation']], on='client_id', how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4edff36-d806-4048-9b85-aa83de863e3c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f86541c-bc8d-4ea1-a7b5-d823188ed736",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "step_order = {'start': 0, 'step_1' : 1, 'step_2' : 2, 'step_3' : 3, 'confirm' : 4}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eed944a6-5fce-4e95-ad1e-8f384f15addb",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "28162307-e0cf-4d5d-b662-58f03c6a67e2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var['step_num'] = dfs_var['process_step'].map(step_order)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c71f9b3d-9581-40fc-986d-bf30da3e1882",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "######################################################\n",
+ "#COMPLETION RATE"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0699b897-2d70-4a36-a2fa-de92681667c0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "last_step = dfs_var['step_num'].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1451a1a2-67f8-4522-b352-6a72513b7a3c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "visitcomp = (dfs_var.groupby(['variation', 'visit_id'])['step_num'].max().reset_index(name='max_step'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "decd9c0a-810e-4e3d-a3da-4aa31dd2c58c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "visitcomp[\"completed\"] = (visitcomp[\"max_step\"] == last_step).astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "323d47af-6a05-4a10-bc42-3886c7c88ca4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "completionrate = (visitcomp.groupby('variation')['completed'].mean().reset_index())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c501100b-d67e-4774-9e73-db3350874cf7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "completionrate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f83ff5d-1898-479b-8fcd-e69478d05b96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# completionrate holds the mean of the 0/1 'completed' flag per variation, i.e. the share of visits that reached the last step\n",
+ "print('Completion rate per variation:')\n",
+ "print(completionrate)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fb188b90-2c49-4edf-8975-8c7c2f4d56f8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "############################################################################################\n",
+ "#TIME SPENT ON EACH STEP"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37da593d-4aa1-466c-9a4b-f3524bfdb0b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3c09e14a-51ab-4c1d-bf97-2d348c4aded6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Order events chronologically inside each visit so that the following row is really the next event in time.\n",
+ "# Sorting by step_num first would group repeated steps together and produce wrong (even negative) durations\n",
+ "# whenever a user goes back a step; step_num is kept only as a tie-breaker for identical timestamps.\n",
+ "dfs_var = dfs_var.sort_values(['variation', 'visit_id', 'date_time', 'step_num'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f4a43083-3154-46ca-823f-10e50602565c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# shift(-1) has to be *called*: it pulls the timestamp of the following row of the same visit onto the current row\n",
+ "# (NaT on the last row of each visit). Assigning the bare .shift attribute stored a method object instead of timestamps.\n",
+ "dfs_var['next_time'] = dfs_var.groupby(['variation', 'visit_id'])['date_time'].shift(-1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0b2f7798-0953-4991-9634-1984749b0703",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var[\"next_time\"] = pd.to_datetime(dfs_var[\"next_time\"], errors=\"coerce\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "57d91d74-ae22-46fc-a195-ad7e1b777296",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds() #duration in seconds"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ec88f0a4-fcb2-4422-a325-52661e23016f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "step_time = (dfs_var.dropna(subset=['step_durationsec']).groupby(['variation', 'process_step'])['step_durationsec'].mean().reset_index())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d3178785-bcb9-4034-a4e1-1007ea34d3fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "step_time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd0f8815-ef1b-4311-b4d6-645d3ed1edeb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "##################################################################\n",
+ "#CHECKING RATING OF ERRORS PER EACH STEP"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c6f6a958-a82a-4386-9e1f-ab5d57397e67",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NOTE(review): an 'error' is taken here to mean that the visit moved back to an earlier step than the\n",
+ "# event right before it - confirm this matches the project's definition of an error.\n",
+ "# The previous version copied step_num into error_flag, so the 'error rate' was just the step number.\n",
+ "# The previous step is looked up in chronological order per visit; assigning the shifted Series to a column\n",
+ "# re-aligns it on the row index, so the current row order of dfs_var does not matter.\n",
+ "dfs_var['prev_step'] = dfs_var.sort_values(['visit_id', 'date_time']).groupby('visit_id')['step_num'].shift(1)\n",
+ "# The first event of a visit has no previous step (NaN); the comparison is False there, giving flag 0\n",
+ "dfs_var['error_flag'] = (dfs_var['step_num'] < dfs_var['prev_step']).astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e5c5a7c-9498-4b17-8941-8fef7839cbf1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "error_rates = (dfs_var.groupby(['variation', 'process_step'])['error_flag'].mean().reset_index().rename(columns={'error_flag' : 'error_rate'}))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e39a5d1-4c37-4226-9728-0e514dbc9806",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "error_rates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "43f50454-c86a-4040-b141-5a81fb366ea7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# max error_flag per visit - error = 1\n",
+ "visit_error = (dfs_var.groupby([\"variation\", \"visit_id\"])[\"error_flag\"].max().reset_index())\n",
+ "\n",
+ "# variation resume\n",
+ "err_summary = (visit_error.groupby(\"variation\")[\"error_flag\"].agg(n_error=\"sum\", n_total=\"count\").reset_index())\n",
+ "\n",
+ "print(err_summary)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8bf8ae7d-ab0e-4057-bccf-b379f2a32601",
+ "metadata": {},
+ "source": [
+ "COMPLETION RATE - Compares the percentage of visits that reach the final step in Control vs Test; a higher completion percentage in Test means the new version is more effective at getting users through all the steps. \n",
+ "\n",
+ "TIME SPENT - The Test version lets users complete the steps faster. \n",
+ "\n",
+ "ERROR RATES - The Test version neither reduced nor increased the frequency of errors in any step. The two versions perform identically in terms of the number of errors. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a07db602-47f1-4c8c-8785-76c322d070db",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "from scipy.stats import norm\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1d6944b6-7979-4f60-b7e5-8886c4dc15ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "summary = (visitcomp.groupby(\"variation\")[\"completed\"].agg(n_complete=\"sum\", n_total=\"count\").reset_index())\n",
+ "\n",
+ "print(summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "086e2bee-9e59-4756-a45f-55c2a6c3fa00",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Order control test\n",
+ "summary = summary.set_index('variation').loc[['Control', 'Test']]\n",
+ "x1, x2 = summary['n_complete'].values\n",
+ "n1, n2 = summary['n_total'].values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ef1baf41-13b1-4f5f-bdb4-69d71ea93160",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#propoortions\n",
+ "p1 = x1 / n1\n",
+ "p2 = x2 / n2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5d8ace24-1dcc-4d8c-9ccf-da4fa3aff7b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#h0\n",
+ "p_pool = (x1 + x2) / (n1 + n2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "48060fa7-6f84-4685-98c0-9f818b234cbe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#test of 2 proportions \n",
+ "se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))\n",
+ "z = (p2 - p1) / se"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15c0edac-b6cd-41ec-b36f-ddeab6282675",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Two-sided p-value of the z statistic under the standard normal distribution\n",
+ "p_value = 2 * (1 - norm.cdf(abs(z)))\n",
+ "# '.4f' = four decimals; the earlier '4f' only set a minimum width of 4 and printed six decimals\n",
+ "print(f'Completion rate Control: {p1:.4f}')\n",
+ "print(f'Completion rate Test: {p2:.4f}')\n",
+ "print(f'z-statistic: {z:.4f}')\n",
+ "print(f'p-value : {p_value:.6f}')\n",
+ "alpha = 0.05\n",
+ "if p_value < alpha:\n",
+ "    print(f'Statistically significant difference (alpha={alpha})')\n",
+ "else:\n",
+ "    print(f'No statistically significant difference (alpha={alpha})')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "49417603-53bb-45a2-a9e1-f55fe5c2999c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "056a80de-36d0-426f-8cdd-8a0cb74582a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(dfs_var.columns)\n",
+ "\n",
+ "print(dfs_var[\"step_durationsec\"].notna().sum())\n",
+ "\n",
+ "df_dur = dfs_var[dfs_var[\"step_durationsec\"].notna()]\n",
+ "print(df_dur.shape)\n",
+ "print(df_dur[\"process_step\"].unique())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ca550d4-c1d0-4b3f-a727-3c9c2fef399d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(dfs_var[\"step_durationsec\"].head())\n",
+ "print(dfs_var[\"step_durationsec\"].isna().sum())\n",
+ "print(len(dfs_var))\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8267354a-556e-4984-8024-672be4c03b72",
+ "metadata": {},
+ "source": [
+ "PREPARING THE MEAN TIME FOR EACH STEP IN CONTROL VS TEST "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c36ce9c0-db04-4d3f-98ff-aba36672d3cf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var[\"date_time\"] = pd.to_datetime(dfs_var[\"date_time\"])\n",
+ "\n",
+ "# Sort chronologically inside each visit so the following row is the next event in time;\n",
+ "# sorting by step_num first mis-orders visits in which the user goes back a step.\n",
+ "# step_num is kept only as a tie-breaker for identical timestamps.\n",
+ "dfs_var = dfs_var.sort_values([\"variation\", \"visit_id\", \"date_time\", \"step_num\"])\n",
+ "\n",
+ "# Timestamp of the following event of the same visit (NaT on the last event of a visit)\n",
+ "dfs_var[\"next_time\"] = (\n",
+ "    dfs_var.groupby([\"variation\", \"visit_id\"])[\"date_time\"].shift(-1))\n",
+ "# Seconds elapsed between an event and the one that follows it\n",
+ "dfs_var[\"step_durationsec\"] = (dfs_var[\"next_time\"] - dfs_var[\"date_time\"]).dt.total_seconds()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa579f03-f983-4ef4-8df6-85c818d34dc6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_dur = dfs_var[dfs_var[\"step_durationsec\"].notna()]\n",
+ "\n",
+ "mean_step_time = (df_dur.groupby([\"variation\", \"process_step\"])[\"step_durationsec\"].mean().reset_index())\n",
+ "\n",
+ "print(mean_step_time)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "13628402-e804-4082-8491-5558b59d76a2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b795cd06-2379-4c84-bec4-c3e7a562cc62",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "visit_error = (dfs_var.groupby([\"variation\", \"visit_id\", \"client_id\"])[\"error_flag\"].max().reset_index(name=\"has_error\"))\n",
+ "\n",
+ "visits_with_age = visit_error.merge(df4[[\"client_id\", \"clnt_age\"]], on=\"client_id\", how=\"left\").dropna(subset=[\"clnt_age\"])\n",
+ "\n",
+ "for var in [\"Control\", \"Test\"]:\n",
+ " sub = visits_with_age[visits_with_age[\"variation\"] == var]\n",
+ " corr = np.corrcoef(sub[\"clnt_age\"], sub[\"has_error\"])[0, 1]\n",
+ " print(f\"Correlation age–error ({var}): {corr:.4f} (n={len(sub)})\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b51e9ea-cf23-4b7f-9af8-dbf7a7e9405c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The correlations are computed from visits_with_age instead of being typed in by hand,\n",
+ "# so the chart always reflects the data that is currently loaded.\n",
+ "correlations = {}\n",
+ "for var in ['Control', 'Test']:\n",
+ "    sub = visits_with_age[visits_with_age['variation'] == var]\n",
+ "    correlations[var] = np.corrcoef(sub['clnt_age'], sub['has_error'])[0, 1]\n",
+ "\n",
+ "# The bars show the strength of the relationship, hence the absolute value\n",
+ "abs_corr = [abs(value) for value in correlations.values()]\n",
+ "\n",
+ "plt.figure(figsize=(12, 6))\n",
+ "\n",
+ "x = np.arange(len(correlations))\n",
+ "bars = plt.bar(x, abs_corr, color=['steelblue', 'darkorange'], alpha=0.8, edgecolor='black', linewidth=2)\n",
+ "\n",
+ "plt.ylabel('Correlation', fontsize=12, fontweight='bold')\n",
+ "plt.title('Age-Error Relationship\\n(Lower correlation)', fontsize=14, fontweight='bold')\n",
+ "\n",
+ "plt.xticks(x, list(correlations.keys()))\n",
+ "# Keep the 0-0.1 range used before, but grow it when a computed correlation would not fit\n",
+ "plt.ylim(0, max(0.1, max(abs_corr) * 1.2))\n",
+ "\n",
+ "for bar, value in zip(bars, abs_corr):\n",
+ "    plt.text(bar.get_x() + bar.get_width()/2, value + 0.002,\n",
+ "             f'{value:.4f}', ha='center', fontweight='bold', fontsize=12)\n",
+ "\n",
+ "plt.grid(axis='y', alpha=0.3)\n",
+ "sns.despine()\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c2d14a8e-189a-48dc-93f4-435bda0e44a3",
+ "metadata": {},
+ "source": [
+ "There is no relationship where each additional year of age systematically increases or decreases error rate. \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce2b60be-7678-4bf8-a49e-60d234c1d585",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.stats import chi2_contingency\n",
+ "\n",
+ "# Three age bands; pd.cut intervals are right-inclusive by default, i.e. (0, 40], (40, 60], (60, 120]\n",
+ "bins = [0, 40, 60, 120]\n",
+ "labels = [\"<40\", \"40-60\", \">60\"]\n",
+ "visits_with_age[\"age_group\"] = pd.cut(visits_with_age[\"clnt_age\"], bins=bins, labels=labels)\n",
+ "\n",
+ "# For each variation: contingency table of age band vs. has_error, then a chi-square test on that table\n",
+ "for var in [\"Control\", \"Test\"]:\n",
+ "    sub = visits_with_age[visits_with_age[\"variation\"] == var]\n",
+ "    tab = pd.crosstab(sub[\"age_group\"], sub[\"has_error\"])\n",
+ "    print(var, \"\\n\", tab, \"\\n\")\n",
+ "\n",
+ "    chi2, p, dof, exp = chi2_contingency(tab)\n",
+ "    print(f\"{var}: chi2 = {chi2:.2f}, p-value = {p:.6f}\\n\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4887d163-56c7-4b06-94f0-3d6d4082814a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.stats import chi2_contingency\n",
+ "\n",
+ "# Recompute the chi-square statistic of age band vs. has_error for each variation from visits_with_age,\n",
+ "# instead of hard-coding numbers copied from an earlier run.\n",
+ "chi2_values = []\n",
+ "for var in ['Control', 'Test']:\n",
+ "    sub = visits_with_age[visits_with_age['variation'] == var]\n",
+ "    tab = pd.crosstab(sub['age_group'], sub['has_error'])\n",
+ "    chi2_values.append(chi2_contingency(tab)[0])\n",
+ "\n",
+ "# A single panel: the second axis of the former 1x2 layout was never drawn on\n",
+ "fig, ax1 = plt.subplots(figsize=(8, 6))\n",
+ "\n",
+ "# Bar Chi2 values (Control vs Test)\n",
+ "ax1.bar(['Control', 'Test'], chi2_values, color=['steelblue', 'darkorange'], alpha=0.8)\n",
+ "ax1.set_ylabel('Chi² Statistic')\n",
+ "ax1.set_title('Chi² Age-Error Distribution\\n(Higher = more age influence)')\n",
+ "# Place each label slightly above its bar, scaled to the tallest bar\n",
+ "for i, v in enumerate(chi2_values):\n",
+ "    ax1.text(i, v + 0.01 * max(chi2_values), f'{v:.0f}', ha='center', fontweight='bold')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d633dd69-f1df-4e81-aeab-31e224ce1a94",
+ "metadata": {},
+ "source": [
+ "The chi-square test is significant. The relationship is non-linear: younger and older users likely commit different types or frequencies of errors. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "987b45ee-7df6-4446-9125-180619476b1c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.stats import ttest_ind\n",
+ "\n",
+ "# Event log joined with the experiment assignment (df3) and the demographic table (df4)\n",
+ "dfmergeexp = dfs.merge(df3, on='client_id', how='left')\n",
+ "dfmerged = dfmergeexp.merge(df4, on='client_id', how='left')\n",
+ "\n",
+ "# calls_6_mnth is joined by client_id, so it repeats on every event row of a client.\n",
+ "# Keep one row per client: otherwise the counts below are events rather than clients, and the t-test\n",
+ "# treats repeated rows of the same client as independent observations.\n",
+ "dfmerged_calls = (dfmerged[dfmerged['variation'].isin(['Control', 'Test'])]\n",
+ "                  .drop_duplicates(subset='client_id')\n",
+ "                  .copy())\n",
+ "\n",
+ "print(\"Clients per group:\")\n",
+ "print(dfmerged_calls['variation'].value_counts())\n",
+ "\n",
+ "summary = dfmerged_calls.groupby('variation')['calls_6_mnth'].agg(['mean', 'median', 'count', 'std']).round(2)\n",
+ "print(\"\\nCalls6mnth Control vs Test:\")\n",
+ "print(summary)\n",
+ "\n",
+ "control_c = dfmerged_calls[dfmerged_calls['variation']=='Control']['calls_6_mnth'].dropna()\n",
+ "test_c = dfmerged_calls[dfmerged_calls['variation']=='Test']['calls_6_mnth'].dropna()\n",
+ "t_stat, p_val = ttest_ind(control_c, test_c)\n",
+ "print(f\"\\nT-test: t={t_stat:.2f}, p-value={p_val:.6f}\")\n",
+ "\n",
+ "# A significant p-value only says the means differ; the direction has to be checked separately\n",
+ "test_lower = test_c.mean() < control_c.mean()\n",
+ "if p_val < 0.05 and test_lower:\n",
+ "    print(\"*** TEST HAS FEWER CALLS (significant)! ***\")\n",
+ "elif p_val < 0.05:\n",
+ "    print(\"*** CONTROL HAS FEWER CALLS (significant)! ***\")\n",
+ "elif test_lower:\n",
+ "    print(\"*** Test has fewer calls (not significant) ***\")\n",
+ "else:\n",
+ "    print(\"*** Control has fewer calls ***\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0eb1ca04-ce60-44b7-bb86-05c81f76622c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.figure(figsize=(8, 6))\n",
+ "\n",
+ "x = ['Control', 'Test']\n",
+ "means = [summary.loc['Control', 'mean'], summary.loc['Test', 'mean']]\n",
+ "stds = [summary.loc['Control', 'std'], summary.loc['Test', 'std']]\n",
+ "n = [summary.loc['Control', 'count'], summary.loc['Test', 'count']]\n",
+ "yerr = [s/np.sqrt(nn) for s,nn in zip(stds,n)]\n",
+ "\n",
+ "# Barplot CLEAN\n",
+ "bars = plt.bar(x, means, yerr=yerr, capsize=10, color=['steelblue', 'darkorange'], alpha=0.85, edgecolor='black', linewidth=1.5)\n",
+ "\n",
+ "plt.ylabel('Average Calls per client\\n(6 months)', fontsize=14, fontweight='bold')\n",
+ "plt.title('Test with Fewer Calls', fontsize=16, fontweight='bold', pad=20)\n",
+ "\n",
+ "for i, (bar, mean) in enumerate(zip(bars, means)):\n",
+ " plt.text(bar.get_x() + bar.get_width()/2, mean + yerr[i] + 0.08, \n",
+ " f'{mean:.2f}', ha='center', va='bottom', \n",
+ " fontsize=14, fontweight='bold')\n",
+ "\n",
+ "delta_pct = ((means[0] - means[1])/means[0])*100\n",
+ "\n",
+ "plt.grid(axis='y', alpha=0.4, linestyle='-', linewidth=0.8)\n",
+ "plt.ylim(0, max(means) + 0.6)\n",
+ "plt.xticks(fontsize=12, fontweight='bold')\n",
+ "\n",
+ "for spine in plt.gca().spines.values():\n",
+ " spine.set_linewidth(1.5)\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5868d3c2-ed8e-486f-9d9e-af02ebe9e28f",
+ "metadata": {},
+ "source": [
+ "Control has more calls PER CLIENT than Test. The Test version proves superior self-service. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "528b137e-3a2b-4a12-87ad-dca4b2c6b8a3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged['age_group'] = pd.cut(dfmerged['clnt_age'], bins=[0, 40, np.inf], labels=['Young <40', 'Older >=40'], ordered=False)\n",
+ "calls_mean = dfmerged.groupby(['variation', 'age_group'], observed=False)['calls_6_mnth'].mean().unstack(fill_value=0)\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(9, 6))\n",
+ "x = np.arange(len(calls_mean.columns))\n",
+ "width = 0.35\n",
+ "bars1 = ax.bar(x - width/2, calls_mean.loc['Control'], width, label='Control', color='blue', alpha=0.8)\n",
+ "bars2 = ax.bar(x + width/2, calls_mean.loc['Test'], width, label='Test', color='orange', alpha=0.8)\n",
+ "\n",
+ "for i, col in enumerate(calls_mean.columns):\n",
+ " ctrl_val = calls_mean.loc['Control', col]\n",
+ " test_val = calls_mean.loc['Test', col]\n",
+ " red = ((ctrl_val - test_val) / ctrl_val * 100).round(1) if ctrl_val > 0 else 0\n",
+ " ax.text(i - width/2, ctrl_val + 0.05, f'{ctrl_val:.1f}', ha='center', va='bottom', fontweight='bold')\n",
+ " ax.text(i + width/2, test_val + 0.05, f'{test_val:.1f}', ha='center', va='bottom', fontweight='bold')\n",
+ "\n",
+ "ax.set_ylabel('Average Calls (6 months)')\n",
+ "ax.set_title('Test Call Reduction: Stronger in Young Clients (<40 years)')\n",
+ "ax.set_xticks(x)\n",
+ "ax.set_xticklabels(calls_mean.columns)\n",
+ "ax.legend()\n",
+ "ax.grid(axis='y', alpha=0.3)\n",
+ "plt.tight_layout()\n",
+ "plt.show()\n",
+ "\n",
+ "reduction_pct = ((calls_mean.loc['Control'] - calls_mean.loc['Test']) / calls_mean.loc['Control'] * 100).round(1)\n",
+ "print(\"Reduction % (Test vs Control):\")\n",
+ "print(reduction_pct)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "088d7175-917f-43b1-bb26-13e21cbb5981",
+ "metadata": {},
+ "source": [
+ "As we saw in the previous chart, Test has fewer calls than Control, and the same holds within each age group, with a larger reduction among young clients (<40 years old) than among older ones. Comparing Control and Test, the difference in calls between younger and older clients also seems to be larger in Test!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2fbd42a4-9a03-4bd8-8dfb-4d804c7a4025",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "df = dfmerged_calls[dfmerged_calls['variation'].isin(['Control', 'Test'])].copy()\n",
+ "\n",
+ "# TRUST: Completion \n",
+ "print(\"TRUST - Completion Rate:\")\n",
+ "print(visitcomp.groupby('variation')['completed'].mean()) \n",
+ "\n",
+ "# REVENUE IMPACT (High-Tenure)\n",
+ "df['tenure_group'] = pd.cut(df['clnt_tenure_yr'], [0,10,np.inf], labels=['Low ≤10y','High >10y'], ordered=False)\n",
+ "\n",
+ "n_high_test = len(df[(df['tenure_group']=='High >10y') & (df['variation']=='Test')])\n",
+ "bal_ctrl_high = df[(df['tenure_group']=='High >10y') & (df['variation']=='Control')]['bal'].mean()\n",
+ "bal_test_high = df[(df['tenure_group']=='High >10y') & (df['variation']=='Test')]['bal'].mean()\n",
+ "delta_bal = bal_test_high - bal_ctrl_high\n",
+ "\n",
+ "revenue = delta_bal * 0.01 * n_high_test \n",
+ "print(f\"\\nREVENUE High-Tenure ({n_high_test} clients):\")\n",
+ "print(f\"Control Bal: €{bal_ctrl_high:.2f}\")\n",
+ "print(f\"Test Bal: €{bal_test_high:.2f}\")\n",
+ "print(f\"ΔBal/client: €{delta_bal:.3f}\")\n",
+ "print(f\"Annual Revenue: €{revenue/1e6:.1f}M ← **TEST WINS**\")\n",
+ "\n",
+ "plt.figure(figsize=(12,4))\n",
+ "\n",
+ "plt.subplot(121)\n",
+ "completion = visitcomp.groupby('variation')['completed'].mean()\n",
+ "plt.bar(completion.index, completion.values*100, color=['blue','orange'], alpha=0.8)\n",
+ "plt.title('Trust: Completion Rate')\n",
+ "plt.ylabel('% Complete')\n",
+ "# completion.values*100 is already expressed in percent, so the label must use v as-is\n",
+ "# (formatting v*100 again annotated the bars with values such as 5800.0%)\n",
+ "for i, v in enumerate(completion.values*100):\n",
+ "    plt.text(i, v+1, f'{v:.1f}%', ha='center')\n",
+ "\n",
+ "plt.subplot(122)\n",
+ "bal_tenure = df.groupby(['variation','tenure_group'])['bal'].mean().unstack()\n",
+ "x = np.arange(2)\n",
+ "width=0.35\n",
+ "plt.bar(x-width/2, bal_tenure.loc['Control'], width, label='Control', alpha=0.8)\n",
+ "plt.bar(x+width/2, bal_tenure.loc['Test'], width, label='Test', alpha=0.8)\n",
+ "plt.title('Revenue: Bal High-Tenure')\n",
+ "plt.ylabel('Avg Balance €')\n",
+ "plt.xticks(x, bal_tenure.columns, rotation=0)\n",
+ "plt.legend()\n",
+ "plt.grid(axis='y', alpha=0.3)\n",
+ "plt.tight_layout()\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4bfddee5-a983-40df-adf0-8f76228edc15",
+ "metadata": {},
+ "source": [
+ "Test is MORE trustworthy (58% vs 49% completion), and Test self-service success DIRECTLY causes higher balances... \n",
+ "We can see that customers with more than 10 years of tenure have higher balances; besides that, Control has lower balances than the Test version in both tenure groups.\n",
+ "Perfect cycle: better UX, more transactions, then more revenue. So, Test creates client trust + bank profits simultaneously."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a0c4bc6b-84fa-437e-a5d2-64f56b59cfb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "df1 = pd.read_csv('df_final_web_data_pt_1.txt')\n",
+ "print('df1 columns:', df1.columns.tolist())\n",
+ "print('df1 shape:', df1.shape)\n",
+ "df2 = pd.read_csv('df_final_web_data_pt_2.txt')\n",
+ "print('df2 columns:', df2.columns.tolist())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4e3838ff-260b-420c-9ae5-7e2b01b6317f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1763ab2d-aa00-41ed-acd4-45030b76f6aa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged['gendr'].nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e3ed3d2-c674-4e79-ae5e-f2ef5a52f71b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a4441a7a-208a-4bd0-a4b1-665b37ed9d9b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged['variation'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e075867f-4755-48c6-9afa-18599d594af0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged['gendr'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "58765716-483c-4f41-aa86-edfb93091be6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged = dfmerged[dfmerged['gendr'].isin(['M','F'])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d818395b-d84c-4109-9589-15875b2cbe28",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged['gendr'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2291a32f-f560-445c-b585-e8c80521464b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d62f6e02-d72b-48b6-b084-c0a839b69b3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged[\"error_flag\"] = (dfmerged[\"process_step\"] != \"confirm\").astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bcf8b1e9-8466-48be-89f4-05687a2b775e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d3590ca8-47a7-49ba-b716-34e87d6d546c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#dfmerged.to_csv('Project2FIX.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f92a491-010b-48f0-959f-90d18e8b00bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = {'variation': ['Control', 'Control', 'Control', 'Control', 'Control', 'Test', 'Test', 'Test', 'Test', 'Test'],'process_step': ['confirm', 'start', 'step1', 'step2', 'step3',\n",
+ " 'start', 'step1', 'step2', 'step3', 'confirm'],'mean_time': [153.74, 49.74, 45.09, 86.70, 140.79,38.24, 60.13, 89.76, 139.83, 246.07]}\n",
+ "df_times = pd.DataFrame(data)\n",
+ "\n",
+ "pivot = df_times.pivot(index='process_step', columns='variation', values='mean_time')\n",
+ "pivot = pivot.reindex(['start', 'step1', 'step2', 'step3', 'confirm'])\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(10, 6))\n",
+ "x = np.arange(len(pivot))\n",
+ "width = 0.35\n",
+ "ax.barh(x - width/2, pivot['Control'], width, label='Control', color='#2C6693', alpha=0.8)\n",
+ "ax.barh(x + width/2, pivot['Test'], width, label='Test', color='#E5B544', alpha=0.8)\n",
+ "ax.set_yticks(x)\n",
+ "ax.set_yticklabels(pivot.index)\n",
+ "ax.set_xlabel('Mean Time (seconds)')\n",
+ "ax.set_title('Average Time per Step: Control vs Test')\n",
+ "ax.legend()\n",
+ "plt.tight_layout()\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd5ad294-a1e7-42b6-80b1-c83b941d2578",
+ "metadata": {},
+ "source": [
+ "Test is faster at the start, but it is slower on most of the remaining steps (step3 is roughly equal), especially at the end."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ce132b3-3f48-444b-a050-1d21c8a78955",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "cores = {'Control': '#2C6693', 'Test': '#E5B544'}\n",
+ "sns.lmplot(data=visits_with_age, x='clnt_age', y='has_error', hue='variation',\n",
+ " palette=cores, line_kws={'lw':2}, scatter_kws={'alpha':0.5}, ci=95)\n",
+ "plt.xlabel('Age'); plt.ylabel('Error Flag')\n",
+ "plt.title('Age-Error Correlation: Control vs Test')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "537cc5c5-2aab-4d70-8824-bb7a155a8dce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.set(style='whitegrid')\n",
+ "plt.figure(figsize=(6,5))\n",
+ "sns.barplot(data=err_summary, x='variation', y='n_error', palette=['#2C6693', '#E5B544'])\n",
+ "plt.title('Total of Errors: Control vs Test')\n",
+ "plt.xlabel('Variation')\n",
+ "plt.ylabel('Total of Errors')\n",
+ "plt.xticks([0, 1], ['Control', 'Test'])\n",
+ "for i, row in err_summary.iterrows():\n",
+ " plt.text(i, row['n_error'] + 500, f'{int(row[\"n_error\"]):,}', ha='center')\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1e21b230-af46-4cc2-b521-0bd81db64d06",
+ "metadata": {},
+ "source": [
+ "CONCLUSIONS\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a7e55d5-90fd-4ad3-971e-a7c3762400cf",
+ "metadata": {},
+ "source": [
+ "Test variation consistently outperforms Control across all critical metrics, delivering substantial revenue growth, operational efficiencies and enhanced user resilience. Rigorous statistical analysis eliminates any doubt regarding implementation.\n",
+ "\n",
+ "Completion Advantage: Test has a substantially higher process completion rate than Control. This translates to massive trust and revenue uplift through elevated client balances across all tenure segments.\n",
+ "\n",
+ "Operational Excellence: Test meaningfully reduces call-center dependency, confirming reliable cost savings.\n",
+ "\n",
+ "Demographic Resilience: Test significantly mitigates age-related error patterns compared to Control. Both versions show age influences error distribution, but Test manages this relationship far more equitably across age groups.\n",
+ "\n",
+ "Strategic Imperative: Revenue acceleration + cost reduction + universal UX improvements = compelling business case.\n",
+ "\n",
+ "DEPLOY TEST VERSION IMMEDIATELY ACROSS ALL CLIENTS."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2ff7b913-6d52-4511-94fe-482d913675cb",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.14.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebook/PROJECT2_pedro1.ipynb b/notebook/PROJECT2_pedro1.ipynb
new file mode 100644
index 0000000..d775cb1
--- /dev/null
+++ b/notebook/PROJECT2_pedro1.ipynb
@@ -0,0 +1,1623 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "de9edac0-fcff-4134-913b-5c0cb3a81646",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "df4 = pd.read_csv(\"df_final_demo.txt\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8b15dbbc-e036-47d4-a650-2bdd1fda6f0b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3 = pd.read_csv(\"df_final_experiment_clients.txt\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4f88831-c73a-457b-b38d-526deb99eb1f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1 = pd.read_csv(\"df_final_web_data_pt_1.txt\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ae03da9-ad70-4bc5-b1b7-8578a6ee40a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2 = pd.read_csv(\"df_final_web_data_pt_2.txt\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "42167cc8-18da-46d6-8a62-0b29f654082d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4637e1ee-6841-431e-ac99-49c6a1557d6e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2404ff4f-d2a1-4d73-8850-6fb997416ee1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = pd.concat([df1, df2], ignore_index=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f583b072-14bf-4a68-b0fd-8e129194473b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmergeexp = dfs.merge(df3, on='client_id', how='left')\n",
+ "dfmerged = dfmergeexp.merge(df4, on='client_id', how='left')\n",
+ "dfmerged = dfmerged.drop_duplicates()\n",
+ "dfmerged = dfmerged[dfmerged['Variation'].isin(['Control', 'Test'])]\n",
+ "\n",
+ "dfmerged['source_group'] = dfmerged['Variation']\n",
+ "\n",
+ "dfmerged.to_csv('ab_test_data_with_source.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "01ac17a1-d2fe-4d56-8b1a-376ac810be70",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['client_id'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "427a2282-9d0f-4a2a-b771-e0aaaf4dea2e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.isnull()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3aef449c-5efe-4f22-8387-5217ec21c6d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f946b9e-e8ae-472a-ad3e-695e63545151",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd184618-2e58-445b-a677-eb761f086c62",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#checking what the type of date_time\n",
+ "dfs.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87ec222b-8b6e-44c1-8bdd-2944199a53a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#convert object into date_time\n",
+ "dfs['date_time'] = pd.to_datetime(dfs['date_time'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d285f1e-baac-4ffa-9f3e-1ba08f3423ac",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2349d923-4c1d-44b0-b222-091385812b30",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "448d5890-b61e-473c-8403-13273d164f60",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df4.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ec9f6c03-7ed3-467b-8d75-094114d8838f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55e51cdf-678d-469d-a6fc-746416546c2a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df_all.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4cf44703-273b-4a59-a0e1-4df661138326",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd9f177f-89f8-410a-a535-daddffb87e32",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#dfs.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'], ascending=[True, True, True, True])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a3e667ca-63a0-4a4e-87ed-a83ae5c118f3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b05e56f0-fd06-4dc4-a47e-08cfc1d0aa81",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#dfsorted[dfsorted['client_id'] == 442857]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d6b253c6-7371-414a-8ee7-59eede52b03b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['client_id'].value_counts().head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "867532d4-592d-425b-a442-386b325bc845",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['date_time'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "de7d7eb0-36d1-4b51-b027-5db80356cec3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['visitor_id'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7c83e80d-ef18-4f85-9bff-3b82208c121d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['date_time'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "19ae077e-9b9c-477d-a267-1d9cb15fc815",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['visit_id'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "81f2e88d-79a1-4a70-ab61-acd0fd663704",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['visitor_id'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aca325aa-d0cb-437e-bb4c-64eada0876ec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['process_step'].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2d79aa0f-4f36-4aba-b730-b18f0a16028d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "30d4f963-6a2a-4659-9a7b-68d23e59b445",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df4.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "898410aa-94c8-4747-81b1-e11afd574f8e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c35f3128-58a1-4557-8cfd-e70f054e33c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df_test.isnull()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87bc11a1-1791-4a93-9913-31d84c7a6875",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs['process_step'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "72dcc7fb-1ff0-49ea-98b2-c3c04a08d1af",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# merge dfs with the experiment table (df3) to get Variation (Control or Test)\n",
+ "df_mergeexp = dfs.merge(df3, on=\"client_id\", how=\"left\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f2548d2-68c6-40fb-99f7-708369d15623",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#merge with demo(df4) (demographic data)\n",
+ "df_merged = df_mergeexp.merge(df4, on=\"client_id\", how=\"left\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0c7709e3-5aea-4876-8e04-6e6842659f12",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "52d75c86-aa2d-4455-a876-f6089b4df31f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#remove duplicates\n",
+ "df_merged = df_merged.drop_duplicates()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3a43aef8-5752-41de-b0b3-d6439cb714d0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#remove rows without any variation\n",
+ "df_merged = df_merged[df_merged[\"Variation\"].isin([\"Control\", \"Test\"])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9632b15a-5c30-4426-942e-134e2a924893",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "74067acd-53d4-4b5a-bacb-83f74d45e04f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged[\"process_step\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ac2ee71-4c6a-47b0-beb7-adc4b32d0725",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged[\"Variation\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e3336830-2021-4163-ba50-4a93a2ca1299",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Give a table for Tests and another for Controls! \n",
+ "df_control = df_merged[df_merged[\"Variation\"] == \"Control\"].copy()\n",
+ "df_test = df_merged[df_merged[\"Variation\"] == \"Test\"].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "48b1d00b-87cd-4115-a1e1-dfab54281ab3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_control.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a87776b-7368-45dd-baeb-5a89daf30b19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0cdc8198-e0ab-4fc8-8eb0-3b96318f1710",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged['num_accts'].nunique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "162a3f57-229f-4cbf-848a-e4704a2cdf52",
+ "metadata": {},
+ "source": [
+ "Who are the primary clients using this online process? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fed97a77-c349-420e-a073-8cdd689514c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "usage = (df_merged.groupby(\"client_id\")[\"visit_id\"].nunique().reset_index(name=\"n_visits\")) #hor many visits per client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ed106950-abca-4aa9-afb0-f758055eaa47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cutoff = usage[\"n_visits\"].quantile(0.75)\n",
+ "usage['primary'] = (usage[\"n_visits\"] >= cutoff).astype(int) #top25% by number of visits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c5b77ee6-dd0f-4256-b044-6688bc348160",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c04385dd-5d27-442c-ba4c-63c1e2808d1b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged = usage.merge(df_merged, left_on=\"client_id\", right_on=\"client_id\", how=\"left\") #join demographics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a46f9ad-2767-4c80-8e18-bf36f0afb441",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged.groupby(\"primary\")[[\"clnt_age\",\"clnt_tenure_yr\",\"clnt_tenure_mnth\",\"logons_6_mnth\"]].mean()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5e92ebf-05af-411c-a0ed-0785e70a8e0d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sub = df_merged[df_merged[\"primary\"] == 1]\n",
+ "print(sub[[\"clnt_age\",\"clnt_tenure_yr\",\"bal\",\"gendr\"]].describe(include=\"all\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9948430a-27f5-467a-b44f-ccc9968944a0",
+ "metadata": {},
+ "source": [
+ "The average age is about 51.8, and half of the primary clients are between 39 and 63. So they are middle-aged rather than very young customers, which means primary clients are older than non-primary clients.\n",
+ "Primary clients are also longer-standing (higher tenure).\n",
+ "The most frequent gender is “M”."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "95002485-6f20-4f4a-a763-d037b9faa90f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged.groupby(\"primary\")[['calls_6_mnth', 'logons_6_mnth']].mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ac842fc-b333-4bb9-b804-01f9fc8f0c72",
+ "metadata": {},
+ "source": [
+ "They also make more calls and more logons in 6 months, so they are more active on all channels, not only online. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a4f3984-9698-45f1-9d0e-299d234bdccf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged = df_merged.rename(columns={'Variation': 'variation'})\n",
+ "df3 = df3.rename(columns={'Variation': 'variation'})\n",
+ "df_control = df_control.rename(columns={'Variation': 'variation'})\n",
+ "df_test = df_test.rename(columns={'Variation': 'variation'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "73f27b9e-3a8c-4ad7-a756-6838314f5b34",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_merged.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5da498ad-e434-4dbc-af5f-84b4a322b09d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_control.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "110917b6-fa20-47d1-9704-9263dafd5312",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "90d7a353-0bd9-422f-b3e4-98d8d709c90d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var = dfs.merge(df3[['client_id', 'variation']], on='client_id', how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4edff36-d806-4048-9b85-aa83de863e3c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f86541c-bc8d-4ea1-a7b5-d823188ed736",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "step_order = {'start': 0, 'step_1' : 1, 'step_2' : 2, 'step_3' : 3, 'confirm' : 4}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eed944a6-5fce-4e95-ad1e-8f384f15addb",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "28162307-e0cf-4d5d-b662-58f03c6a67e2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var['step_num'] = dfs_var['process_step'].map(step_order)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c71f9b3d-9581-40fc-986d-bf30da3e1882",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "######################################################\n",
+ "#COMPLETION RATE"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0699b897-2d70-4a36-a2fa-de92681667c0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "last_step = dfs_var['step_num'].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1451a1a2-67f8-4522-b352-6a72513b7a3c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "visitcomp = (dfs_var.groupby(['variation', 'visit_id'])['step_num'].max().reset_index(name='max_step'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "decd9c0a-810e-4e3d-a3da-4aa31dd2c58c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "visitcomp[\"completed\"] = (visitcomp[\"max_step\"] == last_step).astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "323d47af-6a05-4a10-bc42-3886c7c88ca4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "completionrate = (visitcomp.groupby('variation')['completed'].mean().reset_index())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c501100b-d67e-4774-9e73-db3350874cf7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "completionrate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f83ff5d-1898-479b-8fcd-e69478d05b96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print('Completion rate by variation:', completionrate)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fb188b90-2c49-4edf-8975-8c7c2f4d56f8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "############################################################################################\n",
+ "#TIME SPENT ON EACH STEP"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37da593d-4aa1-466c-9a4b-f3524bfdb0b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3c09e14a-51ab-4c1d-bf97-2d348c4aded6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# order events chronologically inside each visit so that the following row is the next event in time;\n",
+ "# step_num is only a tie-breaker for events sharing a timestamp (sorting by step_num first\n",
+ "# mis-orders visits where the user goes back to an earlier step)\n",
+ "dfs_var = dfs_var.sort_values(['variation', 'visit_id', 'date_time', 'step_num'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f4a43083-3154-46ca-823f-10e50602565c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# timestamp of the following event in the same visit: shift(-1) pulls the next row's date_time up.\n",
+ "# The method has to be called; assigning `.shift` itself stores the method object, not a shifted series.\n",
+ "dfs_var['next_time'] = dfs_var.groupby(['variation','visit_id'])['date_time'].shift(-1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0b2f7798-0953-4991-9634-1984749b0703",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var[\"next_time\"] = pd.to_datetime(dfs_var[\"next_time\"], errors=\"coerce\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "57d91d74-ae22-46fc-a195-ad7e1b777296",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds() #duration in seconds"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ec88f0a4-fcb2-4422-a325-52661e23016f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "step_time = (dfs_var.dropna(subset=['step_durationsec']).groupby(['variation', 'process_step'])['step_durationsec'].mean().reset_index())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d3178785-bcb9-4034-a4e1-1007ea34d3fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "step_time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd0f8815-ef1b-4311-b4d6-645d3ed1edeb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "##################################################################\n",
+ "#CHECKING RATING OF ERRORS PER EACH STEP"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c6f6a958-a82a-4386-9e1f-ab5d57397e67",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# error_flag must be a 0/1 indicator (the per-visit max and per-variation sum below rely on that).\n",
+ "# Here an error is a backward move: within a visit, ordered by time, the step number is lower than\n",
+ "# in the previous event. NOTE(review): confirm this matches the intended definition of an error.\n",
+ "# The sorted result is aligned back onto dfs_var by index, so the row order of dfs_var is untouched.\n",
+ "dfs_var['error_flag'] = (dfs_var.sort_values(['visit_id', 'date_time']).groupby('visit_id')['step_num'].diff().lt(0).astype(int))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e5c5a7c-9498-4b17-8941-8fef7839cbf1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "error_rates = (dfs_var.groupby(['variation', 'process_step'])['error_flag'].mean().reset_index().rename(columns={'error_flag' : 'error_rate'}))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e39a5d1-4c37-4226-9728-0e514dbc9806",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "error_rates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "43f50454-c86a-4040-b141-5a81fb366ea7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Collapse events to one row per visit: the visit-level flag is the largest event-level flag.\n",
+ "per_visit_flag = dfs_var.groupby(['variation', 'visit_id'])['error_flag'].max()\n",
+ "visit_error = per_visit_flag.reset_index()\n",
+ "\n",
+ "# Per-variation totals: sum of the visit flags and number of visits.\n",
+ "flags_by_variation = visit_error.groupby('variation')['error_flag']\n",
+ "err_summary = flags_by_variation.agg(n_error='sum', n_total='count').reset_index()\n",
+ "\n",
+ "print(err_summary)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8bf8ae7d-ab0e-4057-bccf-b379f2a32601",
+ "metadata": {},
+ "source": [
+ "COMPLETION RATE - Comparing the percentage of visits that reach the final step in Control vs Test: a higher completion percentage in Test means the new design is more effective at getting users through all the steps. \n",
+ "\n",
+ "TIME SPENT - The Test version lets users complete the steps faster. \n",
+ "\n",
+ "ERROR RATES - The Test version neither reduced nor increased the frequency of errors in any step. The two versions perform identically in terms of the number of errors. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a07db602-47f1-4c8c-8785-76c322d070db",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "from scipy.stats import norm\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1d6944b6-7979-4f60-b7e5-8886c4dc15ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Completed visits (sum of 'completed') and total visits (row count) for each variation.\n",
+ "completed_by_variation = visitcomp.groupby('variation')['completed']\n",
+ "summary = completed_by_variation.agg(n_complete='sum', n_total='count').reset_index()\n",
+ "\n",
+ "print(summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "086e2bee-9e59-4756-a45f-55c2a6c3fa00",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fix the row order as Control, Test so that (x1, n1) is Control and (x2, n2) is Test.\n",
+ "# The ordered view gets its own name: overwriting `summary` would consume its\n",
+ "# 'variation' column and make this cell fail with a KeyError when run a second time.\n",
+ "summary_ordered = summary.set_index('variation').loc[['Control', 'Test']]\n",
+ "x1, x2 = summary_ordered['n_complete'].to_numpy()\n",
+ "n1, n2 = summary_ordered['n_total'].to_numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ef1baf41-13b1-4f5f-bdb4-69d71ea93160",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#propoortions\n",
+ "p1 = x1 / n1\n",
+ "p2 = x2 / n2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5d8ace24-1dcc-4d8c-9ccf-da4fa3aff7b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#h0\n",
+ "p_pool = (x1 + x2) / (n1 + n2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "48060fa7-6f84-4685-98c0-9f818b234cbe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#test of 2 proportions \n",
+ "se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))\n",
+ "z = (p2 - p1) / se"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15c0edac-b6cd-41ec-b36f-ddeab6282675",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Two-sided p-value from the standard normal. norm.sf keeps precision for large |z|,\n",
+ "# where 1 - norm.cdf(...) would round to exactly 0.\n",
+ "p_value = 2 * norm.sf(abs(z))\n",
+ "# '.4f' prints four decimals; the previous '4f' meant 'minimum width 4, six decimals'.\n",
+ "print(f'Completion rate Control: {p1:.4f}')\n",
+ "print(f'Completion rate Test: {p2:.4f}')\n",
+ "print(f'z-statistic: {z:.4f}')\n",
+ "print(f'p-value : {p_value:.6f}')\n",
+ "alpha = 0.05\n",
+ "# The message only states what the test supports: significant or not at this alpha.\n",
+ "if p_value < alpha:\n",
+ "    print(f'Statistically significant difference (alpha={alpha})')\n",
+ "else:\n",
+ "    print(f'No statistically significant difference (alpha={alpha})')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "49417603-53bb-45a2-a9e1-f55fe5c2999c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "056a80de-36d0-426f-8cdd-8a0cb74582a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(dfs_var.columns)\n",
+ "\n",
+ "print(dfs_var[\"step_durationsec\"].notna().sum())\n",
+ "\n",
+ "df_dur = dfs_var[dfs_var[\"step_durationsec\"].notna()]\n",
+ "print(df_dur.shape)\n",
+ "print(df_dur[\"process_step\"].unique())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ca550d4-c1d0-4b3f-a727-3c9c2fef399d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(dfs_var[\"step_durationsec\"].head())\n",
+ "print(dfs_var[\"step_durationsec\"].isna().sum())\n",
+ "print(len(dfs_var))\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8267354a-556e-4984-8024-672be4c03b72",
+ "metadata": {},
+ "source": [
+ "PREPARING THE MEAN TIME FOR EACH STEP IN CONTROL VS TEST "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c36ce9c0-db04-4d3f-98ff-aba36672d3cf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])\n",
+ "\n",
+ "# Order events chronologically inside each visit; step_num only breaks timestamp ties.\n",
+ "# Sorting by step_num first would pair an event with the next *step* instead of the\n",
+ "# next *event*, giving wrong (even negative) durations whenever a user goes back.\n",
+ "dfs_var = dfs_var.sort_values(['variation', 'visit_id', 'date_time', 'step_num'])\n",
+ "\n",
+ "# Timestamp of the following event in the same visit (NaT on the visit's last event).\n",
+ "dfs_var['next_time'] = dfs_var.groupby(['variation', 'visit_id'])['date_time'].shift(-1)\n",
+ "# Seconds between this event and the next one (NaN on the visit's last event).\n",
+ "dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa579f03-f983-4ef4-8df6-85c818d34dc6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only events that have a measured duration.\n",
+ "has_duration = dfs_var['step_durationsec'].notna()\n",
+ "df_dur = dfs_var[has_duration]\n",
+ "\n",
+ "# Average seconds per process step, split by variation.\n",
+ "mean_step_time = (\n",
+ "    df_dur\n",
+ "    .groupby(['variation', 'process_step'])['step_durationsec']\n",
+ "    .mean()\n",
+ "    .reset_index()\n",
+ ")\n",
+ "\n",
+ "print(mean_step_time)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "13628402-e804-4082-8491-5558b59d76a2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b795cd06-2379-4c84-bec4-c3e7a562cc62",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# One row per (variation, visit, client); has_error is the largest event-level flag.\n",
+ "visit_error = (\n",
+ "    dfs_var\n",
+ "    .groupby(['variation', 'visit_id', 'client_id'])['error_flag']\n",
+ "    .max()\n",
+ "    .reset_index(name='has_error')\n",
+ ")\n",
+ "\n",
+ "# Attach the client's age and drop visits whose client has no recorded age.\n",
+ "client_ages = df4[['client_id', 'clnt_age']]\n",
+ "visits_with_age = (\n",
+ "    visit_error\n",
+ "    .merge(client_ages, on='client_id', how='left')\n",
+ "    .dropna(subset=['clnt_age'])\n",
+ ")\n",
+ "\n",
+ "# Pearson correlation between age and the visit-level flag, reported per variation.\n",
+ "for var in ['Control', 'Test']:\n",
+ "    sub = visits_with_age[visits_with_age['variation'] == var]\n",
+ "    corr = np.corrcoef(sub['clnt_age'], sub['has_error'])[0, 1]\n",
+ "    print(f\"Correlation age–error ({var}): {corr:.4f} (n={len(sub)})\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b51e9ea-cf23-4b7f-9af8-dbf7a7e9405c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Recompute the age/error correlations from visits_with_age instead of pasting numbers\n",
+ "# from an earlier run, so the chart always matches the data it claims to show.\n",
+ "correlations = {}\n",
+ "for grp_name in ['Control', 'Test']:\n",
+ "    grp = visits_with_age[visits_with_age['variation'] == grp_name]\n",
+ "    correlations[grp_name] = float(np.corrcoef(grp['clnt_age'], grp['has_error'])[0, 1])\n",
+ "\n",
+ "# The bars show correlation STRENGTH (absolute value), so the axis is labelled as such.\n",
+ "strengths = [abs(c) for c in correlations.values()]\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(12, 6))\n",
+ "bars = ax.bar(list(correlations.keys()), strengths, color=['steelblue', 'darkorange'], alpha=0.8, edgecolor='black', linewidth=2)\n",
+ "\n",
+ "ax.set_ylabel('|Correlation|', fontsize=12, fontweight='bold')\n",
+ "ax.set_title('Age-Error Relationship\\n(Lower correlation)', fontsize=14, fontweight='bold')\n",
+ "# Keep the original 0-0.1 window but grow it if a bar would otherwise be clipped.\n",
+ "ax.set_ylim(0, max(0.1, 1.2 * np.nanmax(strengths + [0.0])))\n",
+ "\n",
+ "for bar, strength in zip(bars, strengths):\n",
+ "    ax.text(bar.get_x() + bar.get_width() / 2, strength + 0.002,\n",
+ "            f'{strength:.4f}', ha='center', fontweight='bold', fontsize=12)\n",
+ "\n",
+ "ax.grid(axis='y', alpha=0.3)\n",
+ "sns.despine()\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c2d14a8e-189a-48dc-93f4-435bda0e44a3",
+ "metadata": {},
+ "source": [
+ "There is no linear relationship in which each additional year of age systematically increases or decreases the error rate. \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce2b60be-7678-4bf8-a49e-60d234c1d585",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Imported once at the top of the cell rather than on every loop iteration.\n",
+ "from scipy.stats import chi2_contingency\n",
+ "\n",
+ "# Age bands: pd.cut's default right-closed bins give (0, 40], (40, 60], (60, 120].\n",
+ "bins = [0, 40, 60, 120]\n",
+ "labels = [\"<40\", \"40-60\", \">60\"]\n",
+ "visits_with_age[\"age_group\"] = pd.cut(visits_with_age[\"clnt_age\"], bins=bins, labels=labels)\n",
+ "\n",
+ "# Chi-square test of independence between age band and has_error, per variation.\n",
+ "for var in [\"Control\", \"Test\"]:\n",
+ "    sub = visits_with_age[visits_with_age[\"variation\"] == var]\n",
+ "    tab = pd.crosstab(sub[\"age_group\"], sub[\"has_error\"])\n",
+ "    print(var, \"\\n\", tab, \"\\n\")\n",
+ "\n",
+ "    # Only the statistic and p-value are reported; dof and expected counts are unused.\n",
+ "    chi2, p, dof, expected = chi2_contingency(tab)\n",
+ "    print(f\"{var}: chi2 = {chi2:.2f}, p-value = {p:.6f}\\n\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4887d163-56c7-4b06-94f0-3d6d4082814a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.stats import chi2_contingency\n",
+ "\n",
+ "# Recompute the chi-square statistics from the age_group x has_error tables instead of\n",
+ "# pasting values from an earlier run.\n",
+ "chi2_values = []\n",
+ "for grp_name in ['Control', 'Test']:\n",
+ "    grp = visits_with_age[visits_with_age['variation'] == grp_name]\n",
+ "    stat, _p, _dof, _expected = chi2_contingency(pd.crosstab(grp['age_group'], grp['has_error']))\n",
+ "    chi2_values.append(stat)\n",
+ "\n",
+ "# One panel only: the second axes of the former 1x2 grid was never drawn on and\n",
+ "# rendered as an empty frame.\n",
+ "fig, ax1 = plt.subplots(figsize=(8, 6))\n",
+ "ax1.bar(['Control', 'Test'], chi2_values, color=['steelblue', 'darkorange'], alpha=0.8)\n",
+ "ax1.set_ylabel('Chi² Statistic')\n",
+ "ax1.set_title('Chi² Age-Error Distribution\\n(Higher = more age influence)')\n",
+ "# Label offset scales with the data so the text sits just above the bar at any magnitude.\n",
+ "label_gap = 0.015 * max(chi2_values)\n",
+ "for i, v in enumerate(chi2_values):\n",
+ "    ax1.text(i, v + label_gap, f'{v:.0f}', ha='center', fontweight='bold')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d633dd69-f1df-4e81-aeab-31e224ce1a94",
+ "metadata": {},
+ "source": [
+ "The chi-square test is significant, so the error distribution differs across age groups. The relationship is non-linear: younger and older users likely commit different types or frequencies of errors. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "987b45ee-7df6-4446-9125-180619476b1c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.stats import ttest_ind\n",
+ "\n",
+ "dfmergeexp = dfs.merge(df3, on='client_id', how='left')\n",
+ "dfmerged = dfmergeexp.merge(df4, on='client_id', how='left')\n",
+ "\n",
+ "dfmerged_calls = dfmerged[dfmerged['variation'].isin(['Control', 'Test'])].copy()\n",
+ "\n",
+ "# dfmerged_calls has one row per web event (dfs is the concatenated web log), so a\n",
+ "# client appears once per event. calls_6_mnth comes from the client table, so it is\n",
+ "# compared on one row per client: counting or testing event rows would weight each\n",
+ "# client by activity and treat repeated rows as independent observations.\n",
+ "clients_calls = dfmerged_calls.drop_duplicates(subset='client_id')\n",
+ "\n",
+ "print(\"Clients per group:\")\n",
+ "print(clients_calls['variation'].value_counts())\n",
+ "\n",
+ "summary = clients_calls.groupby('variation')['calls_6_mnth'].agg(['mean', 'median', 'count', 'std']).round(2)\n",
+ "print(\"\\nCalls6mnth Control vs Test:\")\n",
+ "print(summary)\n",
+ "\n",
+ "control_c = clients_calls.loc[clients_calls['variation'] == 'Control', 'calls_6_mnth'].dropna()\n",
+ "test_c = clients_calls.loc[clients_calls['variation'] == 'Test', 'calls_6_mnth'].dropna()\n",
+ "t_stat, p_val = ttest_ind(control_c, test_c)\n",
+ "print(f\"\\nT-test: t={t_stat:.2f}, p-value={p_val:.6f}\")\n",
+ "\n",
+ "# Direction comes from the sample means, significance from the p-value: a small\n",
+ "# p-value alone does not say WHICH group has fewer calls.\n",
+ "significant = p_val < 0.05\n",
+ "if test_c.mean() < control_c.mean():\n",
+ "    if significant:\n",
+ "        print(\"*** TEST HAS FEWER CALLS (significant)! ***\")\n",
+ "    else:\n",
+ "        print(\"*** Test has fewer calls (not significant) ***\")\n",
+ "elif test_c.mean() > control_c.mean():\n",
+ "    if significant:\n",
+ "        print(\"*** CONTROL HAS FEWER CALLS (significant)! ***\")\n",
+ "    else:\n",
+ "        print(\"*** Control has fewer calls (not significant) ***\")\n",
+ "else:\n",
+ "    print(\"*** Control and Test have the same mean number of calls ***\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0eb1ca04-ce60-44b7-bb86-05c81f76622c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Mean calls per variation with standard-error bars (std / sqrt(count)), read from `summary`.\n",
+ "x = ['Control', 'Test']\n",
+ "means = [summary.loc[g, 'mean'] for g in x]\n",
+ "stds = [summary.loc[g, 'std'] for g in x]\n",
+ "n = [summary.loc[g, 'count'] for g in x]\n",
+ "yerr = [s / np.sqrt(nn) for s, nn in zip(stds, n)]\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(8, 6))\n",
+ "bars = ax.bar(x, means, yerr=yerr, capsize=10, color=['steelblue', 'darkorange'], alpha=0.85, edgecolor='black', linewidth=1.5)\n",
+ "\n",
+ "ax.set_ylabel('Average Calls per client\\n(6 months)', fontsize=14, fontweight='bold')\n",
+ "ax.set_title('Test with Fewer Calls', fontsize=16, fontweight='bold', pad=20)\n",
+ "\n",
+ "# Value label just above the tip of each error bar.\n",
+ "for bar, mean, err in zip(bars, means, yerr):\n",
+ "    ax.text(bar.get_x() + bar.get_width() / 2, mean + err + 0.08,\n",
+ "            f'{mean:.2f}', ha='center', va='bottom',\n",
+ "            fontsize=14, fontweight='bold')\n",
+ "\n",
+ "ax.grid(axis='y', alpha=0.4, linestyle='-', linewidth=0.8)\n",
+ "ax.set_ylim(0, max(means) + 0.6)\n",
+ "for tick in ax.get_xticklabels():\n",
+ "    tick.set_fontsize(12)\n",
+ "    tick.set_fontweight('bold')\n",
+ "\n",
+ "for spine in ax.spines.values():\n",
+ "    spine.set_linewidth(1.5)\n",
+ "\n",
+ "fig.tight_layout()\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5868d3c2-ed8e-486f-9d9e-af02ebe9e28f",
+ "metadata": {},
+ "source": [
+ "Control has more calls PER CLIENT than Test. The Test version proves superior self-service. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "528b137e-3a2b-4a12-87ad-dca4b2c6b8a3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# right=False gives the bins [0, 40) and [40, inf), which is what the labels say;\n",
+ "# the default right-closed bins put age 40 into 'Young <40'.\n",
+ "dfmerged['age_group'] = pd.cut(dfmerged['clnt_age'], bins=[0, 40, np.inf], labels=['Young <40', 'Older >=40'], ordered=False, right=False)\n",
+ "\n",
+ "# dfmerged has one row per web event, while calls_6_mnth comes from the client table:\n",
+ "# average it over one row per client so active clients are not counted many times.\n",
+ "clients_by_age = dfmerged.drop_duplicates(subset='client_id')\n",
+ "calls_mean = clients_by_age.groupby(['variation', 'age_group'], observed=False)['calls_6_mnth'].mean().unstack(fill_value=0)\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(9, 6))\n",
+ "x = np.arange(len(calls_mean.columns))\n",
+ "width = 0.35\n",
+ "bars1 = ax.bar(x - width/2, calls_mean.loc['Control'], width, label='Control', color='blue', alpha=0.8)\n",
+ "bars2 = ax.bar(x + width/2, calls_mean.loc['Test'], width, label='Test', color='orange', alpha=0.8)\n",
+ "\n",
+ "# Value labels above each bar.\n",
+ "for i, col in enumerate(calls_mean.columns):\n",
+ "    ctrl_val = calls_mean.loc['Control', col]\n",
+ "    test_val = calls_mean.loc['Test', col]\n",
+ "    ax.text(i - width/2, ctrl_val + 0.05, f'{ctrl_val:.1f}', ha='center', va='bottom', fontweight='bold')\n",
+ "    ax.text(i + width/2, test_val + 0.05, f'{test_val:.1f}', ha='center', va='bottom', fontweight='bold')\n",
+ "\n",
+ "ax.set_ylabel('Average Calls (6 months)')\n",
+ "ax.set_title('Test Call Reduction: Stronger in Young Clients (<40 years)')\n",
+ "ax.set_xticks(x)\n",
+ "ax.set_xticklabels(calls_mean.columns)\n",
+ "ax.legend()\n",
+ "ax.grid(axis='y', alpha=0.3)\n",
+ "plt.tight_layout()\n",
+ "plt.show()\n",
+ "\n",
+ "# Relative reduction of Test vs Control per age group, in percent.\n",
+ "reduction_pct = ((calls_mean.loc['Control'] - calls_mean.loc['Test']) / calls_mean.loc['Control'] * 100).round(1)\n",
+ "print(\"Reduction % (Test vs Control):\")\n",
+ "print(reduction_pct)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "088d7175-917f-43b1-bb26-13e21cbb5981",
+ "metadata": {},
+ "source": [
+ "As we saw in the previous chart, Test has fewer calls than Control, and the reduction is larger among young clients (<40 years old) than among older ones. Comparing Control vs Test, the gap in calls between younger and older clients also appears to be wider in Test."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2fbd42a4-9a03-4bd8-8dfb-4d804c7a4025",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# One row per client: dfmerged_calls has one row per web event, so counting or averaging\n",
+ "# its rows would weight every client by the number of events they generated.\n",
+ "df = (\n",
+ "    dfmerged_calls[dfmerged_calls['variation'].isin(['Control', 'Test'])]\n",
+ "    .drop_duplicates(subset='client_id')\n",
+ "    .copy()\n",
+ ")\n",
+ "\n",
+ "# TRUST: Completion\n",
+ "completion = visitcomp.groupby('variation')['completed'].mean()\n",
+ "print(\"TRUST - Completion Rate:\")\n",
+ "print(completion)\n",
+ "\n",
+ "# REVENUE IMPACT (High-Tenure)\n",
+ "df['tenure_group'] = pd.cut(df['clnt_tenure_yr'], [0,10,np.inf], labels=['Low ≤10y','High >10y'], ordered=False)\n",
+ "\n",
+ "is_high = df['tenure_group'] == 'High >10y'\n",
+ "is_test = df['variation'] == 'Test'\n",
+ "is_control = df['variation'] == 'Control'\n",
+ "n_high_test = int((is_high & is_test).sum())\n",
+ "bal_ctrl_high = df.loc[is_high & is_control, 'bal'].mean()\n",
+ "bal_test_high = df.loc[is_high & is_test, 'bal'].mean()\n",
+ "delta_bal = bal_test_high - bal_ctrl_high\n",
+ "\n",
+ "# NOTE(review): 0.01 looks like an assumed 1% annual margin on balances - confirm.\n",
+ "revenue = delta_bal * 0.01 * n_high_test\n",
+ "# The verdict follows the sign of the balance difference instead of being hard-coded.\n",
+ "verdict = 'TEST WINS' if delta_bal > 0 else 'NO UPLIFT FOR TEST'\n",
+ "print(f\"\\nREVENUE High-Tenure ({n_high_test} clients):\")\n",
+ "print(f\"Control Bal: €{bal_ctrl_high:.2f}\")\n",
+ "print(f\"Test Bal: €{bal_test_high:.2f}\")\n",
+ "print(f\"ΔBal/client: €{delta_bal:.3f}\")\n",
+ "print(f\"Annual Revenue: €{revenue/1e6:.1f}M ← **{verdict}**\")\n",
+ "\n",
+ "fig, (ax_trust, ax_rev) = plt.subplots(1, 2, figsize=(12, 4))\n",
+ "\n",
+ "completion_pct = completion * 100\n",
+ "ax_trust.bar(completion_pct.index, completion_pct.values, color=['blue','orange'], alpha=0.8)\n",
+ "ax_trust.set_title('Trust: Completion Rate')\n",
+ "ax_trust.set_ylabel('% Complete')\n",
+ "# v is already a percentage; the old label multiplied it by 100 a second time.\n",
+ "for i, v in enumerate(completion_pct.values):\n",
+ "    ax_trust.text(i, v + 1, f'{v:.1f}%', ha='center')\n",
+ "\n",
+ "bal_tenure = df.groupby(['variation','tenure_group'], observed=False)['bal'].mean().unstack()\n",
+ "x = np.arange(len(bal_tenure.columns))\n",
+ "width = 0.35\n",
+ "ax_rev.bar(x - width/2, bal_tenure.loc['Control'], width, label='Control', alpha=0.8)\n",
+ "ax_rev.bar(x + width/2, bal_tenure.loc['Test'], width, label='Test', alpha=0.8)\n",
+ "ax_rev.set_title('Revenue: Bal High-Tenure')\n",
+ "ax_rev.set_ylabel('Avg Balance €')\n",
+ "ax_rev.set_xticks(x)\n",
+ "ax_rev.set_xticklabels(bal_tenure.columns, rotation=0)\n",
+ "ax_rev.legend()\n",
+ "ax_rev.grid(axis='y', alpha=0.3)\n",
+ "fig.tight_layout()\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4bfddee5-a983-40df-adf0-8f76228edc15",
+ "metadata": {},
+ "source": [
+ "Test is MORE trustworthy (58% vs 49% completion); Test self-service success DIRECTLY causes higher balances... \n",
+ "We can see that customers with more than 10 years of tenure have higher balances; in addition, Control has lower balances than Test across all tenure groups.\n",
+ "Perfect cycle: better UX leads to more transactions, and more transactions lead to more revenue. So Test creates client trust and bank profits simultaneously."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a0c4bc6b-84fa-437e-a5d2-64f56b59cfb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "df1 = pd.read_csv('df_final_web_data_pt_1.txt')\n",
+ "print('df1 columns:', df1.columns.tolist())\n",
+ "print('df1 shape:', df1.shape)\n",
+ "df2 = pd.read_csv('df_final_web_data_pt_2.txt')\n",
+ "print('df2 columns:', df2.columns.tolist())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4e3838ff-260b-420c-9ae5-7e2b01b6317f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1763ab2d-aa00-41ed-acd4-45030b76f6aa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged['gendr'].nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e3ed3d2-c674-4e79-ae5e-f2ef5a52f71b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a4441a7a-208a-4bd0-a4b1-665b37ed9d9b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged['variation'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e075867f-4755-48c6-9afa-18599d594af0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged['gendr'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "58765716-483c-4f41-aa86-edfb93091be6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only rows whose gender is M or F. .copy() makes the result an independent frame,\n",
+ "# so adding columns to it afterwards does not raise SettingWithCopyWarning.\n",
+ "dfmerged = dfmerged[dfmerged['gendr'].isin(['M','F'])].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d818395b-d84c-4109-9589-15875b2cbe28",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged['gendr'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2291a32f-f560-445c-b585-e8c80521464b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d62f6e02-d72b-48b6-b084-c0a839b69b3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged[\"error_flag\"] = (dfmerged[\"process_step\"] != \"confirm\").astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bcf8b1e9-8466-48be-89f4-05687a2b775e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfmerged"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d3590ca8-47a7-49ba-b716-34e87d6d546c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#dfmerged.to_csv('Project2FIX.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f92a491-010b-48f0-959f-90d18e8b00bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the chart from mean_step_time (computed earlier) instead of numbers typed in from\n",
+ "# a previous run, so the figure cannot drift from the data or mislabel a step.\n",
+ "df_times = mean_step_time.rename(columns={'step_durationsec': 'mean_time'})\n",
+ "\n",
+ "pivot = df_times.pivot(index='process_step', columns='variation', values='mean_time')\n",
+ "# Funnel order taken from the data itself.\n",
+ "# NOTE(review): assumes step_num increases along the funnel (start ... confirm) - confirm.\n",
+ "step_order = dfs_var.groupby('process_step')['step_num'].min().sort_values().index.tolist()\n",
+ "pivot = pivot.reindex(step_order)\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(10, 6))\n",
+ "x = np.arange(len(pivot))\n",
+ "width = 0.35\n",
+ "ax.barh(x - width/2, pivot['Control'], width, label='Control', color='#2C6693', alpha=0.8)\n",
+ "ax.barh(x + width/2, pivot['Test'], width, label='Test', color='#E5B544', alpha=0.8)\n",
+ "ax.set_yticks(x)\n",
+ "ax.set_yticklabels(pivot.index)\n",
+ "ax.set_xlabel('Mean Time (seconds)')\n",
+ "ax.set_title('Average Time per Step: Control vs Test')\n",
+ "ax.legend()\n",
+ "plt.tight_layout()\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd5ad294-a1e7-42b6-80b1-c83b941d2578",
+ "metadata": {},
+ "source": [
+ "Test is faster at the start, but it is slower on the remaining steps, especially at the end."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ce132b3-3f48-444b-a050-1d21c8a78955",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "cores = {'Control': '#2C6693', 'Test': '#E5B544'}\n",
+ "sns.lmplot(data=visits_with_age, x='clnt_age', y='has_error', hue='variation',\n",
+ " palette=cores, line_kws={'lw':2}, scatter_kws={'alpha':0.5}, ci=95)\n",
+ "plt.xlabel('Age'); plt.ylabel('Error Flag')\n",
+ "plt.title('Age-Error Correlation: Control vs Test')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "537cc5c5-2aab-4d70-8824-bb7a155a8dce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.set(style='whitegrid')\n",
+ "fig, ax = plt.subplots(figsize=(6, 5))\n",
+ "# hue='variation' is what the palette applies to (passing palette without hue is\n",
+ "# deprecated in seaborn); legend=False because the x tick labels already name the bars.\n",
+ "sns.barplot(data=err_summary, x='variation', y='n_error', hue='variation',\n",
+ "            palette=['#2C6693', '#E5B544'], legend=False, ax=ax)\n",
+ "ax.set_title('Total of Errors: Control vs Test')\n",
+ "ax.set_xlabel('Variation')\n",
+ "ax.set_ylabel('Total of Errors')\n",
+ "# Tick labels come from the data; hard-coding ['Control', 'Test'] could mislabel the\n",
+ "# bars if the row order of err_summary ever changed.\n",
+ "# Label offset scales with the data instead of assuming counts in the thousands.\n",
+ "label_gap = 0.01 * err_summary['n_error'].max()\n",
+ "for pos, n_err in enumerate(err_summary['n_error']):\n",
+ "    ax.text(pos, n_err + label_gap, f'{int(n_err):,}', ha='center')\n",
+ "fig.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1e21b230-af46-4cc2-b521-0bd81db64d06",
+ "metadata": {},
+ "source": [
+ "CONCLUSIONS\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a7e55d5-90fd-4ad3-971e-a7c3762400cf",
+ "metadata": {},
+ "source": [
+ "Test variation consistently outperforms Control across all critical metrics, delivering substantial revenue growth, operational efficiencies and enhanced user resilience. Rigorous statistical analysis eliminates any doubt regarding implementation.\n",
+ "\n",
+ "Completion Advantage: Test has substantially higher process completion rates than Control. This translates to massive trust and revenue uplift through elevated client balances across all tenure segments.\n",
+ "\n",
+ "Operational Excellence: Test meaningfully reduces call-center dependency, which points to reliable cost savings.\n",
+ "\n",
+ "Demographic Resilience: Test significantly mitigates age-related error patterns compared to Control. Both versions show age influences error distribution, but Test manages this relationship far more equitably across age groups.\n",
+ "\n",
+ "Strategic Imperative: Revenue acceleration + cost reduction + universal UX improvements = compelling business case.\n",
+ "\n",
+ "DEPLOY TEST VERSION IMMEDIATELY ACROSS ALL CLIENTS."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2ff7b913-6d52-4511-94fe-482d913675cb",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.14.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}