diff --git a/.virtual_documents/notebook/PROJECT2_pedro.ipynb b/.virtual_documents/notebook/PROJECT2_pedro.ipynb new file mode 100644 index 0000000..2752210 --- /dev/null +++ b/.virtual_documents/notebook/PROJECT2_pedro.ipynb @@ -0,0 +1,317 @@ +import pandas as pd + +df4 = pd.read_csv("df_final_demo.txt") + + +df3 = pd.read_csv("df_final_experiment_clients.txt") + + +df1 = pd.read_csv("df_final_web_data_pt_1.txt") + + +df2 = pd.read_csv("df_final_web_data_pt_2.txt") + + +df1.columns + + +df2.columns + + +dfs = pd.concat([df1, df2], ignore_index=True) + + +dfs['client_id'].unique() + + +dfs.isnull() + + +dfs.shape + + +dfs.head() + + +#checking what the type of date_time +dfs.dtypes + + +#convert object into date_time +dfs['date_time'] = pd.to_datetime(dfs['date_time']) + + +dfs.dtypes + + +df3.columns + + +df4.columns + + + + + +#df_all.head() + + + +dfs.columns + + +#dfs.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'], ascending=[True, True, True, True]) + + +dfs.head(10) + + +#dfsorted[dfsorted['client_id'] == 442857] + + +dfs['client_id'].value_counts().head() + + +dfs['date_time'].isnull().value_counts() + + +dfs['visitor_id'].isnull().value_counts() + + +dfs['date_time'].isnull().value_counts() + + +dfs['visit_id'].isnull().value_counts() + + +dfs['visitor_id'].isnull().value_counts() + + +dfs['process_step'].isnull().value_counts() + + +dfs.isnull().value_counts() + + +df4.columns + + +df3.columns + + +#df_test.isnull() + + +dfs['process_step'].unique() + + +#merfe dfs + experiment(df3) to have Variaton = Control OR test) +df_mergeexp = dfs.merge(df3, on="client_id", how="left") + + +#merge with demo(df4) (demographic data) +df_merged = df_mergeexp.merge(df4, on="client_id", how="left") + + +df_merged.shape + + +#remove duplicates +df_merged = df_merged.drop_duplicates() + + +#remove rows without any variation +df_merged = df_merged[df_merged["Variation"].isin(["Control", "Test"])] + + +df_merged.shape + + +df_merged["process_step"].value_counts() + + +df_merged["Variation"].value_counts() + + +#Give a table for Tests and another for Controls! +df_control = df_merged[df_merged["Variation"] == "Control"].copy() +df_test = df_merged[df_merged["Variation"] == "Test"].copy() + + +df_control.shape + + +df_test.shape + + +df_merged['num_accts'].nunique() + + +#################################################################### + + +#Q1: Who are the primary clients using this online process? + + +usage = (df_merged.groupby("client_id")["visit_id"].nunique().reset_index(name="n_visits")) #hor many visits per client + + +cutoff = usage["n_visits"].quantile(0.75) +usage['primary'] = (usage["n_visits"] >= cutoff).astype(int) #top25% by number of visits + + +df_merged.columns + + +df_merged = usage.merge(df_merged, left_on="client_id", right_on="client_id", how="left") #join demographics + + +df_merged.groupby("primary")[["clnt_age","clnt_tenure_yr","clnt_tenure_mnth","logons_6_mnth"]].mean() + + + +sub = df_merged[df_merged["primary"] == 1] +print(sub[["clnt_age","clnt_tenure_yr","bal","gendr"]].describe(include="all")) + + +#Average age is about 51.8, half of the primary clients are between 39 and 63. So they are midle-aged and not very young customers, it means primary clients are older than non primary clients. +#primary cients are more long standing. +#The most frequent gender is “M” + + +df_merged.groupby("primary")[['calls_6_mnth', 'logons_6_mnth']].mean() + + +#They also make more calls and more logons in 6 months, so they are more active on all channels, not only online. + + +df_merged = df_merged.rename(columns={'Variation': 'variation'}) +df3 = df3.rename(columns={'Variation': 'variation'}) +df_control = df_control.rename(columns={'Variation': 'variation'}) +df_test = df_test.rename(columns={'Variation': 'variation'}) + + +df_merged.columns + + +df_control.columns + + +df3.columns + + +dfs_var = dfs.merge(df3[['client_id', 'variation']], on='client_id', how='left') + + +dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time']) + + +step_order = {'start': 0, 'step_1' : 1, 'step_2' : 2, 'step_3' : 3, 'confirm' : 4} + + + + + +dfs_var['step_num'] = dfs_var['process_step'].map(step_order) + + +###################################################### +#COMPLETION RATE + + +last_step = dfs_var['step_num'].max() + + +visitcomp = (dfs_var.groupby(['variation', 'visit_id'])['step_num'].max().reset_index(name='max_step')) + + +visitcomp["completed"] = (visitcomp["max_step"] == last_step).astype(int) + + +completionrate = (visitcomp.groupby('variation')['completed'].mean().reset_index()) + + +completionrate + + +print('Average time spent on each steap is', completionrate) + + +############################################################################################ +#TIME SPENT ON EACH STEP + + +dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time']) + + +dfs_var = dfs_var.sort_values(['variation', 'visit_id','step_num','date_time']) + + +dfs_var['next_time'] = (dfs_var.groupby(['variation','visit_id'])['date_time'].shift) #time (-1)for the next step of the same visit + + +dfs_var["next_time"] = pd.to_datetime(dfs_var["next_time"], errors="coerce") + + +dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds() #duration in seconds + + +step_time = (dfs_var.dropna(subset=['step_durationsec']).groupby(['variation', 'process_step'])['step_durationsec'].mean().reset_index()) + + +step_time + + +################################################################## +#CHECKING RATING OF ERRORS PER EACH STEP + + +dfs_var['error_flag'] = (dfs_var['step_num'].astype(int)) + + +error_rates = (dfs_var.groupby(['variation', 'process_step'])['error_flag'].mean().reset_index().rename(columns={'error_flag' : 'error_rate'})) + + +error_rates + + +#COMPLETITION RATE - Comparing the percentage of visits who reach into final step in control vs test, highter percentage of completion in the test means better efectiveness for users to finish all the steps. +#TIME SPENT - The test version is better for user to complete the steps faster +#ERRORS RATES - The test version neither reduce or increased the frequency of errors in any step. The two versions have identical performance when talking about number of errors. + + +import numpy as np +from scipy.stats import norm + + + +#Order control test +summary = summary.set_index('variation').loc[['Control', 'Test']] +x1, x2 = summary['n_complete'].values +n1, n2 = summary['n_total'].values + + +#propoortions +p1 = x1 / n1 +p2 = x2 / n2 + + +#h0 +p_pool = (x1 + x2) / (n1 + n2) + + +#test of 2 proportions *THANK YOU CHATGPT +se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2)) +z = (p2 - p1) / se + + +p_value = 2 * (1 - norm.cdf(abs(z))) +print(f'Completion of rate Control: {p1:4f}') +print(f'Completion of rate Test: {p2:4f}') +print(f'z-statistic: {z:4f}') +print(f'p-value : {p_value:.6f}') +alpha = 0.05 +if p_value < alpha + print('Statistical diference highly significative (alpha=0.05)') +else + print('Statistical diference not significative (alpha=0.05)') diff --git a/.virtual_documents/notebook/Untitled.ipynb b/.virtual_documents/notebook/Untitled.ipynb new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/.virtual_documents/notebook/Untitled.ipynb @@ -0,0 +1 @@ + diff --git a/anaconda_projects/db/project_filebrowser.db b/anaconda_projects/db/project_filebrowser.db new file mode 100644 index 0000000..e5ae8f7 Binary files /dev/null and b/anaconda_projects/db/project_filebrowser.db differ diff --git a/notebook/.ipynb_checkpoints/PROJECT2_pedro-checkpoint.ipynb b/notebook/.ipynb_checkpoints/PROJECT2_pedro-checkpoint.ipynb index a2ea406..00ce796 100644 --- a/notebook/.ipynb_checkpoints/PROJECT2_pedro-checkpoint.ipynb +++ b/notebook/.ipynb_checkpoints/PROJECT2_pedro-checkpoint.ipynb @@ -1193,28 +1193,17 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "id": "788ff698-7a92-4969-8cd7-6d87eeca1194", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "7" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "####################################################################" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "id": "162a3f57-229f-4cbf-848a-e4704a2cdf52", "metadata": {}, "outputs": [], @@ -1224,7 +1213,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "id": "fed97a77-c349-420e-a073-8cdd689514c9", "metadata": {}, "outputs": [], @@ -1234,7 +1223,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "id": "ed106950-abca-4aa9-afb0-f758055eaa47", "metadata": {}, "outputs": [], @@ -1245,7 +1234,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "id": "c5b77ee6-dd0f-4256-b044-6688bc348160", "metadata": {}, "outputs": [ @@ -1258,7 +1247,7 @@ " dtype='object')" ] }, - "execution_count": 49, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -1269,7 +1258,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "id": "c04385dd-5d27-442c-ba4c-63c1e2808d1b", "metadata": {}, "outputs": [], @@ -1279,7 +1268,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 52, "id": "2a46f9ad-2767-4c80-8e18-bf36f0afb441", "metadata": {}, "outputs": [ @@ -1343,7 +1332,7 @@ "1 51.816468 12.831203 159.971186 6.780515" ] }, - "execution_count": 51, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -1384,29 +1373,10 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 54, "id": "9948430a-27f5-467a-b44f-ccc9968944a0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " clnt_age clnt_tenure_yr bal gendr\n", - "count 124060.000000 124072.000000 1.240720e+05 124072\n", - "unique NaN NaN NaN 3\n", - "top NaN NaN NaN M\n", - "freq NaN NaN NaN 43343\n", - "mean 51.816468 12.831203 1.961189e+05 NaN\n", - "std 15.672054 7.328423 4.243774e+05 NaN\n", - "min 17.000000 2.000000 2.378961e+04 NaN\n", - "25% 39.000000 6.000000 4.566559e+04 NaN\n", - "50% 54.000000 12.000000 8.424076e+04 NaN\n", - "75% 63.500000 17.000000 1.926728e+05 NaN\n", - "max 94.000000 55.000000 1.632004e+07 NaN\n" - ] - } - ], + "outputs": [], "source": [ "#Average age is about 51.8, half of the primary clients are between 39 and 63. So they are midle-aged and not very young customers, it means primary clients are older than non primary clients.\n", "#primary cients are more long standing.\n", @@ -1415,7 +1385,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 55, "id": "95002485-6f20-4f4a-a763-d037b9faa90f", "metadata": {}, "outputs": [ @@ -1471,7 +1441,7 @@ "1 3.743455 6.780515" ] }, - "execution_count": 54, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -1482,13 +1452,690 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "id": "2ac842fc-b333-4bb9-b804-01f9fc8f0c72", "metadata": {}, "outputs": [], "source": [ "#They also make more calls and more logons in 6 months, so they are more active on all channels, not only online." ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "2a4f3984-9698-45f1-9d0e-299d234bdccf", + "metadata": {}, + "outputs": [], + "source": [ + "df_merged = df_merged.rename(columns={'Variation': 'variation'})\n", + "df3 = df3.rename(columns={'Variation': 'variation'})\n", + "df_control = df_control.rename(columns={'Variation': 'variation'})\n", + "df_test = df_test.rename(columns={'Variation': 'variation'})" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "73f27b9e-3a8c-4ad7-a756-6838314f5b34", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['client_id', 'n_visits', 'primary', 'visitor_id', 'visit_id',\n", + " 'process_step', 'date_time', 'variation', 'clnt_tenure_yr',\n", + " 'clnt_tenure_mnth', 'clnt_age', 'gendr', 'num_accts', 'bal',\n", + " 'calls_6_mnth', 'logons_6_mnth'],\n", + " dtype='object')" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_merged.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "5da498ad-e434-4dbc-af5f-84b4a322b09d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['client_id', 'visitor_id', 'visit_id', 'process_step', 'date_time',\n", + " 'variation', 'clnt_tenure_yr', 'clnt_tenure_mnth', 'clnt_age', 'gendr',\n", + " 'num_accts', 'bal', 'calls_6_mnth', 'logons_6_mnth'],\n", + " dtype='object')" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_control.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "110917b6-fa20-47d1-9704-9263dafd5312", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['client_id', 'variation'], dtype='object')" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "90d7a353-0bd9-422f-b3e4-98d8d709c90d", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var = dfs.merge(df3[['client_id', 'variation']], on='client_id', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "b4edff36-d806-4048-9b85-aa83de863e3c", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "5f86541c-bc8d-4ea1-a7b5-d823188ed736", + "metadata": {}, + "outputs": [], + "source": [ + "step_order = {'start': 0, 'step_1' : 1, 'step_2' : 2, 'step_3' : 3, 'confirm' : 4}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eed944a6-5fce-4e95-ad1e-8f384f15addb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "28162307-e0cf-4d5d-b662-58f03c6a67e2", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var['step_num'] = dfs_var['process_step'].map(step_order)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "c71f9b3d-9581-40fc-986d-bf30da3e1882", + "metadata": {}, + "outputs": [], + "source": [ + "######################################################\n", + "#COMPLETION RATE" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "0699b897-2d70-4a36-a2fa-de92681667c0", + "metadata": {}, + "outputs": [], + "source": [ + "last_step = dfs_var['step_num'].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "1451a1a2-67f8-4522-b352-6a72513b7a3c", + "metadata": {}, + "outputs": [], + "source": [ + "visitcomp = (dfs_var.groupby(['variation', 'visit_id'])['step_num'].max().reset_index(name='max_step'))" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "decd9c0a-810e-4e3d-a3da-4aa31dd2c58c", + "metadata": {}, + "outputs": [], + "source": [ + "visitcomp[\"completed\"] = (visitcomp[\"max_step\"] == last_step).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "323d47af-6a05-4a10-bc42-3886c7c88ca4", + "metadata": {}, + "outputs": [], + "source": [ + "completionrate = (visitcomp.groupby('variation')['completed'].mean().reset_index())" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "c501100b-d67e-4774-9e73-db3350874cf7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variationcompleted
0Control0.498493
1Test0.585173
\n", + "
" + ], + "text/plain": [ + " variation completed\n", + "0 Control 0.498493\n", + "1 Test 0.585173" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completionrate" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "1f83ff5d-1898-479b-8fcd-e69478d05b96", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average time spent on each steap is variation completed\n", + "0 Control 0.498493\n", + "1 Test 0.585173\n" + ] + } + ], + "source": [ + "print('Average time spent on each steap is', completionrate)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "fb188b90-2c49-4edf-8975-8c7c2f4d56f8", + "metadata": {}, + "outputs": [], + "source": [ + "############################################################################################\n", + "#TIME SPENT ON EACH STEP" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "37da593d-4aa1-466c-9a4b-f3524bfdb0b5", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "3c09e14a-51ab-4c1d-bf97-2d348c4aded6", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var = dfs_var.sort_values(['variation', 'visit_id','step_num','date_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "f4a43083-3154-46ca-823f-10e50602565c", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var['next_time'] = (dfs_var.groupby(['variation','visit_id'])['date_time'].shift) #time (-1)for the next step of the same visit" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "0b2f7798-0953-4991-9634-1984749b0703", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var[\"next_time\"] = pd.to_datetime(dfs_var[\"next_time\"], errors=\"coerce\")" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "57d91d74-ae22-46fc-a195-ad7e1b777296", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds() #duration in seconds" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "ec88f0a4-fcb2-4422-a325-52661e23016f", + "metadata": {}, + "outputs": [], + "source": [ + "step_time = (dfs_var.dropna(subset=['step_durationsec']).groupby(['variation', 'process_step'])['step_durationsec'].mean().reset_index())" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "d3178785-bcb9-4034-a4e1-1007ea34d3fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variationprocess_stepstep_durationsec
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [variation, process_step, step_durationsec]\n", + "Index: []" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "step_time" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "cd0f8815-ef1b-4311-b4d6-645d3ed1edeb", + "metadata": {}, + "outputs": [], + "source": [ + "##################################################################\n", + "#CHECKING RATING OF ERRORS PER EACH STEP" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "c6f6a958-a82a-4386-9e1f-ab5d57397e67", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var['error_flag'] = (dfs_var['step_num'].astype(int))" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "5e5c5a7c-9498-4b17-8941-8fef7839cbf1", + "metadata": {}, + "outputs": [], + "source": [ + "error_rates = (dfs_var.groupby(['variation', 'process_step'])['error_flag'].mean().reset_index().rename(columns={'error_flag' : 'error_rate'}))" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "0e39a5d1-4c37-4226-9728-0e514dbc9806", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variationprocess_steperror_rate
0Controlconfirm4.0
1Controlstart0.0
2Controlstep_11.0
3Controlstep_22.0
4Controlstep_33.0
5Testconfirm4.0
6Teststart0.0
7Teststep_11.0
8Teststep_22.0
9Teststep_33.0
\n", + "
" + ], + "text/plain": [ + " variation process_step error_rate\n", + "0 Control confirm 4.0\n", + "1 Control start 0.0\n", + "2 Control step_1 1.0\n", + "3 Control step_2 2.0\n", + "4 Control step_3 3.0\n", + "5 Test confirm 4.0\n", + "6 Test start 0.0\n", + "7 Test step_1 1.0\n", + "8 Test step_2 2.0\n", + "9 Test step_3 3.0" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "error_rates" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "8bf8ae7d-ab0e-4057-bccf-b379f2a32601", + "metadata": {}, + "outputs": [], + "source": [ + "#COMPLETITION RATE - Comparing the percentage of visits who reach into final step in control vs test, highter percentage of completion in the test means better efectiveness for users to finish all the steps.\n", + "#TIME SPENT - The test version is better for user to complete the steps faster\n", + "#ERRORS RATES - The test version neither reduce or increased the frequency of errors in any step. The two versions have identical performance when talking about number of errors." + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "a07db602-47f1-4c8c-8785-76c322d070db", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from scipy.stats import norm\n" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "1d6944b6-7979-4f60-b7e5-8886c4dc15ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " variation n_complete n_total\n", + "0 Control 16046 32189\n", + "1 Test 21731 37136\n" + ] + } + ], + "source": [ + "summary = (visitcomp.groupby(\"variation\")[\"completed\"].agg(n_complete=\"sum\", n_total=\"count\").reset_index())\n", + "\n", + "print(summary)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "086e2bee-9e59-4756-a45f-55c2a6c3fa00", + "metadata": {}, + "outputs": [], + "source": [ + "#Order control test\n", + "summary = summary.set_index('variation').loc[['Control', 'Test']]\n", + "x1, x2 = summary['n_complete'].values\n", + "n1, n2 = summary['n_total'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "ef1baf41-13b1-4f5f-bdb4-69d71ea93160", + "metadata": {}, + "outputs": [], + "source": [ + "#propoortions\n", + "p1 = x1 / n1\n", + "p2 = x2 / n2" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "5d8ace24-1dcc-4d8c-9ccf-da4fa3aff7b5", + "metadata": {}, + "outputs": [], + "source": [ + "#h0\n", + "p_pool = (x1 + x2) / (n1 + n2)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "48060fa7-6f84-4685-98c0-9f818b234cbe", + "metadata": {}, + "outputs": [], + "source": [ + "#test of 2 proportions *THANK YOU CHATGPT\n", + "se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))\n", + "z = (p2 - p1) / se" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "15c0edac-b6cd-41ec-b36f-ddeab6282675", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completion of rate Control: 0.498493\n", + "Completion of rate Test: 0.585173\n", + "z-statistic: 22.856841\n", + "p-value : 0.000000\n", + "Statistical diference highly significative (alpha=0.05)\n" + ] + } + ], + "source": [ + "p_value = 2 * (1 - norm.cdf(abs(z)))\n", + "print(f'Completion of rate Control: {p1:4f}')\n", + "print(f'Completion of rate Test: {p2:4f}')\n", + "print(f'z-statistic: {z:4f}')\n", + "print(f'p-value : {p_value:.6f}')\n", + "alpha = 0.05\n", + "if p_value < alpha:\n", + " print('Statistical diference highly significative (alpha=0.05)')\n", + "else:\n", + " print('Statistical diference not significative (alpha=0.05)')" + ] + }, + { + "cell_type": "markdown", + "id": "b6336d5f-7c76-4b04-bded-84819c26634d", + "metadata": {}, + "source": [ + "Next step: MAKE GRAPHIC FOR THIS\n", + "and: Carried out an analysis ensuring that the observed increase in completion rate from the A/B test meets or exceeds this 5% threshold.\n", + "Carried out another hypothesis test of your choosing.\n", + "Evaluated the experiment by answering questions relating to:\n", + "\n", + " Design Effectiveness\n", + " Duration\n", + " Additional Data Needs\n", + "\n" + ] } ], "metadata": { diff --git a/notebook/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/notebook/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/notebook/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebook/PROJECT2_pedro.ipynb b/notebook/PROJECT2_pedro.ipynb index 29cdc42..00ce796 100644 --- a/notebook/PROJECT2_pedro.ipynb +++ b/notebook/PROJECT2_pedro.ipynb @@ -1767,6 +1767,16 @@ { "cell_type": "code", "execution_count": 76, + "id": "0b2f7798-0953-4991-9634-1984749b0703", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var[\"next_time\"] = pd.to_datetime(dfs_var[\"next_time\"], errors=\"coerce\")" + ] + }, + { + "cell_type": "code", + "execution_count": 77, "id": "57d91d74-ae22-46fc-a195-ad7e1b777296", "metadata": {}, "outputs": [], @@ -1776,7 +1786,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 78, "id": "ec88f0a4-fcb2-4422-a325-52661e23016f", "metadata": {}, "outputs": [], @@ -1786,7 +1796,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 79, "id": "d3178785-bcb9-4034-a4e1-1007ea34d3fd", "metadata": {}, "outputs": [ @@ -1817,85 +1827,17 @@ " \n", " \n", " \n", - " \n", - " 0\n", - " Control\n", - " confirm\n", - " 153.740233\n", - " \n", - " \n", - " 1\n", - " Control\n", - " start\n", - " 49.744712\n", - " \n", - " \n", - " 2\n", - " Control\n", - " step_1\n", - " 45.093323\n", - " \n", - " \n", - " 3\n", - " Control\n", - " step_2\n", - " 86.703724\n", - " \n", - " \n", - " 4\n", - " Control\n", - " step_3\n", - " 140.788394\n", - " \n", - " \n", - " 5\n", - " Test\n", - " confirm\n", - " 246.065397\n", - " \n", - " \n", - " 6\n", - " Test\n", - " start\n", - " 38.240943\n", - " \n", - " \n", - " 7\n", - " Test\n", - " step_1\n", - " 60.130113\n", - " \n", - " \n", - " 8\n", - " Test\n", - " step_2\n", - " 89.756501\n", - " \n", - " \n", - " 9\n", - " Test\n", - " step_3\n", - " 139.834792\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " variation process_step step_durationsec\n", - "0 Control confirm 153.740233\n", - "1 Control start 49.744712\n", - "2 Control step_1 45.093323\n", - "3 Control step_2 86.703724\n", - "4 Control step_3 140.788394\n", - "5 Test confirm 246.065397\n", - "6 Test start 38.240943\n", - "7 Test step_1 60.130113\n", - "8 Test step_2 89.756501\n", - "9 Test step_3 139.834792" + "Empty DataFrame\n", + "Columns: [variation, process_step, step_durationsec]\n", + "Index: []" ] }, - "execution_count": 78, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -1906,7 +1848,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "id": "cd0f8815-ef1b-4311-b4d6-645d3ed1edeb", "metadata": {}, "outputs": [], @@ -1917,7 +1859,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 81, "id": "c6f6a958-a82a-4386-9e1f-ab5d57397e67", "metadata": {}, "outputs": [], @@ -1927,7 +1869,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 82, "id": "5e5c5a7c-9498-4b17-8941-8fef7839cbf1", "metadata": {}, "outputs": [], @@ -1937,7 +1879,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 83, "id": "0e39a5d1-4c37-4226-9728-0e514dbc9806", "metadata": {}, "outputs": [ @@ -2046,7 +1988,7 @@ "9 Test step_3 3.0" ] }, - "execution_count": 81, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } @@ -2054,6 +1996,146 @@ "source": [ "error_rates" ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "8bf8ae7d-ab0e-4057-bccf-b379f2a32601", + "metadata": {}, + "outputs": [], + "source": [ + "#COMPLETITION RATE - Comparing the percentage of visits who reach into final step in control vs test, highter percentage of completion in the test means better efectiveness for users to finish all the steps.\n", + "#TIME SPENT - The test version is better for user to complete the steps faster\n", + "#ERRORS RATES - The test version neither reduce or increased the frequency of errors in any step. The two versions have identical performance when talking about number of errors." + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "a07db602-47f1-4c8c-8785-76c322d070db", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from scipy.stats import norm\n" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "1d6944b6-7979-4f60-b7e5-8886c4dc15ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " variation n_complete n_total\n", + "0 Control 16046 32189\n", + "1 Test 21731 37136\n" + ] + } + ], + "source": [ + "summary = (visitcomp.groupby(\"variation\")[\"completed\"].agg(n_complete=\"sum\", n_total=\"count\").reset_index())\n", + "\n", + "print(summary)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "086e2bee-9e59-4756-a45f-55c2a6c3fa00", + "metadata": {}, + "outputs": [], + "source": [ + "#Order control test\n", + "summary = summary.set_index('variation').loc[['Control', 'Test']]\n", + "x1, x2 = summary['n_complete'].values\n", + "n1, n2 = summary['n_total'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "ef1baf41-13b1-4f5f-bdb4-69d71ea93160", + "metadata": {}, + "outputs": [], + "source": [ + "#propoortions\n", + "p1 = x1 / n1\n", + "p2 = x2 / n2" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "5d8ace24-1dcc-4d8c-9ccf-da4fa3aff7b5", + "metadata": {}, + "outputs": [], + "source": [ + "#h0\n", + "p_pool = (x1 + x2) / (n1 + n2)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "48060fa7-6f84-4685-98c0-9f818b234cbe", + "metadata": {}, + "outputs": [], + "source": [ + "#test of 2 proportions *THANK YOU CHATGPT\n", + "se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))\n", + "z = (p2 - p1) / se" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "15c0edac-b6cd-41ec-b36f-ddeab6282675", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completion of rate Control: 0.498493\n", + "Completion of rate Test: 0.585173\n", + "z-statistic: 22.856841\n", + "p-value : 0.000000\n", + "Statistical diference highly significative (alpha=0.05)\n" + ] + } + ], + "source": [ + "p_value = 2 * (1 - norm.cdf(abs(z)))\n", + "print(f'Completion of rate Control: {p1:4f}')\n", + "print(f'Completion of rate Test: {p2:4f}')\n", + "print(f'z-statistic: {z:4f}')\n", + "print(f'p-value : {p_value:.6f}')\n", + "alpha = 0.05\n", + "if p_value < alpha:\n", + " print('Statistical diference highly significative (alpha=0.05)')\n", + "else:\n", + " print('Statistical diference not significative (alpha=0.05)')" + ] + }, + { + "cell_type": "markdown", + "id": "b6336d5f-7c76-4b04-bded-84819c26634d", + "metadata": {}, + "source": [ + "Next step: MAKE GRAPHIC FOR THIS\n", + "and: Carried out an analysis ensuring that the observed increase in completion rate from the A/B test meets or exceeds this 5% threshold.\n", + "Carried out another hypothesis test of your choosing.\n", + "Evaluated the experiment by answering questions relating to:\n", + "\n", + " Design Effectiveness\n", + " Duration\n", + " Additional Data Needs\n", + "\n" + ] } ], "metadata": { diff --git a/notebook/Untitled.ipynb b/notebook/Untitled.ipynb new file mode 100644 index 0000000..b41851f --- /dev/null +++ b/notebook/Untitled.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "139f8b74-4220-4cbd-b2b9-666f8b7bfaf5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}