From 664d2c1932bc1b4d80cb090b9a49d2ea94c51ba3 Mon Sep 17 00:00:00 2001 From: Pedro Manuel Gil Basilio Ferreira Date: Thu, 18 Dec 2025 16:53:15 +0000 Subject: [PATCH] day4 --- .../notebook/PROJECT2_pedro.ipynb | 317 ++++++++ .virtual_documents/notebook/Untitled.ipynb | 1 + anaconda_projects/db/project_filebrowser.db | Bin 0 -> 32768 bytes .../PROJECT2_pedro-checkpoint.ipynb | 737 ++++++++++++++++-- .../Untitled-checkpoint.ipynb | 6 + notebook/PROJECT2_pedro.ipynb | 240 ++++-- notebook/Untitled.ipynb | 33 + 7 files changed, 1210 insertions(+), 124 deletions(-) create mode 100644 .virtual_documents/notebook/PROJECT2_pedro.ipynb create mode 100644 .virtual_documents/notebook/Untitled.ipynb create mode 100644 anaconda_projects/db/project_filebrowser.db create mode 100644 notebook/.ipynb_checkpoints/Untitled-checkpoint.ipynb create mode 100644 notebook/Untitled.ipynb diff --git a/.virtual_documents/notebook/PROJECT2_pedro.ipynb b/.virtual_documents/notebook/PROJECT2_pedro.ipynb new file mode 100644 index 0000000..2752210 --- /dev/null +++ b/.virtual_documents/notebook/PROJECT2_pedro.ipynb @@ -0,0 +1,317 @@ +import pandas as pd + +df4 = pd.read_csv("df_final_demo.txt") + + +df3 = pd.read_csv("df_final_experiment_clients.txt") + + +df1 = pd.read_csv("df_final_web_data_pt_1.txt") + + +df2 = pd.read_csv("df_final_web_data_pt_2.txt") + + +df1.columns + + +df2.columns + + +dfs = pd.concat([df1, df2], ignore_index=True) + + +dfs['client_id'].unique() + + +dfs.isnull() + + +dfs.shape + + +dfs.head() + + +#checking what the type of date_time +dfs.dtypes + + +#convert object into date_time +dfs['date_time'] = pd.to_datetime(dfs['date_time']) + + +dfs.dtypes + + +df3.columns + + +df4.columns + + + + + +#df_all.head() + + + +dfs.columns + + +#dfs.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'], ascending=[True, True, True, True]) + + +dfs.head(10) + + +#dfsorted[dfsorted['client_id'] == 442857] + + +dfs['client_id'].value_counts().head() + + +dfs['date_time'].isnull().value_counts() + + +dfs['visitor_id'].isnull().value_counts() + + +dfs['date_time'].isnull().value_counts() + + +dfs['visit_id'].isnull().value_counts() + + +dfs['visitor_id'].isnull().value_counts() + + +dfs['process_step'].isnull().value_counts() + + +dfs.isnull().value_counts() + + +df4.columns + + +df3.columns + + +#df_test.isnull() + + +dfs['process_step'].unique() + + +#merfe dfs + experiment(df3) to have Variaton = Control OR test) +df_mergeexp = dfs.merge(df3, on="client_id", how="left") + + +#merge with demo(df4) (demographic data) +df_merged = df_mergeexp.merge(df4, on="client_id", how="left") + + +df_merged.shape + + +#remove duplicates +df_merged = df_merged.drop_duplicates() + + +#remove rows without any variation +df_merged = df_merged[df_merged["Variation"].isin(["Control", "Test"])] + + +df_merged.shape + + +df_merged["process_step"].value_counts() + + +df_merged["Variation"].value_counts() + + +#Give a table for Tests and another for Controls! +df_control = df_merged[df_merged["Variation"] == "Control"].copy() +df_test = df_merged[df_merged["Variation"] == "Test"].copy() + + +df_control.shape + + +df_test.shape + + +df_merged['num_accts'].nunique() + + +#################################################################### + + +#Q1: Who are the primary clients using this online process? + + +usage = (df_merged.groupby("client_id")["visit_id"].nunique().reset_index(name="n_visits")) #hor many visits per client + + +cutoff = usage["n_visits"].quantile(0.75) +usage['primary'] = (usage["n_visits"] >= cutoff).astype(int) #top25% by number of visits + + +df_merged.columns + + +df_merged = usage.merge(df_merged, left_on="client_id", right_on="client_id", how="left") #join demographics + + +df_merged.groupby("primary")[["clnt_age","clnt_tenure_yr","clnt_tenure_mnth","logons_6_mnth"]].mean() + + + +sub = df_merged[df_merged["primary"] == 1] +print(sub[["clnt_age","clnt_tenure_yr","bal","gendr"]].describe(include="all")) + + +#Average age is about 51.8, half of the primary clients are between 39 and 63. So they are midle-aged and not very young customers, it means primary clients are older than non primary clients. +#primary cients are more long standing. +#The most frequent gender is “M” + + +df_merged.groupby("primary")[['calls_6_mnth', 'logons_6_mnth']].mean() + + +#They also make more calls and more logons in 6 months, so they are more active on all channels, not only online. + + +df_merged = df_merged.rename(columns={'Variation': 'variation'}) +df3 = df3.rename(columns={'Variation': 'variation'}) +df_control = df_control.rename(columns={'Variation': 'variation'}) +df_test = df_test.rename(columns={'Variation': 'variation'}) + + +df_merged.columns + + +df_control.columns + + +df3.columns + + +dfs_var = dfs.merge(df3[['client_id', 'variation']], on='client_id', how='left') + + +dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time']) + + +step_order = {'start': 0, 'step_1' : 1, 'step_2' : 2, 'step_3' : 3, 'confirm' : 4} + + + + + +dfs_var['step_num'] = dfs_var['process_step'].map(step_order) + + +###################################################### +#COMPLETION RATE + + +last_step = dfs_var['step_num'].max() + + +visitcomp = (dfs_var.groupby(['variation', 'visit_id'])['step_num'].max().reset_index(name='max_step')) + + +visitcomp["completed"] = (visitcomp["max_step"] == last_step).astype(int) + + +completionrate = (visitcomp.groupby('variation')['completed'].mean().reset_index()) + + +completionrate + + +print('Average time spent on each steap is', completionrate) + + +############################################################################################ +#TIME SPENT ON EACH STEP + + +dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time']) + + +dfs_var = dfs_var.sort_values(['variation', 'visit_id','step_num','date_time']) + + +dfs_var['next_time'] = (dfs_var.groupby(['variation','visit_id'])['date_time'].shift) #time (-1)for the next step of the same visit + + +dfs_var["next_time"] = pd.to_datetime(dfs_var["next_time"], errors="coerce") + + +dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds() #duration in seconds + + +step_time = (dfs_var.dropna(subset=['step_durationsec']).groupby(['variation', 'process_step'])['step_durationsec'].mean().reset_index()) + + +step_time + + +################################################################## +#CHECKING RATING OF ERRORS PER EACH STEP + + +dfs_var['error_flag'] = (dfs_var['step_num'].astype(int)) + + +error_rates = (dfs_var.groupby(['variation', 'process_step'])['error_flag'].mean().reset_index().rename(columns={'error_flag' : 'error_rate'})) + + +error_rates + + +#COMPLETITION RATE - Comparing the percentage of visits who reach into final step in control vs test, highter percentage of completion in the test means better efectiveness for users to finish all the steps. +#TIME SPENT - The test version is better for user to complete the steps faster +#ERRORS RATES - The test version neither reduce or increased the frequency of errors in any step. The two versions have identical performance when talking about number of errors. + + +import numpy as np +from scipy.stats import norm + + + +#Order control test +summary = summary.set_index('variation').loc[['Control', 'Test']] +x1, x2 = summary['n_complete'].values +n1, n2 = summary['n_total'].values + + +#propoortions +p1 = x1 / n1 +p2 = x2 / n2 + + +#h0 +p_pool = (x1 + x2) / (n1 + n2) + + +#test of 2 proportions *THANK YOU CHATGPT +se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2)) +z = (p2 - p1) / se + + +p_value = 2 * (1 - norm.cdf(abs(z))) +print(f'Completion of rate Control: {p1:4f}') +print(f'Completion of rate Test: {p2:4f}') +print(f'z-statistic: {z:4f}') +print(f'p-value : {p_value:.6f}') +alpha = 0.05 +if p_value < alpha + print('Statistical diference highly significative (alpha=0.05)') +else + print('Statistical diference not significative (alpha=0.05)') diff --git a/.virtual_documents/notebook/Untitled.ipynb b/.virtual_documents/notebook/Untitled.ipynb new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/.virtual_documents/notebook/Untitled.ipynb @@ -0,0 +1 @@ + diff --git a/anaconda_projects/db/project_filebrowser.db b/anaconda_projects/db/project_filebrowser.db new file mode 100644 index 0000000000000000000000000000000000000000..e5ae8f7b27481bddaae38d7c3b92c21339f9d216 GIT binary patch literal 32768 zcmeI)!Ee$~9Ki9G4H#@4>V?yimnF*}TQp-%2tE5^l`rW+4agu^3uXlwd{COG`szd?TLNmhN7y< zBOw$;S@mPZkAxpFKhl0I`w?zxcyP^rRk{87mZp7E;%k>`M$3F%yVSm{f7e=C>zcD@ zLI42-5I_I{1Q0*~f&W8bbTg66ZfvNZhMs-Wmc4$rB~QIUxL+>T4Ba$@sXs0oVl%AX z6uETDX^2;Pt@uo@iOQiVD)n-CSEN%7+p~pfyf%YE_k%0@(~@g<<*Sh7!lPt0^HYrnilb_%c+s1\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variationcompleted
0Control0.498493
1Test0.585173
\n", + "" + ], + "text/plain": [ + " variation completed\n", + "0 Control 0.498493\n", + "1 Test 0.585173" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completionrate" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "1f83ff5d-1898-479b-8fcd-e69478d05b96", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average time spent on each steap is variation completed\n", + "0 Control 0.498493\n", + "1 Test 0.585173\n" + ] + } + ], + "source": [ + "print('Average time spent on each steap is', completionrate)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "fb188b90-2c49-4edf-8975-8c7c2f4d56f8", + "metadata": {}, + "outputs": [], + "source": [ + "############################################################################################\n", + "#TIME SPENT ON EACH STEP" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "37da593d-4aa1-466c-9a4b-f3524bfdb0b5", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "3c09e14a-51ab-4c1d-bf97-2d348c4aded6", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var = dfs_var.sort_values(['variation', 'visit_id','step_num','date_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "f4a43083-3154-46ca-823f-10e50602565c", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var['next_time'] = (dfs_var.groupby(['variation','visit_id'])['date_time'].shift) #time (-1)for the next step of the same visit" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "0b2f7798-0953-4991-9634-1984749b0703", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var[\"next_time\"] = pd.to_datetime(dfs_var[\"next_time\"], errors=\"coerce\")" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "57d91d74-ae22-46fc-a195-ad7e1b777296", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds() #duration in seconds" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "ec88f0a4-fcb2-4422-a325-52661e23016f", + "metadata": {}, + "outputs": [], + "source": [ + "step_time = (dfs_var.dropna(subset=['step_durationsec']).groupby(['variation', 'process_step'])['step_durationsec'].mean().reset_index())" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "d3178785-bcb9-4034-a4e1-1007ea34d3fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variationprocess_stepstep_durationsec
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [variation, process_step, step_durationsec]\n", + "Index: []" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "step_time" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "cd0f8815-ef1b-4311-b4d6-645d3ed1edeb", + "metadata": {}, + "outputs": [], + "source": [ + "##################################################################\n", + "#CHECKING RATING OF ERRORS PER EACH STEP" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "c6f6a958-a82a-4386-9e1f-ab5d57397e67", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var['error_flag'] = (dfs_var['step_num'].astype(int))" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "5e5c5a7c-9498-4b17-8941-8fef7839cbf1", + "metadata": {}, + "outputs": [], + "source": [ + "error_rates = (dfs_var.groupby(['variation', 'process_step'])['error_flag'].mean().reset_index().rename(columns={'error_flag' : 'error_rate'}))" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "0e39a5d1-4c37-4226-9728-0e514dbc9806", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variationprocess_steperror_rate
0Controlconfirm4.0
1Controlstart0.0
2Controlstep_11.0
3Controlstep_22.0
4Controlstep_33.0
5Testconfirm4.0
6Teststart0.0
7Teststep_11.0
8Teststep_22.0
9Teststep_33.0
\n", + "
" + ], + "text/plain": [ + " variation process_step error_rate\n", + "0 Control confirm 4.0\n", + "1 Control start 0.0\n", + "2 Control step_1 1.0\n", + "3 Control step_2 2.0\n", + "4 Control step_3 3.0\n", + "5 Test confirm 4.0\n", + "6 Test start 0.0\n", + "7 Test step_1 1.0\n", + "8 Test step_2 2.0\n", + "9 Test step_3 3.0" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "error_rates" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "8bf8ae7d-ab0e-4057-bccf-b379f2a32601", + "metadata": {}, + "outputs": [], + "source": [ + "#COMPLETITION RATE - Comparing the percentage of visits who reach into final step in control vs test, highter percentage of completion in the test means better efectiveness for users to finish all the steps.\n", + "#TIME SPENT - The test version is better for user to complete the steps faster\n", + "#ERRORS RATES - The test version neither reduce or increased the frequency of errors in any step. The two versions have identical performance when talking about number of errors." + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "a07db602-47f1-4c8c-8785-76c322d070db", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from scipy.stats import norm\n" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "1d6944b6-7979-4f60-b7e5-8886c4dc15ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " variation n_complete n_total\n", + "0 Control 16046 32189\n", + "1 Test 21731 37136\n" + ] + } + ], + "source": [ + "summary = (visitcomp.groupby(\"variation\")[\"completed\"].agg(n_complete=\"sum\", n_total=\"count\").reset_index())\n", + "\n", + "print(summary)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "086e2bee-9e59-4756-a45f-55c2a6c3fa00", + "metadata": {}, + "outputs": [], + "source": [ + "#Order control test\n", + "summary = summary.set_index('variation').loc[['Control', 'Test']]\n", + "x1, x2 = summary['n_complete'].values\n", + "n1, n2 = summary['n_total'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "ef1baf41-13b1-4f5f-bdb4-69d71ea93160", + "metadata": {}, + "outputs": [], + "source": [ + "#propoortions\n", + "p1 = x1 / n1\n", + "p2 = x2 / n2" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "5d8ace24-1dcc-4d8c-9ccf-da4fa3aff7b5", + "metadata": {}, + "outputs": [], + "source": [ + "#h0\n", + "p_pool = (x1 + x2) / (n1 + n2)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "48060fa7-6f84-4685-98c0-9f818b234cbe", + "metadata": {}, + "outputs": [], + "source": [ + "#test of 2 proportions *THANK YOU CHATGPT\n", + "se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))\n", + "z = (p2 - p1) / se" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "15c0edac-b6cd-41ec-b36f-ddeab6282675", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completion of rate Control: 0.498493\n", + "Completion of rate Test: 0.585173\n", + "z-statistic: 22.856841\n", + "p-value : 0.000000\n", + "Statistical diference highly significative (alpha=0.05)\n" + ] + } + ], + "source": [ + "p_value = 2 * (1 - norm.cdf(abs(z)))\n", + "print(f'Completion of rate Control: {p1:4f}')\n", + "print(f'Completion of rate Test: {p2:4f}')\n", + "print(f'z-statistic: {z:4f}')\n", + "print(f'p-value : {p_value:.6f}')\n", + "alpha = 0.05\n", + "if p_value < alpha:\n", + " print('Statistical diference highly significative (alpha=0.05)')\n", + "else:\n", + " print('Statistical diference not significative (alpha=0.05)')" + ] + }, + { + "cell_type": "markdown", + "id": "b6336d5f-7c76-4b04-bded-84819c26634d", + "metadata": {}, + "source": [ + "Next step: MAKE GRAPHIC FOR THIS\n", + "and: Carried out an analysis ensuring that the observed increase in completion rate from the A/B test meets or exceeds this 5% threshold.\n", + "Carried out another hypothesis test of your choosing.\n", + "Evaluated the experiment by answering questions relating to:\n", + "\n", + " Design Effectiveness\n", + " Duration\n", + " Additional Data Needs\n", + "\n" + ] } ], "metadata": { diff --git a/notebook/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/notebook/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/notebook/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebook/PROJECT2_pedro.ipynb b/notebook/PROJECT2_pedro.ipynb index 29cdc42..00ce796 100644 --- a/notebook/PROJECT2_pedro.ipynb +++ b/notebook/PROJECT2_pedro.ipynb @@ -1767,6 +1767,16 @@ { "cell_type": "code", "execution_count": 76, + "id": "0b2f7798-0953-4991-9634-1984749b0703", + "metadata": {}, + "outputs": [], + "source": [ + "dfs_var[\"next_time\"] = pd.to_datetime(dfs_var[\"next_time\"], errors=\"coerce\")" + ] + }, + { + "cell_type": "code", + "execution_count": 77, "id": "57d91d74-ae22-46fc-a195-ad7e1b777296", "metadata": {}, "outputs": [], @@ -1776,7 +1786,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 78, "id": "ec88f0a4-fcb2-4422-a325-52661e23016f", "metadata": {}, "outputs": [], @@ -1786,7 +1796,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 79, "id": "d3178785-bcb9-4034-a4e1-1007ea34d3fd", "metadata": {}, "outputs": [ @@ -1817,85 +1827,17 @@ " \n", " \n", " \n", - " \n", - " 0\n", - " Control\n", - " confirm\n", - " 153.740233\n", - " \n", - " \n", - " 1\n", - " Control\n", - " start\n", - " 49.744712\n", - " \n", - " \n", - " 2\n", - " Control\n", - " step_1\n", - " 45.093323\n", - " \n", - " \n", - " 3\n", - " Control\n", - " step_2\n", - " 86.703724\n", - " \n", - " \n", - " 4\n", - " Control\n", - " step_3\n", - " 140.788394\n", - " \n", - " \n", - " 5\n", - " Test\n", - " confirm\n", - " 246.065397\n", - " \n", - " \n", - " 6\n", - " Test\n", - " start\n", - " 38.240943\n", - " \n", - " \n", - " 7\n", - " Test\n", - " step_1\n", - " 60.130113\n", - " \n", - " \n", - " 8\n", - " Test\n", - " step_2\n", - " 89.756501\n", - " \n", - " \n", - " 9\n", - " Test\n", - " step_3\n", - " 139.834792\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " variation process_step step_durationsec\n", - "0 Control confirm 153.740233\n", - "1 Control start 49.744712\n", - "2 Control step_1 45.093323\n", - "3 Control step_2 86.703724\n", - "4 Control step_3 140.788394\n", - "5 Test confirm 246.065397\n", - "6 Test start 38.240943\n", - "7 Test step_1 60.130113\n", - "8 Test step_2 89.756501\n", - "9 Test step_3 139.834792" + "Empty DataFrame\n", + "Columns: [variation, process_step, step_durationsec]\n", + "Index: []" ] }, - "execution_count": 78, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -1906,7 +1848,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "id": "cd0f8815-ef1b-4311-b4d6-645d3ed1edeb", "metadata": {}, "outputs": [], @@ -1917,7 +1859,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 81, "id": "c6f6a958-a82a-4386-9e1f-ab5d57397e67", "metadata": {}, "outputs": [], @@ -1927,7 +1869,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 82, "id": "5e5c5a7c-9498-4b17-8941-8fef7839cbf1", "metadata": {}, "outputs": [], @@ -1937,7 +1879,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 83, "id": "0e39a5d1-4c37-4226-9728-0e514dbc9806", "metadata": {}, "outputs": [ @@ -2046,7 +1988,7 @@ "9 Test step_3 3.0" ] }, - "execution_count": 81, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } @@ -2054,6 +1996,146 @@ "source": [ "error_rates" ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "8bf8ae7d-ab0e-4057-bccf-b379f2a32601", + "metadata": {}, + "outputs": [], + "source": [ + "#COMPLETITION RATE - Comparing the percentage of visits who reach into final step in control vs test, highter percentage of completion in the test means better efectiveness for users to finish all the steps.\n", + "#TIME SPENT - The test version is better for user to complete the steps faster\n", + "#ERRORS RATES - The test version neither reduce or increased the frequency of errors in any step. The two versions have identical performance when talking about number of errors." + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "a07db602-47f1-4c8c-8785-76c322d070db", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from scipy.stats import norm\n" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "1d6944b6-7979-4f60-b7e5-8886c4dc15ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " variation n_complete n_total\n", + "0 Control 16046 32189\n", + "1 Test 21731 37136\n" + ] + } + ], + "source": [ + "summary = (visitcomp.groupby(\"variation\")[\"completed\"].agg(n_complete=\"sum\", n_total=\"count\").reset_index())\n", + "\n", + "print(summary)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "086e2bee-9e59-4756-a45f-55c2a6c3fa00", + "metadata": {}, + "outputs": [], + "source": [ + "#Order control test\n", + "summary = summary.set_index('variation').loc[['Control', 'Test']]\n", + "x1, x2 = summary['n_complete'].values\n", + "n1, n2 = summary['n_total'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "ef1baf41-13b1-4f5f-bdb4-69d71ea93160", + "metadata": {}, + "outputs": [], + "source": [ + "#propoortions\n", + "p1 = x1 / n1\n", + "p2 = x2 / n2" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "5d8ace24-1dcc-4d8c-9ccf-da4fa3aff7b5", + "metadata": {}, + "outputs": [], + "source": [ + "#h0\n", + "p_pool = (x1 + x2) / (n1 + n2)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "48060fa7-6f84-4685-98c0-9f818b234cbe", + "metadata": {}, + "outputs": [], + "source": [ + "#test of 2 proportions *THANK YOU CHATGPT\n", + "se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))\n", + "z = (p2 - p1) / se" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "15c0edac-b6cd-41ec-b36f-ddeab6282675", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completion of rate Control: 0.498493\n", + "Completion of rate Test: 0.585173\n", + "z-statistic: 22.856841\n", + "p-value : 0.000000\n", + "Statistical diference highly significative (alpha=0.05)\n" + ] + } + ], + "source": [ + "p_value = 2 * (1 - norm.cdf(abs(z)))\n", + "print(f'Completion of rate Control: {p1:4f}')\n", + "print(f'Completion of rate Test: {p2:4f}')\n", + "print(f'z-statistic: {z:4f}')\n", + "print(f'p-value : {p_value:.6f}')\n", + "alpha = 0.05\n", + "if p_value < alpha:\n", + " print('Statistical diference highly significative (alpha=0.05)')\n", + "else:\n", + " print('Statistical diference not significative (alpha=0.05)')" + ] + }, + { + "cell_type": "markdown", + "id": "b6336d5f-7c76-4b04-bded-84819c26634d", + "metadata": {}, + "source": [ + "Next step: MAKE GRAPHIC FOR THIS\n", + "and: Carried out an analysis ensuring that the observed increase in completion rate from the A/B test meets or exceeds this 5% threshold.\n", + "Carried out another hypothesis test of your choosing.\n", + "Evaluated the experiment by answering questions relating to:\n", + "\n", + " Design Effectiveness\n", + " Duration\n", + " Additional Data Needs\n", + "\n" + ] } ], "metadata": { diff --git a/notebook/Untitled.ipynb b/notebook/Untitled.ipynb new file mode 100644 index 0000000..b41851f --- /dev/null +++ b/notebook/Untitled.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "139f8b74-4220-4cbd-b2b9-666f8b7bfaf5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}