Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
317 changes: 317 additions & 0 deletions .virtual_documents/notebook/PROJECT2_pedro.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,317 @@
import pandas as pd

# Demographic data per client (age, tenure, balance, ...).
df4 = pd.read_csv("df_final_demo.txt")


# Experiment assignment per client: Variation = Control or Test.
df3 = pd.read_csv("df_final_experiment_clients.txt")


# Web event log, part 1 (client/visitor/visit ids, process_step, date_time).
df1 = pd.read_csv("df_final_web_data_pt_1.txt")


# Web event log, part 2 (same schema as part 1).
df2 = pd.read_csv("df_final_web_data_pt_2.txt")


df1.columns


df2.columns


# Stack the two web-log parts into one event table.
dfs = pd.concat([df1, df2], ignore_index=True)


dfs['client_id'].unique()


dfs.isnull()


dfs.shape


dfs.head()


# checking the dtype of date_time (read in as object/string)
dfs.dtypes


# convert the object column into real datetimes
dfs['date_time'] = pd.to_datetime(dfs['date_time'])


dfs.dtypes


df3.columns


df4.columns




#df_all.head()



dfs.columns


#dfs.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'], ascending=[True, True, True, True])


dfs.head(10)


#dfsorted[dfsorted['client_id'] == 442857]


# Clients with the most logged events.
dfs['client_id'].value_counts().head()


# Null checks per column (True/False counts).
dfs['date_time'].isnull().value_counts()


dfs['visitor_id'].isnull().value_counts()


# NOTE(review): duplicate of the date_time null check above.
dfs['date_time'].isnull().value_counts()


dfs['visit_id'].isnull().value_counts()


# NOTE(review): duplicate of the visitor_id null check above.
dfs['visitor_id'].isnull().value_counts()


dfs['process_step'].isnull().value_counts()


dfs.isnull().value_counts()


df4.columns


df3.columns


#df_test.isnull()


# Distinct funnel steps: start, step_1, step_2, step_3, confirm.
dfs['process_step'].unique()


# merge dfs + experiment (df3) so each event carries Variation (Control or Test)
df_mergeexp = dfs.merge(df3, on="client_id", how="left")


# merge with demo (df4) to attach demographic data per client
df_merged = df_mergeexp.merge(df4, on="client_id", how="left")


df_merged.shape


# remove exact duplicate rows
df_merged = df_merged.drop_duplicates()


# keep only rows with a known variation (drops NaN / unassigned clients)
df_merged = df_merged[df_merged["Variation"].isin(["Control", "Test"])]


df_merged.shape


df_merged["process_step"].value_counts()


df_merged["Variation"].value_counts()


# One table for Test clients and another for Control clients.
df_control = df_merged[df_merged["Variation"] == "Control"].copy()
df_test = df_merged[df_merged["Variation"] == "Test"].copy()


df_control.shape


df_test.shape


df_merged['num_accts'].nunique()


####################################################################


# Q1: Who are the primary clients using this online process?


# how many distinct visits per client
usage = (df_merged.groupby("client_id")["visit_id"].nunique().reset_index(name="n_visits"))


# "primary" = top 25% of clients by number of visits (>= 75th percentile)
cutoff = usage["n_visits"].quantile(0.75)
usage['primary'] = (usage["n_visits"] >= cutoff).astype(int)


df_merged.columns


# join the primary flag back onto the event-level data (demographics)
df_merged = usage.merge(df_merged, left_on="client_id", right_on="client_id", how="left")


# compare means for primary vs non-primary clients
df_merged.groupby("primary")[["clnt_age","clnt_tenure_yr","clnt_tenure_mnth","logons_6_mnth"]].mean()



sub = df_merged[df_merged["primary"] == 1]
print(sub[["clnt_age","clnt_tenure_yr","bal","gendr"]].describe(include="all"))


# Average age is about 51.8, half of the primary clients are between 39 and 63. So they are middle-aged, not very young customers — primary clients are older than non-primary clients.
# Primary clients are also more long-standing.
# The most frequent gender is "M".


df_merged.groupby("primary")[['calls_6_mnth', 'logons_6_mnth']].mean()


# They also make more calls and more logons in 6 months, so they are more active on all channels, not only online.


# Normalize the column name 'Variation' -> 'variation' everywhere.
df_merged = df_merged.rename(columns={'Variation': 'variation'})
df3 = df3.rename(columns={'Variation': 'variation'})
df_control = df_control.rename(columns={'Variation': 'variation'})
df_test = df_test.rename(columns={'Variation': 'variation'})


df_merged.columns


df_control.columns


df3.columns


# Event log tagged with each client's variation (Control/Test or NaN).
dfs_var = dfs.merge(df3[['client_id', 'variation']], on='client_id', how='left')


dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])


# Ordinal encoding of the funnel steps; 'confirm' (4) is the final step.
step_order = {'start': 0, 'step_1' : 1, 'step_2' : 2, 'step_3' : 3, 'confirm' : 4}




dfs_var['step_num'] = dfs_var['process_step'].map(step_order)


######################################################
# COMPLETION RATE
# A visit counts as "completed" when it reached the final funnel step
# (the highest step number in the data; 4 == 'confirm' per step_order).


last_step = dfs_var['step_num'].max()


# Furthest step reached in each (variation, visit).
visitcomp = (dfs_var.groupby(['variation', 'visit_id'])['step_num'].max().reset_index(name='max_step'))


visitcomp["completed"] = (visitcomp["max_step"] == last_step).astype(int)


# Share of visits that reached the final step, per variation.
completionrate = (visitcomp.groupby('variation')['completed'].mean().reset_index())


completionrate


# FIX: the original message said "Average time spent on each steap", but
# the value printed here is the completion rate per variation.
print('Completion rate per variation is', completionrate)


############################################################################################
# TIME SPENT ON EACH STEP


dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])


dfs_var = dfs_var.sort_values(['variation', 'visit_id','step_num','date_time'])


# FIX: the original assigned the bound method object (`.shift` without
# calling it), so every next_time was a method, not a timestamp.
# shift(-1) pulls the timestamp of the NEXT event of the same visit
# onto the current row.
dfs_var['next_time'] = (dfs_var.groupby(['variation','visit_id'])['date_time'].shift(-1))


# shift(-1) already yields datetimes; keep the coercion as a safety net.
dfs_var["next_time"] = pd.to_datetime(dfs_var["next_time"], errors="coerce")


# Duration of the current step in seconds (NaN on the last event of a visit).
dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds()


# Mean seconds spent on each step, per variation; rows without a next
# event are dropped.
step_time = (dfs_var.dropna(subset=['step_durationsec']).groupby(['variation', 'process_step'])['step_durationsec'].mean().reset_index())


step_time


##################################################################
# CHECKING RATE OF ERRORS PER EACH STEP
# An "error" here = the user moved BACKWARDS in the flow: the next event
# of the same visit has a lower step number than the current one.


# FIX: the original used `step_num.astype(int)` as the flag, so the
# "error_rate" was just the average step number — not an error rate.
# Compare each event with the chronologically NEXT event of its visit.
_err = dfs_var.sort_values(['variation', 'visit_id', 'date_time'])
_next_step = _err.groupby(['variation', 'visit_id'])['step_num'].shift(-1)
# 1 when the visit goes backwards after this event, else 0 (NaN compares
# False, so last events of a visit are flagged 0).
dfs_var['error_flag'] = (_next_step < _err['step_num']).astype(int)


error_rates = (dfs_var.groupby(['variation', 'process_step'])['error_flag'].mean().reset_index().rename(columns={'error_flag' : 'error_rate'}))


error_rates


# COMPLETION RATE - Comparing the percentage of visits that reach the final step in Control vs Test; a higher completion percentage in Test means the new design is more effective at getting users through all the steps.
# TIME SPENT - The Test version lets users complete the steps faster.
# ERROR RATES - NOTE(review): re-evaluate this after the flag fix above —
# the previous "identical performance" conclusion was based on the broken
# flag (mean step number) and is not meaningful.


import numpy as np
from scipy.stats import norm


# FIX: `summary` was never built in the original (NameError). Build the
# per-variation completion counts from visitcomp (COMPLETION RATE section):
# n_complete = visits that reached the final step, n_total = all visits.
summary = (visitcomp.groupby('variation')
           .agg(n_complete=('completed', 'sum'),
                n_total=('completed', 'count'))
           .reset_index())


# Order Control first, Test second
summary = summary.set_index('variation').loc[['Control', 'Test']]
x1, x2 = summary['n_complete'].values
n1, n2 = summary['n_total'].values


# observed proportions
p1 = x1 / n1
p2 = x2 / n2


# pooled proportion under H0: p1 == p2
p_pool = (x1 + x2) / (n1 + n2)


# two-proportion z-test
se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))
z = (p2 - p1) / se


# two-sided p-value
p_value = 2 * (1 - norm.cdf(abs(z)))
# FIX: format specs were ':4f' (field width 4) instead of ':.4f' (4 decimals),
# and the messages had typos ("Completion of rate", "diference significative").
print(f'Completion rate Control: {p1:.4f}')
print(f'Completion rate Test: {p2:.4f}')
print(f'z-statistic: {z:.4f}')
print(f'p-value : {p_value:.6f}')
alpha = 0.05
# FIX: the original if/else was missing colons (SyntaxError).
if p_value < alpha:
    print('Statistical difference highly significant (alpha=0.05)')
else:
    print('Statistical difference not significant (alpha=0.05)')
1 change: 1 addition & 0 deletions .virtual_documents/notebook/Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Binary file added anaconda_projects/db/project_filebrowser.db
Binary file not shown.
Loading