Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
317 changes: 317 additions & 0 deletions .virtual_documents/notebook/PROJECT2_pedro.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,317 @@
import pandas as pd

# Demographic data per client (age, tenure, balance, ...).
df4 = pd.read_csv("df_final_demo.txt")


# Experiment assignment per client: Variation = Control or Test.
df3 = pd.read_csv("df_final_experiment_clients.txt")


# Web event log, part 1 (client/visitor/visit ids, process_step, date_time).
df1 = pd.read_csv("df_final_web_data_pt_1.txt")


# Web event log, part 2 (same schema as part 1).
df2 = pd.read_csv("df_final_web_data_pt_2.txt")


df1.columns


df2.columns


# Stack the two web-log parts into one event table.
dfs = pd.concat([df1, df2], ignore_index=True)


dfs['client_id'].unique()


dfs.isnull()


dfs.shape


dfs.head()


# checking the dtype of date_time (read in as object/string)
dfs.dtypes


# convert the object column into real datetimes
dfs['date_time'] = pd.to_datetime(dfs['date_time'])


dfs.dtypes


df3.columns


df4.columns




#df_all.head()



dfs.columns


#dfs.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'], ascending=[True, True, True, True])


dfs.head(10)


#dfsorted[dfsorted['client_id'] == 442857]


# Clients with the most logged events.
dfs['client_id'].value_counts().head()


# Null checks per column (True/False counts).
dfs['date_time'].isnull().value_counts()


dfs['visitor_id'].isnull().value_counts()


# NOTE(review): duplicate of the date_time null check above.
dfs['date_time'].isnull().value_counts()


dfs['visit_id'].isnull().value_counts()


# NOTE(review): duplicate of the visitor_id null check above.
dfs['visitor_id'].isnull().value_counts()


dfs['process_step'].isnull().value_counts()


dfs.isnull().value_counts()


df4.columns


df3.columns


#df_test.isnull()


# Distinct funnel steps: start, step_1, step_2, step_3, confirm.
dfs['process_step'].unique()


# merge dfs + experiment (df3) so each event carries Variation (Control or Test)
df_mergeexp = dfs.merge(df3, on="client_id", how="left")


# merge with demo (df4) to attach demographic data per client
df_merged = df_mergeexp.merge(df4, on="client_id", how="left")


df_merged.shape


# remove exact duplicate rows
df_merged = df_merged.drop_duplicates()


# keep only rows with a known variation (drops NaN / unassigned clients)
df_merged = df_merged[df_merged["Variation"].isin(["Control", "Test"])]


df_merged.shape


df_merged["process_step"].value_counts()


df_merged["Variation"].value_counts()


# One table for Test clients and another for Control clients.
df_control = df_merged[df_merged["Variation"] == "Control"].copy()
df_test = df_merged[df_merged["Variation"] == "Test"].copy()


df_control.shape


df_test.shape


df_merged['num_accts'].nunique()


####################################################################


# Q1: Who are the primary clients using this online process?


# how many distinct visits per client
usage = (df_merged.groupby("client_id")["visit_id"].nunique().reset_index(name="n_visits"))


# "primary" = top 25% of clients by number of visits (>= 75th percentile)
cutoff = usage["n_visits"].quantile(0.75)
usage['primary'] = (usage["n_visits"] >= cutoff).astype(int)


df_merged.columns


# join the primary flag back onto the event-level data (demographics)
df_merged = usage.merge(df_merged, left_on="client_id", right_on="client_id", how="left")


# compare means for primary vs non-primary clients
df_merged.groupby("primary")[["clnt_age","clnt_tenure_yr","clnt_tenure_mnth","logons_6_mnth"]].mean()



sub = df_merged[df_merged["primary"] == 1]
print(sub[["clnt_age","clnt_tenure_yr","bal","gendr"]].describe(include="all"))


# Average age is about 51.8, half of the primary clients are between 39 and 63. So they are middle-aged, not very young customers — primary clients are older than non-primary clients.
# Primary clients are also more long-standing.
# The most frequent gender is "M".


df_merged.groupby("primary")[['calls_6_mnth', 'logons_6_mnth']].mean()


# They also make more calls and more logons in 6 months, so they are more active on all channels, not only online.


# Normalize the column name 'Variation' -> 'variation' everywhere.
df_merged = df_merged.rename(columns={'Variation': 'variation'})
df3 = df3.rename(columns={'Variation': 'variation'})
df_control = df_control.rename(columns={'Variation': 'variation'})
df_test = df_test.rename(columns={'Variation': 'variation'})


df_merged.columns


df_control.columns


df3.columns


# Event log tagged with each client's variation (Control/Test or NaN).
dfs_var = dfs.merge(df3[['client_id', 'variation']], on='client_id', how='left')


dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])


# Ordinal encoding of the funnel steps; 'confirm' (4) is the final step.
step_order = {'start': 0, 'step_1' : 1, 'step_2' : 2, 'step_3' : 3, 'confirm' : 4}




dfs_var['step_num'] = dfs_var['process_step'].map(step_order)


######################################################
# COMPLETION RATE
# A visit counts as "completed" when it reached the final funnel step
# (the highest step number in the data; 4 == 'confirm' per step_order).


last_step = dfs_var['step_num'].max()


# Furthest step reached in each (variation, visit).
visitcomp = (dfs_var.groupby(['variation', 'visit_id'])['step_num'].max().reset_index(name='max_step'))


visitcomp["completed"] = (visitcomp["max_step"] == last_step).astype(int)


# Share of visits that reached the final step, per variation.
completionrate = (visitcomp.groupby('variation')['completed'].mean().reset_index())


completionrate


# FIX: the original message said "Average time spent on each steap", but
# the value printed here is the completion rate per variation.
print('Completion rate per variation is', completionrate)


############################################################################################
# TIME SPENT ON EACH STEP


dfs_var['date_time'] = pd.to_datetime(dfs_var['date_time'])


dfs_var = dfs_var.sort_values(['variation', 'visit_id','step_num','date_time'])


# FIX: the original assigned the bound method object (`.shift` without
# calling it), so every next_time was a method, not a timestamp.
# shift(-1) pulls the timestamp of the NEXT event of the same visit
# onto the current row.
dfs_var['next_time'] = (dfs_var.groupby(['variation','visit_id'])['date_time'].shift(-1))


# shift(-1) already yields datetimes; keep the coercion as a safety net.
dfs_var["next_time"] = pd.to_datetime(dfs_var["next_time"], errors="coerce")


# Duration of the current step in seconds (NaN on the last event of a visit).
dfs_var['step_durationsec'] = (dfs_var['next_time'] - dfs_var['date_time']).dt.total_seconds()


# Mean seconds spent on each step, per variation; rows without a next
# event are dropped.
step_time = (dfs_var.dropna(subset=['step_durationsec']).groupby(['variation', 'process_step'])['step_durationsec'].mean().reset_index())


step_time


##################################################################
# CHECKING RATE OF ERRORS PER EACH STEP
# An "error" here = the user moved BACKWARDS in the flow: the next event
# of the same visit has a lower step number than the current one.


# FIX: the original used `step_num.astype(int)` as the flag, so the
# "error_rate" was just the average step number — not an error rate.
# Compare each event with the chronologically NEXT event of its visit.
_err = dfs_var.sort_values(['variation', 'visit_id', 'date_time'])
_next_step = _err.groupby(['variation', 'visit_id'])['step_num'].shift(-1)
# 1 when the visit goes backwards after this event, else 0 (NaN compares
# False, so last events of a visit are flagged 0).
dfs_var['error_flag'] = (_next_step < _err['step_num']).astype(int)


error_rates = (dfs_var.groupby(['variation', 'process_step'])['error_flag'].mean().reset_index().rename(columns={'error_flag' : 'error_rate'}))


error_rates


# COMPLETION RATE - Comparing the percentage of visits that reach the final step in Control vs Test; a higher completion percentage in Test means the new design is more effective at getting users through all the steps.
# TIME SPENT - The Test version lets users complete the steps faster.
# ERROR RATES - NOTE(review): re-evaluate this after the flag fix above —
# the previous "identical performance" conclusion was based on the broken
# flag (mean step number) and is not meaningful.


import numpy as np
from scipy.stats import norm


# FIX: `summary` was never built in the original (NameError). Build the
# per-variation completion counts from visitcomp (COMPLETION RATE section):
# n_complete = visits that reached the final step, n_total = all visits.
summary = (visitcomp.groupby('variation')
           .agg(n_complete=('completed', 'sum'),
                n_total=('completed', 'count'))
           .reset_index())


# Order Control first, Test second
summary = summary.set_index('variation').loc[['Control', 'Test']]
x1, x2 = summary['n_complete'].values
n1, n2 = summary['n_total'].values


# observed proportions
p1 = x1 / n1
p2 = x2 / n2


# pooled proportion under H0: p1 == p2
p_pool = (x1 + x2) / (n1 + n2)


# two-proportion z-test
se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))
z = (p2 - p1) / se


# two-sided p-value
p_value = 2 * (1 - norm.cdf(abs(z)))
# FIX: format specs were ':4f' (field width 4) instead of ':.4f' (4 decimals),
# and the messages had typos ("Completion of rate", "diference significative").
print(f'Completion rate Control: {p1:.4f}')
print(f'Completion rate Test: {p2:.4f}')
print(f'z-statistic: {z:.4f}')
print(f'p-value : {p_value:.6f}')
alpha = 0.05
# FIX: the original if/else was missing colons (SyntaxError).
if p_value < alpha:
    print('Statistical difference highly significant (alpha=0.05)')
else:
    print('Statistical difference not significant (alpha=0.05)')
1 change: 1 addition & 0 deletions .virtual_documents/notebook/Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Binary file added anaconda_projects/db/project_filebrowser.db
Binary file not shown.
Loading