Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Personal
Data/
Tests/
tester_2.ipynb

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
Expand Down
85 changes: 85 additions & 0 deletions cleaning_enrollments_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import pandas as pd
import numpy as np

class EnrollmentsCleaning:
    """Clean a raw enrollments table and fill in missing 'ATP Cohort' values.

    The cleaning steps are exposed as individual methods and chained together
    by :meth:`Get_clean_data`.

    Args:
        raw_data: DataFrame holding the raw enrollments. The methods below read
            the columns 'Full Name', 'Auto Id', 'Service', 'Outcome',
            'ATP Cohort' and the projected/actual start/end date columns.
    """

    def __init__(self, raw_data):
        self.raw_data = raw_data

    def Drop_columns(self, df):
        """Return a new DataFrame without the 'Full Name' column."""
        columns_to_drop = ['Full Name']
        return df.drop(columns=columns_to_drop)

    def Fix_nan_values(self, df):
        """Replace missing values in the date and outcome columns with 'NA'.

        The frame is modified in place and also returned. 'ATP Cohort' is not
        touched here; its missing values are resolved by :meth:`Find_cohort`.
        """
        nan_value_substitute = 'NA'
        columns_to_fix = [
            'Projected Start Date',
            'Actual Start Date',
            'Projected End Date',
            'Actual End Date',
            'Outcome',
        ]
        for column in columns_to_fix:
            df[column] = df[column].fillna(nan_value_substitute)
        return df

    def Rename_values(self, df):
        """Rename the service 'Data Analytics 2' to 'Data Analysis 2'.

        The frame is modified in place and also returned.
        """
        df.loc[df['Service'] == 'Data Analytics 2', 'Service'] = 'Data Analysis 2'
        return df

    def Delete_values(self, df):
        """Return a copy of *df* without the rows whose service is not needed.

        'Referral to External Service' and 'Supportive Services Referral' are
        removed because they do not have a "Projected Start Date".
        """
        values_not_needed = {
            'Service': [
                'Software Development 1',
                'Software Development 2',
                'Web Development 1',
                'Web Development 2',
                'Data Analysis 1',
                'Data Analysis 2',
                'Referral to External Service',
                'Supportive Services Referral',
            ],
        }
        for column, values in values_not_needed.items():
            df = df[~df[column].isin(values)]
        # Return an independent frame so later column assignments
        # (e.g. in Set_data_types) do not write into a slice of the input.
        return df.copy()

    def Set_data_types(self, df):
        """Cast the known columns to their target dtypes and return *df*.

        The frame is modified in place.
        """
        # TODO: 'Projected Start Date', 'Actual Start Date', 'Projected End Date',
        # 'Actual End Date' are all datetime types but hold the 'NA' placeholder,
        # so they are kept as strings for now.
        column_datatype: dict = {
            'Auto Id': str,
            'KY Region': str,
            'Assessment ID': str,
            'EnrollmentId': str,
            'Enrollment Service Name': str,
            'Service': str,
            'Projected Start Date': str,
            'Actual Start Date': str,
            'Projected End Date': str,
            'Actual End Date': str,
            'Outcome': str,
            'ATP Cohort': 'datetime64[ns]',
        }
        for column, dtype in column_datatype.items():
            df[column] = df[column].astype(dtype)
        return df

    def Find_cohort(self, id: str, projected_start_date: str, cohort_to_find: str, df_to_clean: pd.DataFrame):
        """Return the cohort for one enrollment row, estimating it when missing.

        Args:
            id: 'Auto Id' of the student the row belongs to.
            projected_start_date: projected start date of the row's module.
            cohort_to_find: the row's current 'ATP Cohort' value (may be missing).
            df_to_clean: the full table, used to look up the student's other rows.

        Returns:
            * ``cohort_to_find`` as ``np.datetime64`` when it is not missing;
            * the student's only known cohort when exactly one is known;
            * otherwise the latest known cohort that is not after the projected
              start date, falling back to the projected start date itself when
              no known cohort precedes it (or none is known at all);
            * NaT when the projected start date cannot be parsed (e.g. 'NA').
        """
        ## Q: What to do with Service: ['Referral to External Service', 'Supportive Services Referral']
        if not pd.isna(cohort_to_find):
            return np.datetime64(cohort_to_find)

        student_rows = df_to_clean[df_to_clean['Auto Id'] == id]
        # The student can have several rows with a missing cohort; ignore them.
        known_cohorts = student_rows['ATP Cohort'].dropna().astype('datetime64[ns]').unique()
        if len(known_cohorts) == 1:
            return known_cohorts[0]

        estimated = pd.to_datetime(projected_start_date, errors='coerce')
        if pd.isna(estimated):
            # Nothing to anchor the estimate on; leave the cohort unknown.
            return np.datetime64('NaT')
        estimated_module_date = estimated.to_datetime64()

        known_cohorts = np.sort(np.asarray(known_cohorts, dtype='datetime64[ns]'))
        not_after_start = known_cohorts[known_cohorts <= estimated_module_date]
        if not_after_start.size:
            return not_after_start[-1]
        return estimated_module_date

    def Get_clean_data(self):
        """Run every cleaning step on ``raw_data`` and return the cleaned frame."""
        df = self.raw_data
        df = self.Drop_columns(df)
        df = self.Fix_nan_values(df)
        df = self.Rename_values(df)
        df = self.Delete_values(df)
        df = self.Set_data_types(df)
        df['ATP Cohort'] = df.apply(
            lambda row: self.Find_cohort(row['Auto Id'], row['Projected Start Date'], row['ATP Cohort'], df),
            axis=1,
        )
        return df
52 changes: 52 additions & 0 deletions completion_rate_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pandas as pd

class Completion_rate_data:
    """Compute per-module outcome proportions from cleaned enrollment data.

    Args:
        data: DataFrame with at least the columns 'Service', 'Outcome' and
            'ATP Cohort'.
    """

    def __init__(self, data):
        self.data = data
        self.__pathways = [
            'Web Development M1',
            'Web Development M2',
            'Web Development M3',
            'Web Development M4',
            'Data Analysis M1',
            'Data Analysis M2',
            'Data Analysis M3',
            'Data Analysis M4',
            'Software Development M1',
            'Software Development M2',
            'Software Development M3',
            'Software Development M4',
            'Quality Assurance M1',
            'Quality Assurance M2',
            'Quality Assurance M3',
            'Quality Assurance M4',
            'User Experience M1',
            'User Experience M2',
            'User Experience M3',
            'User Experience M4',
        ]

    def Get_completion_percentages(self, cohort: str = 'All cohorts') -> pd.DataFrame:
        """Return the share of each outcome for every module.

        Args:
            cohort: 'All cohorts' to use every row, otherwise a date string that
                is converted with ``pd.Timestamp`` and matched against
                'ATP Cohort'.

        Returns:
            One row per module with a 'Module' column, one column per outcome
            (proportion between 0 and 1, 0 when the outcome never occurs for
            that module) and a 'Pathway' column holding the module name without
            its trailing ' M<n>' part, so rows can be grouped by pathway.
        """
        if cohort == 'All cohorts':
            data = self.data
        else:
            data = self.data[self.data['ATP Cohort'] == pd.Timestamp(cohort)]

        # Series.to_dict() gives {outcome: proportion} directly, without relying
        # on the column names that reset_index() produces (they differ between
        # pandas versions).
        completion_by_module = {
            path: data.loc[data['Service'] == path, 'Outcome'].value_counts(normalize=True).to_dict()
            for path in self.__pathways
        }

        result_df = (
            pd.DataFrame(completion_by_module)
            .transpose()
            .fillna(0)
            .rename_axis('Module')
            .reset_index()
        )
        # Everything before the last space is the pathway name.
        result_df['Pathway'] = result_df['Module'].apply(lambda module: module.rsplit(' ', 1)[0])
        return result_df
        # TODO: Add test

    def Get_pathways_name(self, df: pd.DataFrame) -> list:
        """Return the distinct values of the 'Pathway' column, in order of appearance."""
        return list(df['Pathway'].unique())

41 changes: 41 additions & 0 deletions most_common_pathways_taken_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd

class Most_common_pathways_taken_data:
    """Count how often each starting (M1) module is taken, overall or per cohort.

    Args:
        data: DataFrame with at least the columns 'Service' and 'ATP Cohort'.
    """

    def __init__(self, data):
        self.data = data
        self.__starter_pathways = [
            'Web Development M1',
            'Data Analysis M1',
            'Software Development M1',
            'Quality Assurance M1',
            'User Experience M1',
        ]
        # Computed once; every query below works on the starter modules only.
        self.starter_only_df = self.Get_starting_pathways()

    def Get_starting_pathways(self):
        """
        Returns the rows of ``self.data`` whose service is a starting module.

        Return:
            pandas.DataFrame
        """
        mask_starter_pathways = self.data['Service'].isin(self.__starter_pathways)
        return self.data[mask_starter_pathways]

    def Get_cohorts_list(self):
        """
        Returns the cohorts present in the starter rows, as strings.

        Values that are missing or cannot be parsed as dates (NaT, 'NA', ...)
        are left out, so they never show up as a selectable cohort.

        Return:
            list of str: 'All cohorts' followed by the distinct cohort dates in
            ascending order.
        """
        cohort_dates = pd.to_datetime(self.starter_only_df['ATP Cohort'], errors='coerce').dropna()
        cohorts = list(cohort_dates.sort_values(ascending=True).astype(str).unique())
        cohorts.insert(0, 'All cohorts')
        return cohorts

    def Get_data_by_cohort(self, cohort: str = 'All cohorts') -> pd.DataFrame:
        """
        Returns how many starter rows exist per service.

        Args:
            cohort: 'All cohorts' to count every starter row, otherwise a date
                string; only rows whose 'ATP Cohort' is that date are counted.

        Return:
            pandas.DataFrame with a 'Service' column and a count column.
        """
        df = self.starter_only_df
        if cohort != 'All cohorts':
            # Compare as dates so the match does not depend on how the cohort
            # column is stored (datetime values or date strings).
            cohort_dates = pd.to_datetime(df['ATP Cohort'], errors='coerce')
            df = df[cohort_dates == pd.Timestamp(cohort)]
        return df.value_counts('Service').reset_index()
Loading
Loading