Skip to content

Commit 28d3ed2

Browse files
authored
Merge pull request #13 from dmorton714/historical_dashboard
Added DocStrings to all the methods and functions
2 parents 302eee6 + 90a2a67 commit 28d3ed2

File tree

6 files changed

+582
-11
lines changed

6 files changed

+582
-11
lines changed

src/Carmen_WORCEmployment.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import pandas as pd
22

3+
34
def load_and_clean(file_path="data/WORC_Employment.xlsx"):
45
"""
56
Loads and cleans the WORC Employment dataset.
@@ -18,12 +19,15 @@ def load_and_clean(file_path="data/WORC_Employment.xlsx"):
1819
worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1)
1920

2021
# Clean up data types
21-
worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date'])
22-
worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'], errors='coerce')
22+
worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date']) # noqa
23+
worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'],
24+
errors='coerce')
2325

24-
# Adjust salary that is listed as 60,000 to 28.84 for consistency with other salaries
26+
# Adjust salary that is listed as 60,000 to 28.84 for
27+
# consistency with other salaries
2528
# Took 60,000 / 2080hrs - 28.84
26-
worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84)
29+
worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84) # noqa
2730

2831
worc_clean = worc_cols_dropped
32+
2933
return worc_clean

src/Carmen_WORCEmployment_Plots.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import sys
2-
import os
31
import pandas as pd
42
from Carmen_WORCEmployment import load_and_clean
53
import matplotlib.pyplot as plt
@@ -8,36 +6,46 @@
86

97
def plot_salary_by_gender(data):
    """
    Displays a box plot of the 'Salary' distribution grouped by 'Gender'.

    Args:
        data: pandas.DataFrame with 'Gender' and 'Salary' columns.
    """
    plt.figure(figsize=(8, 5))
    sns.boxplot(data=data,
                x='Gender',
                y='Salary')
    plt.title("Salary Distribution by Gender")
    plt.show()
1414

1515

1616
def plot_avg_salary_by_city(data):
    """
    Displays a horizontal bar chart of the mean 'Salary' per
    'Mailing City', sorted ascending.

    Args:
        data: pandas.DataFrame with 'Mailing City' and 'Salary' columns.
    """
    # NOTE(review): groups by 'Mailing City' but the chart title says
    # "KY Region" — confirm which granularity is intended.
    region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values()
    region_salary.plot(kind='barh',
                       figsize=(8, 5),
                       title="Average Salary by KY Region")
    plt.xlabel("Average Salary")
    plt.show()
2123

2224

2325
def plot_placements_over_time(data):
    """
    Displays a line chart of the number of placements per month,
    indexed by 'Start Date'.

    Args:
        data: pandas.DataFrame with a datetime-like 'Start Date' column.
    """
    # NOTE(review): the 'M' resample alias is deprecated in pandas 2.2+
    # in favour of 'ME' — confirm the pinned pandas version.
    data.set_index('Start Date').resample('M').size().plot(kind='line',
                                                           marker='o',
                                                           figsize=(10, 4))
    plt.title("Number of Placements Over Time")
    plt.ylabel("Placements")
    plt.show()
2832

2933

3034
def plot_placement_type_by_program(data):
    """
    Displays a count plot of 'ATP Placement Type' occurrences, split by
    'Program: Program Name'.

    Args:
        data: pandas.DataFrame with 'ATP Placement Type' and
            'Program: Program Name' columns.
    """
    plt.figure(figsize=(10, 6))
    sns.countplot(data=data,
                  x='ATP Placement Type',
                  hue='Program: Program Name')
    # Rotate category labels so long placement-type names stay legible.
    plt.xticks(rotation=45)
    plt.title("Placement Type by Program")
    plt.show()
3642

3743

3844
def plot_top_cities(data):
    """
    Displays a bar chart of the ten 'Mailing City' values with the most
    participants.

    Args:
        data: pandas.DataFrame with a 'Mailing City' column.
    """
    city_counts = data['Mailing City'].value_counts().head(10)
    city_counts.plot(kind='bar',
                     title='Top Cities by Participant Count',
                     figsize=(8, 4))
    plt.ylabel("Count")
    plt.show()
4351

src/cleaning_enrollments_data.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
import pandas as pd
2+
import numpy as np
3+
4+
5+
class EnrollmentsCleaning:
    """Cleans a raw enrollments export; entry point is Get_clean_data()."""

    def __init__(self, raw_data):
        # raw_data: pandas.DataFrame with the raw enrollments export.
        self.raw_data = raw_data

    def __Drop_columns(self, df):
        """
        Deletes the columns not needed for the analysis; to drop more
        columns, extend the constant 'COLUMNS_TO_DROP'.

        Args:
            df: pandas.DataFrame

        Return:
            pandas.DataFrame
        """
        COLUMNS_TO_DROP = ['Full Name']
        result = df.drop(columns=COLUMNS_TO_DROP)
        return result

    def __Fix_nan_values(self, df):
        """
        Replaces NaN with the placeholder string 'NA' in the date and
        outcome columns.

        Args:
            df: pandas.DataFrame

        Return:
            pandas.DataFrame
        """
        NAN_VALUE_SUBSTITUTE = 'NA'
        columns_to_fix = {
            'Projected Start Date': NAN_VALUE_SUBSTITUTE,
            'Actual Start Date': NAN_VALUE_SUBSTITUTE,
            'Projected End Date': NAN_VALUE_SUBSTITUTE,
            'Actual End Date': NAN_VALUE_SUBSTITUTE,
            'Outcome': NAN_VALUE_SUBSTITUTE
        }
        # 'ATP Cohort' NaNs are handled in a separate step (__Find_cohort).
        for column, substitute_value in columns_to_fix.items():
            df[column] = df[column].fillna(substitute_value)

        return df

    def __Rename_values(self, df):
        """
        Changes values for consistency ('Data Analytics 2' is the same
        service as 'Data Analysis 2').

        Args:
            df: pandas.DataFrame

        Return:
            pandas.DataFrame
        """
        df.loc[df['Service'] == 'Data Analytics 2',
               'Service'] = 'Data Analysis 2'
        return df

    def __Delete_values(self, df):
        """
        Deletes rows whose values are not needed; to delete more values
        change the constant 'VALUES_NOT_NEEDED'.

        Args:
            df: pandas.DataFrame

        Return:
            pandas.DataFrame
        """
        # 'Referral to External Service' and 'Supportive Services Referral'
        # are dropped because they have no "Projected Start Date".
        VALUES_NOT_NEEDED = {
            'Service': ['Software Development 1',
                        'Software Development 2',
                        'Web Development 1', 'Web Development 2',
                        'Data Analysis 1', 'Data Analysis 2',
                        'Referral to External Service',
                        'Supportive Services Referral']
        }
        for column, value in VALUES_NOT_NEEDED.items():
            df = df[~df[column].isin(value)]
        return df

    def __Set_data_types(self, df):
        """
        Casts each column to its working data type.

        Args:
            df: pandas.DataFrame

        Return:
            pandas.DataFrame
        """
        column_datatype: dict = {'Auto Id': str, 'KY Region': str,
                                 'Assessment ID': str, 'EnrollmentId': str,
                                 'Enrollment Service Name': str,
                                 'Service': str,
                                 'Projected Start Date': str,
                                 'Actual Start Date': str,
                                 'Projected End Date': str,
                                 'Actual End Date': str,
                                 'Outcome': str,
                                 'ATP Cohort': 'datetime64[ns]'}
        # TODO: 'Projected Start Date', 'Actual Start Date',
        # 'Projected End Date', 'Actual End Date' are all datetime
        # types but carry the literal placeholder 'NA'.

        # 'dtype' instead of 'type' to avoid shadowing the builtin.
        for column, dtype in column_datatype.items():
            df[column] = df[column].astype(dtype)
        return df

    def __Find_cohort(self, student_id: str,
                      projected_start_date: str,
                      cohort_to_find,
                      df_to_clean: pd.DataFrame):
        """
        Fills a missing 'ATP Cohort' value for one row: the student's
        cohort dates are collected and the cohort immediately preceding
        the row's projected start date is returned. Designed to be used
        with pandas.DataFrame.apply().

        Args:
            student_id: str — value of the row's 'Auto Id'.
            projected_start_date: str — the row's 'Projected Start Date'.
            cohort_to_find: the row's current 'ATP Cohort' value
                (may be NaT).
            df_to_clean: pandas.DataFrame — the full cleaned frame.

        Return:
            numpy.datetime64 (or the single known cohort date).
        """
        # Q: What to do with Service: ['Referral to External Service',
        # 'Supportive Services Referral']
        # TODO: Clean the NaTType before this function runs
        if pd.isna(cohort_to_find):
            student_df = df_to_clean[df_to_clean['Auto Id'] == student_id]
            # Remove rows whose ATP Cohort is itself missing.
            student_df = student_df[~student_df['ATP Cohort'].isna()]
            cohorts_participated = student_df['ATP Cohort'].astype(
                'datetime64[ns]').unique()

            if len(cohorts_participated) == 1:
                return cohorts_participated[0]
            # Insert the estimated module date among the known cohorts and
            # return the cohort right before it in chronological order.
            estimated_module_date = np.datetime64(projected_start_date)
            cohorts_participated = np.append(
                cohorts_participated, estimated_module_date)
            cohorts_participated.sort()
            previous_date = cohorts_participated[0]
            for cohort in cohorts_participated:
                if estimated_module_date == cohort:
                    return previous_date
                # Bug fix: track the walker so the *preceding* cohort is
                # returned, not always the earliest one.
                previous_date = cohort
        else:
            return np.datetime64(cohort_to_find)

    def Get_clean_data(self):
        """
        Runs the full cleaning pipeline over the raw data: drop unused
        columns, substitute NaNs, normalize values, drop unneeded rows,
        cast dtypes, and back-fill missing 'ATP Cohort' dates.

        Return:
            pandas.DataFrame
        """
        df = self.raw_data
        df = self.__Drop_columns(df)
        df = self.__Fix_nan_values(df)
        df = self.__Rename_values(df)
        df = self.__Delete_values(df)
        df = self.__Set_data_types(df)
        df['ATP Cohort'] = df.apply(lambda row: self.__Find_cohort(
            row['Auto Id'],
            row['Projected Start Date'],
            row['ATP Cohort'],
            df), axis=1)
        return df

src/completion_rate_data.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import pandas as pd
2+
3+
4+
class Completion_rate_data:
    """Computes per-module outcome shares from cleaned enrollment data."""

    def __init__(self, data):
        # data: cleaned enrollments DataFrame with 'Service', 'Outcome'
        # and 'ATP Cohort' columns.
        self.data = data
        # Every tracked module: each pathway runs modules M1 through M4.
        tracks = [
            'Web Development',
            'Data Analysis',
            'Software Development',
            'Quality Assurance',
            'User Experience',
        ]
        self.__pathways = [
            f'{track} M{module}'
            for track in tracks
            for module in range(1, 5)
        ]

    # Not the best Pandas way to do it:
    def Get_completion_percentages(self,
                                   cohort: str = 'All cohorts') -> pd.DataFrame:  # noqa
        """
        Builds a pandas.DataFrame with the share of each outcome per
        module, optionally restricted to a single cohort.

        Args:
            cohort: str — 'All cohorts', or a date string accepted by
                pd.Timestamp to filter on 'ATP Cohort'.

        Return:
            pandas.DataFrame — one row per module, one column per
            outcome (0 where an outcome never occurs), plus 'Module'
            and 'Pathway' columns.
        """
        if cohort == 'All cohorts':
            subset = self.data
        else:
            subset = self.data[self.data['ATP Cohort'] == pd.Timestamp(cohort)]

        shares_by_module = {}
        for module in self.__pathways:
            outcome_shares = subset.loc[
                subset['Service'] == module, 'Outcome'].value_counts(
                normalize=True)
            shares_by_module[module] = dict(outcome_shares)

        result_df = (
            pd.DataFrame(shares_by_module)
            .transpose()
            .fillna(0)
            .rename_axis('Module')
            .reset_index()
        )

        # The pathway name is the module name minus its trailing ' M<n>'
        # (kept so callers can sort/group by pathway).
        result_df['Pathway'] = result_df['Module'].apply(
            lambda name: name[:name.rfind(' ')])
        return result_df

    def Get_pathways_name(self, df: pd.DataFrame) -> list:
        """
        Lists every distinct pathway, in order of appearance, from a
        DataFrame produced by self.Get_completion_percentages().

        Args:
            df: pandas.DataFrame

        Return:
            list
        """
        return df['Pathway'].unique().tolist()

0 commit comments

Comments
 (0)