Skip to content
Open

Rui #82

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
4b5e540
Version_1 clean data upload
CharulMathur4592 Dec 8, 2025
7d39a94
Version1 of cleanning data upload
DavyN25 Dec 8, 2025
015e339
version1
ruiparreira75 Dec 8, 2025
089286a
verion1
Dec 8, 2025
c74be28
functions in one file
Dec 9, 2025
2c159b0
Merge pull request #1 from tomapasta/rui
tomapasta Dec 9, 2025
209e661
Merge pull request #2 from tomapasta/charul
tomapasta Dec 9, 2025
fd069bb
Merge pull request #3 from tomapasta/Davy
tomapasta Dec 9, 2025
87e4ee0
Merge pull request #4 from tomapasta/Ako
tomapasta Dec 9, 2025
309bbbb
update
DavyN25 Dec 9, 2025
2e0a4fe
Merge pull request #5 from tomapasta/Davy
tomapasta Dec 9, 2025
77ba04b
"adding readme and rui_clean"
ruiparreira75 Dec 9, 2025
e76d205
update day2
DavyN25 Dec 9, 2025
ed7b407
Version2 clean data with graphs
CharulMathur4592 Dec 9, 2025
01300d8
day1
Dec 9, 2025
27a348c
rui day 2 update and readme
ruiparreira75 Dec 9, 2025
a13616e
Merge pull request #6 from tomapasta/Ako
tomapasta Dec 9, 2025
370cdc3
Merge pull request #7 from tomapasta/Davy
tomapasta Dec 9, 2025
c2a0859
Merge pull request #8 from tomapasta/charul
tomapasta Dec 9, 2025
f567651
Merge pull request #9 from tomapasta/rui
tomapasta Dec 9, 2025
e6a3c33
rui_push
ruiparreira75 Dec 10, 2025
d45a39a
added 2 files - charts
CharulMathur4592 Dec 10, 2025
7388695
Merge branch 'main' into charul
CharulMathur4592 Dec 10, 2025
72c137c
update
DavyN25 Dec 10, 2025
4a289d3
day3
ruiparreira75 Dec 10, 2025
e28349c
Day3
Dec 10, 2025
49fd15a
update day3
DavyN25 Dec 10, 2025
6b99ba6
updated name
DavyN25 Dec 10, 2025
f7261e5
Merge pull request #10 from tomapasta/rui
tomapasta Dec 10, 2025
4ef5ab6
Merge pull request #11 from tomapasta/Davy
tomapasta Dec 10, 2025
a461483
Merge pull request #12 from tomapasta/Ako
tomapasta Dec 10, 2025
d6c3935
rui_again
ruiparreira75 Dec 11, 2025
16be409
Merge branch 'main' into rui
ruiparreira75 Dec 11, 2025
a0af853
clean ako file
CharulMathur4592 Dec 11, 2025
cff2b91
Testing
CharulMathur4592 Dec 11, 2025
e62124c
Merge pull request #13 from tomapasta/charul
CharulMathur4592 Dec 11, 2025
bdbf6c4
Updated .gitignore file
ruiparreira75 Dec 11, 2025
b3266d5
Merge branch 'main' into rui
ruiparreira75 Dec 11, 2025
096df5c
Merge pull request #14 from tomapasta/rui
ruiparreira75 Dec 11, 2025
c469cf8
day4
ruiparreira75 Dec 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ notebooks/.env
notebooks/.DS_Store
.DS_Store
*.in
notebooks/__pycache__/clean_string_content_regex.cpython-313.pyc
.virtual_documents/
1 change: 1 addition & 0 deletions .virtual_documents/Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

78 changes: 78 additions & 0 deletions .virtual_documents/notebooks/Clean_data_Davy.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import numpy as np
import pandas as pd
import yaml


# Load the project configuration from the YAML file one directory up.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Later cells index into `config`; swallowing the error here would only
    # resurface as a confusing NameError further down, so report and re-raise.
    # Other failures (e.g. malformed YAML) now propagate with their real
    # message instead of being mislabelled as "not found".
    print("Configuration file not found!")
    raise



# Read the CSV whose path is stored under config['input_data']['file'] and
# keep `df` untouched by doing all further work on a copy.
df = pd.read_csv(config['input_data']['file'])
davy_df = df.copy()
davy_df


# Column names, dtypes and non-null counts.
davy_df.info()





# Count of rows flagged as duplicates of another row.
davy_df.duplicated().sum()


# True if at least one duplicated row exists.
davy_df.duplicated().any()


# NOTE(review): identical to the previous check — looks like a repeated cell.
davy_df.duplicated().any()


# `duplicate` is a project-local module that is not visible here.
import duplicate
# Remove duplicate rows
# NOTE(review): presumably returns a de-duplicated frame keeping the first
# occurrence — confirm against duplicate.remove_duplicate_rows. The cells that
# follow keep operating on `davy_df`, not on `davy_df_clean`.
davy_df_clean = duplicate.remove_duplicate_rows(davy_df, keep="first")
display(davy_df_clean)


# Make the column names SQL-friendly: trim, lower-case, turn spaces into
# underscores, then drop every character that is not a letter, digit or "_".
trimmed_lower = davy_df.columns.str.strip().str.lower()
underscored = trimmed_lower.str.replace(" ", "_")
davy_df.columns = underscored.str.replace(r"[^a-zA-Z0-9_]", "", regex=True)


# Outlier handling: summary statistics, then one box plot per numeric column.
import matplotlib.pyplot as plt
import seaborn as sns
from box_plot import box_plot


davy_df.describe()


# (column, plot title) pairs. Column names are the normalised ones, which is
# why the revenue column carries a trailing underscore.
box_plot_specs = [
    ("revenue_millions_", "revenue in millions dollars"),
    ("players_millions", "number of players"),
    ("metacritic_score", "the metacritic score"),
]

# One call per column instead of three copy-pasted cells. As before,
# `davy_df_outliers` ends up holding the result of the last call.
for column, title in box_plot_specs:
    davy_df_outliers = box_plot(davy_df, column=column, title=title, color="skyblue")



47 changes: 47 additions & 0 deletions .virtual_documents/notebooks/Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pandas as pd
import yaml


# Load the project configuration from the YAML file one directory up.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # The next cell indexes into `config`; continuing without it would only
    # fail later with a NameError, so report the missing file and re-raise.
    # Other failures (e.g. malformed YAML) now propagate with their real
    # message instead of being mislabelled as "not found".
    print("Configuration file not found")
    raise


# Load the dataset whose path is given in the configuration.
df = pd.read_csv(config['input_data']['file'])
df


# Null-value overview: cell-level mask first ...
df.isnull()


# ... then a per-column flag ...
df.isna().any()


# ... and finally the per-column count of missing values.
df.isna().sum()


# Drop the 'Peak Concurrent Players' column in place.
df.drop(columns='Peak Concurrent Players', inplace=True)
df.head()


# Drop the 'Trending Status' column in place.
df.drop(columns='Trending Status', inplace=True)
df.head()


import rename_columns

# `import rename_columns` binds only the module name, so the helper must be
# reached through it; the previous bare call raised NameError.
# NOTE(review): assumes rename_multiple_columns lives in the project-local
# rename_columns module — confirm.
# NOTE(review): `column_mapping` is not defined in any visible cell; it has to
# be defined (presumably an old-name -> new-name mapping) before this runs.
clean_columns_df = rename_columns.rename_multiple_columns(df, column_mapping)



52 changes: 52 additions & 0 deletions .virtual_documents/notebooks/data_clean_rui.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pandas as pd

# CSV export link of the Google Sheet that holds the raw data.
url = 'https://docs.google.com/spreadsheets/d/18wyCTVAbd4W91vcGURS0P5ax-dI93iq2w7-s7wj98bs/export?format=csv&gid=1823224578'
# Read through `url` instead of repeating the literal, so the data source is
# defined in exactly one place.
df = pd.read_csv(url)



df.head()


df.dtypes


# Convert the revenue column to float.
df['Revenue (Millions $)'] = df['Revenue (Millions $)'].astype(float)


# Re-check the dtypes after the conversion.
df.dtypes





import re


def clean_string_content(text: object) -> object:
    """Return a normalised, lower-case version of *text*.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Every other value is converted to ``str``, lower-cased, has
    each run of characters other than ``a-z``, ``0-9``, whitespace and ``.``
    replaced by a single space, has whitespace runs collapsed to one space,
    and is stripped of leading and trailing whitespace.
    """
    if pd.isna(text):
        return text
    text = str(text)
    text = text.lower()
    # Punctuation and symbols become spaces; the dot is kept.
    text = re.sub(r'[^a-z0-9\s.]+', ' ', text)
    # The substitution above can leave several spaces in a row.
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text



# Clean only the text (object-dtype) columns. Running clean_string_content
# over the whole frame would pass every numeric cell through str(), turning
# numeric columns such as the float revenue column into strings.
text_cols = df.select_dtypes(include="object").columns


# DataFrame.map is the element-wise replacement for the deprecated
# DataFrame.applymap (pandas >= 2.1).
df[text_cols] = df[text_cols].map(clean_string_content)



df.head()




Loading