isg75 · tomapasta · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,5 @@ notebooks/.env
 notebooks/.DS_Store
 .DS_Store
 *.in
+notebooks/__pycache__/
+.virtual_documents/
diff --git a/.virtual_documents/Untitled.ipynb b/.virtual_documents/Untitled.ipynb
@@ -0,0 +1 @@
+
diff --git a/.virtual_documents/notebooks/Clean_data_Davy.ipynb b/.virtual_documents/notebooks/Clean_data_Davy.ipynb
@@ -0,0 +1,78 @@
+import numpy as np
+import pandas as pd
+import yaml
+
+
+try:
+    with open("../config.yaml", "r") as file:
+        config = yaml.safe_load(file)
+except FileNotFoundError:  # only a missing file is "not found"; YAML/permission errors should surface
+    print("Configuration file not found!")
+
+
+
+df = pd.read_csv(config['input_data']['file'])
+davy_df = df.copy()
+davy_df
+
+
+davy_df.info()
+
+
+
+
+
+davy_df.duplicated().sum()
+
+
+davy_df.duplicated().any()
+
+
+davy_df.duplicated().any()
+
+
+import duplicate
+# Remove duplicate rows (NOTE(review): the result is stored in davy_df_clean, but the later cells keep using davy_df)
+davy_df_clean = duplicate.remove_duplicate_rows(davy_df, keep="first")
+display(davy_df_clean)
+
+
+# Standardise (normalise) column names for SQL
+davy_df.columns = (
+    davy_df.columns
+        .str.strip()
+        .str.lower()
+        .str.replace(" ", "_")
+        .str.replace(r"[^a-zA-Z0-9_]", "", regex = True)
+)
+
+
+# outlier handling 
+
+
+
+davy_df.describe()
+
+
+davy_df.describe()
+
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+from box_plot import box_plot
+davy_df_outliers = box_plot(davy_df, column ="revenue_millions_", title="revenue in millions dollars", color="skyblue")
+
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+from box_plot import box_plot
+davy_df_outliers = box_plot(davy_df, column ="players_millions", title="number of players", color="skyblue")
+
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+from box_plot import box_plot
+davy_df_outliers = box_plot(davy_df, column ="metacritic_score", title="the metacritic score", color="skyblue")
+
+
+
diff --git a/.virtual_documents/notebooks/Untitled.ipynb b/.virtual_documents/notebooks/Untitled.ipynb
@@ -0,0 +1,47 @@
+import pandas as pd
+import yaml
+
+
+try:
+    with open("../config.yaml", "r") as file:
+        config = yaml.safe_load(file)
+except FileNotFoundError:  # only a missing file is "not found"; YAML/permission errors should surface
+    print("Configuration file not found")
+
+
+df = pd.read_csv(config['input_data']['file'])
+df
+
+
+# # Checking for Null Values
+
+df.isnull()
+
+
+# # Check for null values in each column
+
+df.isna().any()
+
+
+# # Count the number of null values in each column
+
+df.isna().sum()
+
+
+# Drop column 'Peak Concurrent Players'
+
+df.drop('Peak Concurrent Players', axis=1, inplace=True)
+df.head()
+
+
+# # Drop column 'Trending Status'
+
+df.drop('Trending Status', axis=1, inplace=True)
+df.head()
+
+
+import rename_columns
+clean_columns_df = rename_columns.rename_multiple_columns(df, column_mapping)  # NOTE(review): column_mapping is not defined in the visible cells - confirm
+
+
+
diff --git a/.virtual_documents/notebooks/data_clean_rui.ipynb b/.virtual_documents/notebooks/data_clean_rui.ipynb
@@ -0,0 +1,52 @@
+import pandas as pd
+url = 'https://docs.google.com/spreadsheets/d/18wyCTVAbd4W91vcGURS0P5ax-dI93iq2w7-s7wj98bs/export?format=csv&gid=1823224578'
+df = pd.read_csv(url)
+
+
+
+df.head()
+
+
+df.dtypes
+
+
+# converting to correct type
+df['Revenue (Millions $)'] = df['Revenue (Millions $)'].astype(float)
+
+
+df.dtypes
+
+
+
+
+
+import re
+
+
+def clean_string_content(text):
+    """Normalise a cell to lower-case text made of a-z, 0-9, '.' and single spaces; values pd.isna() flags pass through."""
+    if pd.isna(text):
+        return text
+    lowered = str(text).lower()
+    # Each run of disallowed characters becomes one space so neighbouring words stay separated.
+    spaced = re.sub(r'[^a-z0-9\s.]+', ' ', lowered)
+    collapsed = re.sub(r'\s+', ' ', spaced)
+    return collapsed.strip()
+
+
+
+text_cols = df.select_dtypes(include="object").columns
+
+
+df[text_cols] = df[text_cols].applymap(clean_string_content)
+
+
+# Only the text columns are cleaned: running clean_string_content over the whole frame would str() the numeric columns as well.
+
+
+
+df.head()
+
+
+
+