-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessor.py
More file actions
48 lines (38 loc) · 1.69 KB
/
preprocessor.py
File metadata and controls
48 lines (38 loc) · 1.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
def prepare_dataframe(df):
    """Prepare a movie DataFrame for the preprocessor.

    The frame is modified in place and the same object is returned, so
    callers can use either the return value or the frame they passed in.

    Steps performed:
      * strip surrounding whitespace from the column names;
      * parse ``release_date`` as datetime (unparseable values become NaT);
      * derive ``release_month`` and ``release_year`` from ``release_date``
        (these are missing wherever the date could not be parsed);
      * replace missing values in ``overview``, ``genre`` and ``cast``
        with the empty string.

    Args:
        df: DataFrame that contains, after the column names are stripped,
            the columns ``release_date``, ``overview``, ``genre`` and
            ``cast``.

    Returns:
        The same DataFrame object, with the changes above applied.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    # Headers may carry stray spaces (e.g. "genre "); normalise them first
    # so the column lookups below succeed.
    df.columns = df.columns.str.strip()

    # Make sure 'release_date' is a datetime; errors='coerce' turns bad
    # values into NaT instead of raising.
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

    # Add month and year as separate features.
    df['release_month'] = df['release_date'].dt.month
    df['release_year'] = df['release_date'].dt.year

    # Fill missing text fields with an empty string.
    for col in ['overview', 'genre', 'cast']:
        df[col] = df[col].fillna('')

    return df
# NOTE(review): everything below runs at import time, so importing this
# module re-reads the CSV, refits the preprocessor and overwrites
# 'preprocessor.joblib'. Left as-is because other code may rely on the
# module-level names `df` and `preprocessor`.

# === Step 1: load the data ===
df = pd.read_csv("imdb_movies_schoon.csv", skipinitialspace=True)

# === Step 2: pre-process ===
df = prepare_dataframe(df)

# === Step 3: configure the preprocessor ===
preprocessor = ColumnTransformer(
    transformers=[
        # NOTE(review): the original comment here described tripling the
        # budget through repetition, but "budget" is listed only once in
        # the column list below - confirm which of the two is intended.
        ("num", StandardScaler(), ["budget", "release_month", "release_year"]),
        ("cat", OneHotEncoder(handle_unknown="ignore"), ["country"]),
        # The text vectorizers get a single column name (a string, not a
        # list), unlike the list-valued selectors above.
        ("genre_text", TfidfVectorizer(max_features=200), "genre"),
        ("cast_text", TfidfVectorizer(max_features=1000), "cast"),
    ]
)

# === Step 4: fit the preprocessor and persist it ===
preprocessor.fit(df)
joblib.dump(preprocessor, "preprocessor.joblib")
print("Preprocessor is opgeslagen als 'preprocessor.joblib'.")

# Report which transformer is applied to which column(s).
print("Details van de preprocessor:")
for name, transformer, columns in preprocessor.transformers:
    print(f"- Transformer '{name}': {transformer.__class__.__name__} op kolom(men): {columns}")