Draft

Changes from all commits

62 commits
06ff5b4
Update README.md
EED85 Mar 7, 2025
17aff6f
add selenium
EED85 Mar 23, 2025
7f43595
add soup
EED85 Mar 23, 2025
46827bf
add requests
EED85 Mar 23, 2025
229670c
uv add cryptography
EED85 Mar 23, 2025
f27ff95
Feature/es 275 2503 verschlusselung hinzufugen (#37)
EED85 Mar 23, 2025
af2c248
Merge branch 'release' into feature/ES-274-lorano-puls-scrapping
EED85 Mar 23, 2025
8b58f9a
first rough attempt
EED85 Mar 23, 2025
c939f40
Merge branch 'feature/ES-274-lorano-puls-scrapping' of https://github…
EED85 Mar 23, 2025
697f701
URL encrypted in the config
EED85 Mar 23, 2025
0352467
Update test_encrypt_decrypt.py (#39)
EED85 Mar 24, 2025
a99038b
new duckdb utils
EED85 Mar 26, 2025
f18f645
fix pytest bug
EED85 Mar 26, 2025
92a4be1
test parametrized
EED85 Mar 28, 2025
1665360
return tbl_name as well
EED85 Mar 28, 2025
bafa6ea
add tbl_add_primary_key
EED85 Mar 28, 2025
14020fc
tests simplified
EED85 Mar 29, 2025
9a4466b
encryption functions moved to their own .py file
EED85 Mar 29, 2025
c25b36b
all pytests run locally
EED85 Mar 29, 2025
0261fb1
add methods to encrypt and decrypt files
EED85 Mar 29, 2025
689ba64
locally it works, saving the webpage to a duckdb
EED85 Mar 29, 2025
ef3d6ec
Merge branch 'main' into release
EED85 Mar 29, 2025
c2155ee
Merge branch 'release' into feature/ES-274-lorano-puls-scrapping
EED85 Mar 29, 2025
e4ba9c4
try running it in GitHub Actions
EED85 Mar 29, 2025
9112a78
rename folder
EED85 Mar 29, 2025
059a508
folder now lower cased
EED85 Mar 29, 2025
668eeb1
Update daily_scrapping_pollenvorhersage.yml
EED85 Mar 30, 2025
7d7c8e8
Update daily_scrapping_pollenvorhersage.yml
EED85 Mar 30, 2025
dbe15f4
Update daily_scrapping_pollenvorhersage.yml
EED85 Mar 30, 2025
326b71f
add password and salt secrets
EED85 Mar 30, 2025
d8846f2
code backup
EED85 Apr 6, 2025
20f4c3c
Merge branch 'main' into release
EED85 Apr 6, 2025
571f3b4
Merge branch 'release' into feature/ES-274-lorano-puls-scrapping
EED85 Apr 6, 2025
3194e4c
ES-280 - also for pollenvorhersage
EED85 Apr 6, 2025
0db727a
Merge branch 'main' into release
EED85 Apr 6, 2025
132100e
Merge branch 'release' into feature/ES-274-lorano-puls-scrapping
EED85 Apr 6, 2025
a525344
Merge branch 'main' into release
EED85 Apr 6, 2025
89aa537
Merge branch 'release' into feature/ES-274-lorano-puls-scrapping
EED85 Apr 6, 2025
8cc7e12
Merge branch 'main' into release
EED85 Apr 8, 2025
21e220c
Merge branch 'release' into feature/ES-274-lorano-puls-scrapping
EED85 Apr 8, 2025
4ed80b6
Improvement/es 277 2504 change how env works for webscrapping (#47)
EED85 Apr 8, 2025
bf4a79a
Improvement/es 284 2504 weitere plz hinzufugen (#49)
EED85 Apr 9, 2025
33ecb13
works fine (#50)
EED85 Apr 11, 2025
fd49ba7
sleep -> sleep_random
EED85 Apr 13, 2025
0301f0c
get_table_def works fine (#51)
EED85 Apr 13, 2025
2687278
get_table_def integrated
EED85 Apr 13, 2025
99c1ce6
integrated pollenflug_vorhersage
EED85 Apr 13, 2025
9d98b2c
first draft
EED85 Apr 13, 2025
379db1b
Merge branch 'main' into feature/ES-286-2504-pollenflug-intensitat-au…
EED85 Apr 13, 2025
59716b9
fix pre-commit checks
EED85 Apr 14, 2025
389f8fb
Merge remote-tracking branch 'origin/main' into feature/ES-286-2504-p…
EED85 Apr 14, 2025
9a44238
add dependencies for polars and date functions
EED85 Apr 14, 2025
64c32f9
use in memory database
EED85 Apr 14, 2025
1d4ed51
first draft for extraction of values
EED85 Apr 14, 2025
252f1c0
pytests are working
EED85 Apr 14, 2025
9c8c398
Improvement/es 291 2504 update decrypted testpage (#56)
EED85 Apr 15, 2025
e6f11f4
Merge branch 'release' into feature/ES-286-2504-pollenflug-intensitat…
EED85 Apr 15, 2025
c8ad839
added function and tests for polars DataFrame creation function
EED85 Apr 15, 2025
5a36ca2
implement decode string - first draft
EED85 Apr 17, 2025
ee7aa1b
save code
EED85 Apr 17, 2025
e737275
decode string works for ä/ß
EED85 Apr 26, 2025
303a615
checkpoint
EED85 Apr 27, 2025
1 change: 1 addition & 0 deletions README.md
@@ -56,3 +56,4 @@ runs daily main.py and scrapes data
#### Development
add if: github.ref == 'refs/heads/master' # TODO for development issues - remove before merging into master
before the step "Checkout latest release", for development purposes

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -12,6 +12,9 @@ dependencies = [
"cryptography>=44.0.2",
"duckdb>=1.2.0",
"gitpython>=3.1.44",
"polars>=1.27.1",
"pyarrow>=19.0.1",
"python-dateutil>=2.9.0.post0",
"python-dotenv>=1.0.1",
"pyyaml>=6.0.2",
"requests>=2.32.3",
6 changes: 5 additions & 1 deletion src/eed_webscrapping_scripts/modules/__init__.py
@@ -23,13 +23,17 @@
"add_primary_key",
# eed_utils
"sleep_random",
"decode_string",
# polar_utils
"create_table_from_lists",
]
from .duckdb_utils import (
add_primary_key,
check_if_primary_key_exists,
get_db_schema_tbl_from_table_name,
)
from .eed_utils import sleep_random
from .eed_polars_utils import create_table_from_lists
from .eed_utils import decode_string, sleep_random
from .encryption_utils import (
decrypt,
decrypt_direct,
48 changes: 48 additions & 0 deletions src/eed_webscrapping_scripts/modules/eed_polars_utils.py
@@ -0,0 +1,48 @@
import polars as pl


def create_table_from_lists(
column_names: list,
values: list,
    row_indices: list | None = None,
row_indices_column_name: str = "index",
):
"""
Creates a table with the given column names, row indices, and values using polars.
Notice that the values are reshaped into a 2D list based on the number of columns.

Parameters:
column_names (list): List of column names.
row_indices (list): List of row indices.
values (list): List of values to populate the table.

Returns:
pl.DataFrame: The resulting table as a polars DataFrame.
Example:
column_names = ["COL A", "COL B"]
row_indices = ["1", "2", "3"]
values = ["l", "l", "m", "h", "m", "h"]
df = create_table(column_names, row_indices, values)

shape: (3, 2)
┌───────┬───────┐
│ COL A ┆ COL B │
│ --- ┆ --- │
│ str ┆ str │
╞═══════╪═══════╡
│ l ┆ l │
│ m ┆ h │
│ m ┆ h │
└───────┴───────┘

"""
# Reshape the values list into a 2D list
reshaped_values = [
values[i : i + len(column_names)] for i in range(0, len(values), len(column_names))
]
    # Create the DataFrame; orient="row" is explicit so polars does not have to
    # infer the orientation (inference can guess wrong for square inputs)
    df = pl.DataFrame(reshaped_values, schema=column_names, orient="row")
# Add the row indices as a new column
if row_indices is not None:
df = df.with_columns(pl.Series(row_indices_column_name, row_indices))
return df
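For reference, a minimal usage sketch of the new helper (an editor's addition, not part of the diff). The flat values list is consumed row by row, and row_indices becomes a trailing column; the import path follows the re-exports added to modules/__init__.py above:

# Editor's sketch, not part of the PR: exercising create_table_from_lists.
from eed_webscrapping_scripts.modules import create_table_from_lists

df = create_table_from_lists(
    column_names=["2025_04_14", "2025_04_15"],
    values=["keine Belastung", "starke Belastung", "mittlere Belastung", "schwache Belastung"],
    row_indices=["Gräser", "Birke"],
    row_indices_column_name="pollenart",
)
print(df)  # shape (2, 3): one row per pollenart, one column per date, plus "pollenart"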
16 changes: 16 additions & 0 deletions src/eed_webscrapping_scripts/modules/eed_utils.py
@@ -23,3 +23,19 @@ def sleep_random(
sleep_in_seconds = round(sleep_in_seconds, ndigits)
time.sleep(sleep_in_seconds)
return sleep_in_seconds


def decode_string(encoded_string: str, encode: str = "latin1", decode: str = "utf-8") -> str:
"""
Decodes an encoded string from Latin-1 to UTF-8.

Parameters:
encoded_string (str): The encoded string to decode.

Returns:
str: The decoded string.
"""
decoded_string = encoded_string.encode(encode).decode(decode)
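    # Presumably the scraped pages can also contain escape sequences as literal
    # text (e.g. the characters "\xc3\x9f"); map the common German cases directly.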
if encode == "latin1" and decode == "utf-8":
decoded_string = decoded_string.replace(r"\xc3\x9f", "ß").replace(r"\xc3\xa4", "ä")
return decoded_string
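A quick illustration of what decode_string repairs (an editor's sketch, not part of the diff): text that was UTF-8 on the wire but got decoded as Latin-1 arrives as mojibake, and the round trip through latin1 bytes recovers it:

# Editor's sketch, not part of the PR.
broken = "Gräser".encode("utf-8").decode("latin1")  # mojibake: 'GrÃ¤ser'
assert decode_string(broken) == "Gräser"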
7 changes: 3 additions & 4 deletions src/eed_webscrapping_scripts/modules/modules.py
@@ -44,14 +44,14 @@ def read_sql_file(path_to_file: str, git_root: str = None) -> str:
return sql


def connect_to_db(cfg=None):
def connect_to_db(cfg=None, database_name: str = "dwd"):
"""Connects to Motherduck database.
Needs Github secret ``MD_TOKEN`` defined, if used in github Actions.
Needs ``.motherduck_token`` file in your home directory.
Returns:
DuckDB / Motherduck connection:
"""
if cfg["env"]["_ENVIRONMENT_"] == "PROD": # TODO: ES-282 DWD anpasse
if cfg["env"]["_ENVIRONMENT_"] == "PROD":
try:
with open(os.path.join(home_dir, ".motherduck_token")) as f:
md_token = f.read()
@@ -62,8 +62,7 @@ def connect_to_db(cfg=None):
print("Connected to Motherduck")
else:
con = duckdb.connect()
con.sql("ATTACH ':memory:' AS dwd;")
con.sql("USE dwd")
con.sql(f"ATTACH ':memory:' AS {database_name};")
print("Connected to in memory duckdb database")
return con

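Note that the USE statement is gone: in non-PROD runs the in-memory database is attached under database_name but no longer made the default, so callers are expected to select it themselves, as prepare_db below does. A minimal local-usage sketch (editor's, not part of the diff; the cfg literal is an assumption):

# Editor's sketch, not part of the PR.
cfg = {"env": {"_ENVIRONMENT_": "DEV"}}  # assumption: any non-PROD value takes the local branch
con = connect_to_db(cfg, database_name="pollenvorhersage")
con.sql("USE pollenvorhersage")  # callers now select the database themselves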
3 changes: 2 additions & 1 deletion src/eed_webscrapping_scripts/pollenvorhersage/__init__.py
@@ -3,7 +3,8 @@
"open_webpage_and_select_plz",
"prepare_db",
"upload_webpage_to_db",
"download_wepages",
]

from .db import prepare_db
from .utils import get_config, open_webpage_and_select_plz, upload_webpage_to_db
from .utils import download_wepages, get_config, open_webpage_and_select_plz, upload_webpage_to_db
13 changes: 11 additions & 2 deletions src/eed_webscrapping_scripts/pollenvorhersage/config.yaml
@@ -22,8 +22,17 @@ pollenvorhersage:
primary_key:
- file
- last_modified_date
information_layer: "information_layer"

information_layer:
name: "information_layer"
tables:
pollenflug_vorhersage:
name: pollenflug_vorhersage
path: ""
primary_key:
- last_update_dt
- plz
- pollenart
- date
git_root: ""
env:
_EXECUTION_ENVIRONMENT_: ""
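get_table_definition itself is not part of this diff. Judging from its call sites in db.py and utils.py below, it presumably resolves a table entry from this config into a dict with at least name, path, and primary_key, deriving path as schema.table when the config leaves it empty. A hypothetical sketch, assuming the datalake section mirrors the information_layer block shown here:

# Hypothetical sketch by the editor; the real implementation is not in this diff.
def get_table_definition(cfg: dict, table_name: str, schema_name: str = "datalake") -> dict:
    schema_cfg = cfg["pollenvorhersage"][schema_name]
    table = dict(schema_cfg["tables"][table_name])
    if not table.get("path"):
        # config.yaml leaves path empty; derive the fully qualified name
        table["path"] = f'{schema_cfg["name"]}.{table["name"]}'
    return table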
18 changes: 9 additions & 9 deletions src/eed_webscrapping_scripts/pollenvorhersage/db.py
@@ -1,4 +1,4 @@
from eed_webscrapping_scripts.modules import connect_to_db
from eed_webscrapping_scripts.modules import connect_to_db, get_table_definition
from eed_webscrapping_scripts.modules.duckdb_utils import add_primary_key


@@ -14,11 +14,9 @@ def prepare_db(cfg, con=None):
    Useful if ``con`` has been omitted from the input args.
"""

con = con or connect_to_db(cfg)
con = con or connect_to_db(cfg, database_name="pollenvorhersage")
if cfg["env"]["_ENVIRONMENT_"] == "PROD":
con.sql("CREATE DATABASE IF NOT EXISTS pollenvorhersage")
else:
con.sql("ATTACH IF NOT EXISTS 'pollenvorhersage.duckdb'")
con.sql("USE pollenvorhersage")
con.sql("CREATE SCHEMA IF NOT EXISTS datalake")
con.sql("CREATE SCHEMA IF NOT EXISTS information_layer")
@@ -30,10 +28,12 @@
)
"""
)

table_pollenflug_vorhersage = get_table_definition(
cfg=cfg, schema_name="information_layer", table_name="pollenflug_vorhersage"
)
con.sql(
"""
CREATE TABLE IF NOT EXISTS information_layer.pollenflug_vorhersage(
f"""
CREATE TABLE IF NOT EXISTS {table_pollenflug_vorhersage["path"]}(
table_name VARCHAR
, last_update TIMESTAMP
, last_update_dt DATE
Expand All @@ -46,8 +46,8 @@ def prepare_db(cfg, con=None):
)

add_primary_key(
table_name="information_layer.pollenflug_vorhersage",
primary_key=("last_update_dt", "plz", "pollenart", "date"),
table_name=table_pollenflug_vorhersage["path"],
primary_key=table_pollenflug_vorhersage["primary_key"],
con=con,
if_exists="pass",
)
83 changes: 73 additions & 10 deletions src/eed_webscrapping_scripts/pollenvorhersage/pollenvorhersage.py
@@ -1,14 +1,22 @@
import re
from datetime import datetime
from pathlib import Path

import polars as pl
from bs4 import BeautifulSoup
from dateutil.parser import parse
from selenium import webdriver

from eed_webscrapping_scripts.modules import (
ask_user_for_local_production_run,
create_table_from_lists,
decode_string,
decrypt_direct,
decrypt_file,
save_webpage,
)
from eed_webscrapping_scripts.pollenvorhersage import (
download_wepages,
get_config,
open_webpage_and_select_plz,
prepare_db,
@@ -52,17 +60,7 @@ def fetch_and_store_html(self):
upload_webpage_to_db(con, file, plz, cfg)
print("upladed")

# Enter the value into the search box

# soup.find_all('img', {'title': True})
# soup.find_all(class_='datum')
# soup.find_all(class_='tooltip')

# Wait for the data to load and scrape the data
# Add your scraping logic here

# clean up

match cfg["env"]["_ENVIRONMENT_"]:
case "PROD":
driver.quit()
@@ -73,6 +71,71 @@ def fetch_and_store_html(self):
print("END")
return con

def extract_pollenvorhersage(self):
cfg = self.cfg
con = self.con
webpages = download_wepages(cfg=cfg, con=con)
print(webpages)

current_date = datetime.now().date()
for i in range(len(webpages)):
content = webpages["content"][i]
soup = BeautifulSoup(content, "html.parser")
dates_to_extract = soup.find_all(class_="datum")
dates = list(range(len(dates_to_extract)))
for j in range(len(dates_to_extract)):
date_to_extract = dates_to_extract[j].get_text(strip=True)
                dd_mm = re.search(r"\d{2}\.\d{2}", date_to_extract)[0]
parsed_date = parse(f"""{dd_mm}.{current_date.year}""", dayfirst=True).date()
dates[j] = parsed_date
dates_str = [date.strftime("%Y_%m_%d") for date in dates]
soup_pollenarten = soup.find_all(class_="tooltip")
pollenarten = list(range(len(soup_pollenarten)))
for k in range(len(soup_pollenarten)):
pollenart = soup_pollenarten[k].find(class_="tooltiptext").find("img")["alt"]
pollenarten[k] = decode_string(pollenart)

soup_belastungen = soup.find_all("img", {"title": True})
belastungen = [
belastung["title"]
for belastung in soup_belastungen[: (len(dates) * len(soup_pollenarten))]
]

df = create_table_from_lists(
column_names=dates_str,
values=belastungen,
row_indices=pollenarten,
row_indices_column_name="pollenart",
)
df = df.unpivot(index="pollenart")

            # TODO: Encoding using polars does not work yet, the normal function works
df = df.with_columns(
pl.col("pollenart")
.map_elements(lambda x: decode_string(x))
.alias("pollenart_decoded")
)

print(len(df))
mapping = {
"keine Belastung": 0,
"schwache Belastung": 1,
"mittlere Belastung": 2,
"starke Belastung": 3,
}
con.sql(f"""
-- CREATE OR REPLACE TEMP TABLE pollenflug AS
SELECT
* EXCLUDE(variable)
, strptime(variable, '%Y_%m_%d')::DATE AS date
, MAP {str(mapping)}[value] AS belastung
FROM df
WHERE pollenart = 'Gräser'
ORDER BY pollenart, date
""")
# Wait for the data to load and scrape the data
# Add your scraping logic here
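Two notes on this draft (editor's). The SQL above leans on DuckDB's MAP literal, MAP {'keine Belastung': 0, ...}[value], to turn the scraped title text into an intensity score. And for the TODO about encoding: a hedged polars-only sketch, assuming polars >= 1.x as pinned in pyproject.toml, would pass an explicit return_dtype to map_elements and apply the mapping with replace_strict (which raises on titles missing from the mapping):

# Editor's sketch, not part of the PR.
df = df.with_columns(
    pl.col("pollenart")
    .map_elements(decode_string, return_dtype=pl.String)
    .alias("pollenart_decoded"),
    pl.col("value")
    .replace_strict(mapping, return_dtype=pl.Int8)  # assumes every title is in the mapping
    .alias("belastung"),
)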


if __name__ == "__main__":
pollenvorhersage_handler = PollenvorhersageHandler()
47 changes: 42 additions & 5 deletions src/eed_webscrapping_scripts/pollenvorhersage/utils.py
@@ -9,6 +9,7 @@
add_primary_key,
get_environment,
get_git_root,
get_table_definition,
sleep_random,
)

@@ -60,17 +61,53 @@ def upload_webpage_to_db(con, file, plz, cfg: dict, table: str = "_webpage_"):
FROM read_text('{str(file)}')
WHERE TRUE
""")

con.sql(f"""CREATE TABLE IF NOT EXISTS datalake.webpages AS SELECT * FROM {table} LIMIT 0""")
table_webpages = get_table_definition(cfg=cfg, table_name="webpages")
con.sql(
f"""CREATE TABLE IF NOT EXISTS {table_webpages["path"]} AS SELECT * FROM {table} LIMIT 0"""
)
add_primary_key(
table_name="datalake.webpages",
primary_key=("file", "last_modified_date"),
table_name=table_webpages["path"],
primary_key=table_webpages["primary_key"],
con=con,
if_exists="pass",
)
con.sql(f"""
INSERT OR IGNORE INTO datalake.webpages
INSERT OR IGNORE INTO {table_webpages["path"]}
SELECT * FROM {table}
""")

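The load pattern above (create the target from the staging table's shape with LIMIT 0, ensure a primary key, then INSERT OR IGNORE) makes the upload idempotent: re-running with the same file and last_modified_date is a no-op. A self-contained DuckDB illustration of the idea (editor's sketch, not the project's code):

# Editor's sketch, not part of the PR: the idempotent-load pattern in isolation.
import duckdb

con = duckdb.connect()
con.sql("""
    CREATE TABLE webpages (
        file VARCHAR,
        last_modified_date DATE,
        PRIMARY KEY (file, last_modified_date)
    )
""")
con.sql("INSERT OR IGNORE INTO webpages VALUES ('a.html', DATE '2025-04-14')")
con.sql("INSERT OR IGNORE INTO webpages VALUES ('a.html', DATE '2025-04-14')")  # duplicate, skipped
assert con.sql("SELECT count(*) FROM webpages").fetchone()[0] == 1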


def download_wepages(cfg, con):
tbl_w = get_table_definition(cfg=cfg, table_name="webpages")
tbl_pv = get_table_definition(
cfg=cfg, table_name="pollenflug_vorhersage", schema_name="information_layer"
)

    # identify webpages that have not been scraped yet
con.sql(f"""
CREATE OR REPLACE TEMPORARY TABLE tables_not_scrapped AS
WITH _w AS (
SELECT plz, last_modified_date, file
FROM {tbl_w["path"]}
)
, _pv AS (
        SELECT last_update_dt AS last_modified_date, plz
FROM {tbl_pv["path"]}
)
SELECT
_w.file,
_w.last_modified_date
FROM _w LEFT JOIN _pv USING(plz, last_modified_date)
WHERE TRUE
AND _pv.plz IS NULL
""")
    # download the matching webpages
df = con.sql(f"""
SELECT
file, content, plz, last_modified_date
FROM {tbl_w["path"]}
        INNER JOIN tables_not_scrapped USING(file, last_modified_date)
""").pl()
return df
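download_wepages pairs with upload_webpage_to_db: the LEFT JOIN with the IS NULL filter is an anti-join that keeps only webpages whose (plz, last_modified_date) has no row in the forecast table yet, so repeated runs only parse new snapshots. A usage sketch (editor's, assuming prepare_db returns the connection, as its docstring suggests):

# Editor's sketch, not part of the PR.
cfg = get_config()
con = prepare_db(cfg)
webpages = download_wepages(cfg=cfg, con=con)  # polars DataFrame: file, content, plz, last_modified_date
for row in webpages.iter_rows(named=True):
    print(row["plz"], row["last_modified_date"], len(row["content"]))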

Large diffs are not rendered by default.
