From 1c91ce611d7d76bf13148275bda57e200aad31b2 Mon Sep 17 00:00:00 2001 From: idelcano Date: Wed, 21 May 2025 13:36:01 +0200 Subject: [PATCH 01/23] added documents marked as to be removed clean script --- .../file_garbage_remover.py | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 DHIS2/post_clone_scripts/file_garbage_remover.py diff --git a/DHIS2/post_clone_scripts/file_garbage_remover.py b/DHIS2/post_clone_scripts/file_garbage_remover.py new file mode 100644 index 00000000..cf7cb107 --- /dev/null +++ b/DHIS2/post_clone_scripts/file_garbage_remover.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 + +import subprocess +import sys +import argparse + + +LOCAL_SQL_FILE = "/tmp/find_orphan_files.sql" +LOCAL_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" +REMOTE_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" +FILE_BASE_PATH = "/DHIS2_home/files" + + +SQL_QUERY = """ + SELECT storagekey + FROM fileresource fr + WHERE NOT EXISTS ( + SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid + ) + AND fr.uid NOT IN ( + SELECT url FROM document + ) + AND fr.domain = 'DOCUMENT'; +""" + + +def run(command, check=True, capture=False): + """Run a shell command.""" + print(f"$ {' '.join(command)}") + return subprocess.run(command, check=check, capture_output=capture, text=True) + + +def find_container_id(name_pattern): + """Find the container ID based on a name substring.""" + result = run(["docker", "ps", "--format", "{{.ID}}\t{{.Names}}"], capture=True) + for line in result.stdout.strip().splitlines(): + cid, name = line.split("\t") + if name_pattern in name: + return cid + return None + + +def slugify(instance_name): + """Convert instance name to container name slug.""" + return "d2-docker-" + instance_name.replace("/", "-").replace(":", "-").replace(".", "-") + + +def main(): + parser = argparse.ArgumentParser(description="Remove orphaned DHIS2 file resources from Core container.") + parser.add_argument("-i", "--instance", required=True, help="d2-docker instance name (e.g. /usr/bin/python3 file_garbage_remover.py -i docker.eyeseetea.com/widpit/dhis2-data:2.42-widp-preprod-cont-indiv)") + args = parser.parse_args() + + instance_name = args.instance + container_slug = slugify(instance_name) + db_container_match = container_slug + "-db-1" + core_container_match = container_slug + "-core-1" + + print("Writing SQL file...") + with open(LOCAL_SQL_FILE, "w") as f: + f.write(SQL_QUERY) + + print("Running SQL with d2-docker and capturing output...") + result = run(["d2-docker", "run-sql", "-i", instance_name, LOCAL_SQL_FILE], capture=True) + + print("Saving result to file...") + with open(LOCAL_LIST_FILE, "w") as f: + for line in result.stdout.strip().splitlines(): + line = line.strip() + if line and not line.lower().startswith("storagekey"): + f.write(f"{line}\n") + + print("Identifying containers...") + db_container = find_container_id(db_container_match.replace("dhis2-data-", "")) + core_container = find_container_id(core_container_match.replace("dhis2-data-","")) + print(db_container) + print(core_container) + if not db_container or not core_container: + print("Could not find DB or Core container.") + print(f"Looked for: {db_container_match}, {core_container_match}") + sys.exit(1) + + print(f"DB container: {db_container}") + print(f"Core container: {core_container}") + + print("Copying file list to Core container...") + run(["docker", "cp", LOCAL_LIST_FILE, f"{core_container}:{REMOTE_LIST_FILE}"]) + + print(f"Deleting orphaned files in Core container... 
{core_container} {db_container_match}") + delete_cmd = f""" + bash -c ' + if [ ! -f "{REMOTE_LIST_FILE}" ]; then + echo "File list not found: {REMOTE_LIST_FILE}" + exit 1 + fi + + while IFS= read -r key; do + fullpath="{FILE_BASE_PATH}/$key" + echo "Deleting files: $fullpath*" + rm -v "$fullpath"* 2>/dev/null || echo "Nothing to delete for $fullpath" + done < "{REMOTE_LIST_FILE}" + ' + """ + + run(["docker", "exec", core_container, "bash", "-c", delete_cmd]) + + print("Cleanup complete.") + + +if __name__ == "__main__": + main() From fe1ce44b170fc6700d1126e47127d9f254e57712 Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 7 Jul 2025 18:59:00 +0200 Subject: [PATCH 02/23] rename folder --- .../file_garbage_remover.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename DHIS2/{post_clone_scripts => file_garbage_remover}/file_garbage_remover.py (100%) diff --git a/DHIS2/post_clone_scripts/file_garbage_remover.py b/DHIS2/file_garbage_remover/file_garbage_remover.py similarity index 100% rename from DHIS2/post_clone_scripts/file_garbage_remover.py rename to DHIS2/file_garbage_remover/file_garbage_remover.py From 776c7efe9ddac9221442b95b4592967dfe15f213 Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 7 Jul 2025 18:59:38 +0200 Subject: [PATCH 03/23] remove unnecesary code and simplify script --- .../file_garbage_remover.py | 25 ++++++------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover.py b/DHIS2/file_garbage_remover/file_garbage_remover.py index cf7cb107..ff8e1fea 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover.py @@ -31,13 +31,10 @@ def run(command, check=True, capture=False): def find_container_id(name_pattern): - """Find the container ID based on a name substring.""" - result = run(["docker", "ps", "--format", "{{.ID}}\t{{.Names}}"], capture=True) - for line in result.stdout.strip().splitlines(): - cid, name = line.split("\t") - if name_pattern in name: - return cid - return None + """Find the container ID based on a name substring. Retrieves first match only""" + result = run(["docker", "ps", "--format", "{{.ID}}", "-f", f"name={name_pattern}"], capture=True) + lines = result.stdout.strip().splitlines() + return lines[0] if lines else None def slugify(instance_name): @@ -52,8 +49,7 @@ def main(): instance_name = args.instance container_slug = slugify(instance_name) - db_container_match = container_slug + "-db-1" - core_container_match = container_slug + "-core-1" + core_container_match = container_slug.replace("dhis2-data-", "") + "-core-1" print("Writing SQL file...") with open(LOCAL_SQL_FILE, "w") as f: @@ -70,22 +66,15 @@ def main(): f.write(f"{line}\n") print("Identifying containers...") - db_container = find_container_id(db_container_match.replace("dhis2-data-", "")) - core_container = find_container_id(core_container_match.replace("dhis2-data-","")) - print(db_container) + core_container = find_container_id(core_container_match) print(core_container) - if not db_container or not core_container: - print("Could not find DB or Core container.") - print(f"Looked for: {db_container_match}, {core_container_match}") - sys.exit(1) - print(f"DB container: {db_container}") print(f"Core container: {core_container}") print("Copying file list to Core container...") run(["docker", "cp", LOCAL_LIST_FILE, f"{core_container}:{REMOTE_LIST_FILE}"]) - print(f"Deleting orphaned files in Core container... 
{core_container} {db_container_match}") + print(f"Deleting orphaned files in Core container... {core_container}") delete_cmd = f""" bash -c ' if [ ! -f "{REMOTE_LIST_FILE}" ]; then From bdf37f41c6cd5056150bdf7aada823af0377df41 Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 7 Jul 2025 19:00:05 +0200 Subject: [PATCH 04/23] rename file --- .../file_garbage_remover_docker.py | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 DHIS2/file_garbage_remover/file_garbage_remover_docker.py diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py new file mode 100644 index 00000000..ff8e1fea --- /dev/null +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +import subprocess +import sys +import argparse + + +LOCAL_SQL_FILE = "/tmp/find_orphan_files.sql" +LOCAL_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" +REMOTE_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" +FILE_BASE_PATH = "/DHIS2_home/files" + + +SQL_QUERY = """ + SELECT storagekey + FROM fileresource fr + WHERE NOT EXISTS ( + SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid + ) + AND fr.uid NOT IN ( + SELECT url FROM document + ) + AND fr.domain = 'DOCUMENT'; +""" + + +def run(command, check=True, capture=False): + """Run a shell command.""" + print(f"$ {' '.join(command)}") + return subprocess.run(command, check=check, capture_output=capture, text=True) + + +def find_container_id(name_pattern): + """Find the container ID based on a name substring. Retrieves first match only""" + result = run(["docker", "ps", "--format", "{{.ID}}", "-f", f"name={name_pattern}"], capture=True) + lines = result.stdout.strip().splitlines() + return lines[0] if lines else None + + +def slugify(instance_name): + """Convert instance name to container name slug.""" + return "d2-docker-" + instance_name.replace("/", "-").replace(":", "-").replace(".", "-") + + +def main(): + parser = argparse.ArgumentParser(description="Remove orphaned DHIS2 file resources from Core container.") + parser.add_argument("-i", "--instance", required=True, help="d2-docker instance name (e.g. /usr/bin/python3 file_garbage_remover.py -i docker.eyeseetea.com/widpit/dhis2-data:2.42-widp-preprod-cont-indiv)") + args = parser.parse_args() + + instance_name = args.instance + container_slug = slugify(instance_name) + core_container_match = container_slug.replace("dhis2-data-", "") + "-core-1" + + print("Writing SQL file...") + with open(LOCAL_SQL_FILE, "w") as f: + f.write(SQL_QUERY) + + print("Running SQL with d2-docker and capturing output...") + result = run(["d2-docker", "run-sql", "-i", instance_name, LOCAL_SQL_FILE], capture=True) + + print("Saving result to file...") + with open(LOCAL_LIST_FILE, "w") as f: + for line in result.stdout.strip().splitlines(): + line = line.strip() + if line and not line.lower().startswith("storagekey"): + f.write(f"{line}\n") + + print("Identifying containers...") + core_container = find_container_id(core_container_match) + print(core_container) + + print(f"Core container: {core_container}") + + print("Copying file list to Core container...") + run(["docker", "cp", LOCAL_LIST_FILE, f"{core_container}:{REMOTE_LIST_FILE}"]) + + print(f"Deleting orphaned files in Core container... {core_container}") + delete_cmd = f""" + bash -c ' + if [ ! 
-f "{REMOTE_LIST_FILE}" ]; then + echo "File list not found: {REMOTE_LIST_FILE}" + exit 1 + fi + + while IFS= read -r key; do + fullpath="{FILE_BASE_PATH}/$key" + echo "Deleting files: $fullpath*" + rm -v "$fullpath"* 2>/dev/null || echo "Nothing to delete for $fullpath" + done < "{REMOTE_LIST_FILE}" + ' + """ + + run(["docker", "exec", core_container, "bash", "-c", delete_cmd]) + + print("Cleanup complete.") + + +if __name__ == "__main__": + main() From 6f7613be830722bd477041890850b84c181852a4 Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 7 Jul 2025 19:03:43 +0200 Subject: [PATCH 05/23] Added tomcat version --- .../file_garbage_remover_tomcat.py | 181 ++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py new file mode 100644 index 00000000..a1e03f5e --- /dev/null +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +import re + +import psycopg2 +import shutil +import os +import argparse +import json +import sys +from datetime import datetime + +SQL_FIND_ORPHANS = """ + SELECT fileresourceid, storagekey + FROM fileresource fr + WHERE NOT EXISTS ( + SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid + ) + AND fr.uid NOT IN ( + SELECT url FROM document + ) + AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; +""" + +SQL_INSERT_AUDIT = """ + INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +SQL_DELETE_ORIGINAL = """ + DELETE FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +def log(message): + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if isinstance(message, str): + # Detect pattern like: postgresql://user:password@host + message = re.sub( + r"(postgresql://[^:]+:)([^@]+)(@)", + r"\1*****\3", + message + ) + print(f"[{timestamp}] {message}") + +def load_config(path): + with open(path, "r") as f: + return json.load(f) + +def get_matching_document_files(storage_key, base_dir): + storage_key = storage_key.replace("document/", "") + base_dir = os.path.join(base_dir, "document") + matching = [] + + for filename in os.listdir(base_dir): + if filename.startswith(storage_key): + fullpath = os.path.join(base_dir, filename) + if os.path.isfile(fullpath): + matching.append(fullpath) + return matching + +def update_db(fileresourceid, cursor, conn, dry_run=False): + if dry_run: + log("[DRY RUN] Would execute:") + log(f" {SQL_INSERT_AUDIT.strip()} with fileresourceid = '{fileresourceid}'") + log(f" {SQL_DELETE_ORIGINAL.strip()} with fileresourceid = '{fileresourceid}'") + else: + try: + cursor.execute(SQL_INSERT_AUDIT, {"fid": fileresourceid}) + cursor.execute(SQL_DELETE_ORIGINAL, {"fid": fileresourceid}) + conn.commit() + log(f"Archived and deleted fileresourceid {fileresourceid}") + except Exception as e: + log(f"❌ Failed DB update for fileresourceid {fileresourceid}: {e}") + conn.rollback() + raise + +def move_files(file_list, file_base_path, temp_file_path, dry_run=False): + base_dir = os.path.join(file_base_path, "document") + dest_dir = os.path.join(temp_file_path, "document") + for src in file_list: + rel_path = os.path.relpath(src, base_dir) + log(rel_path) + dst = os.path.join(dest_dir, rel_path) + log(dst) + dst_dir = os.path.dirname(dst) + log(dst_dir) + try: + os.makedirs(dst_dir, exist_ok=True) + except Exception as e: + log(f"❌ Failed 
to create directory {dst_dir}: {e}") + raise + + log(f"{'[DRY RUN] ' if dry_run else ''}Moving {src} -> {dst}") + if not dry_run: + try: + shutil.move(src, dst) + except Exception as e: + log(f"❌ Failed to move {src}: {e}") + raise + + + + +def ensure_audit_table_exists(cursor, dry_run=False): + if dry_run: + log("[DRY RUN] Would execute:") + log(" CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA;") + else: + log("Ensuring 'fileresourcesaudit' table exists...") + cursor.execute(""" + CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; + """) + log("'fileresourcesaudit' table ready.") + +def main(): + parser = argparse.ArgumentParser(description="Move orphaned DHIS2 file resources and archive DB entries.") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("--test", action="store_true", help="Run in dry-run mode (no changes).") + group.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") + parser.add_argument("--config", required=True, help="Path to config.json file.") + args = parser.parse_args() + + config = load_config(args.config) + required_keys = ["db_host", "db_port", "db_name", "db_user", "file_base_path", "temp_file_path"] + missing_keys = [k for k in required_keys if not config.get(k)] + + if missing_keys: + log(f"❌ Missing required config keys: {', '.join(missing_keys)}") + sys.exit(1) + + db_password = os.environ.get("DB_PASSWORD_FILE_G") + if not db_password: + log("❌ Missing required environment variable: DB_PASSWORD") + sys.exit(1) + + db_url = f"postgresql://{config['db_user']}:{db_password}@{config['db_host']}:{config['db_port']}/{config['db_name']}" + file_base_path = config["file_base_path"] + temp_file_path = config["temp_file_path"] + + if not os.path.isdir(file_base_path) or not os.path.isdir(temp_file_path): + log("❌ Both 'file_base_path' and 'temp_file_path' must exist and be directories.") + log(f" file_base_path: {file_base_path} -> {'OK' if os.path.isdir(file_base_path) else 'INVALID'}") + log(f" temp_file_path: {temp_file_path} -> {'OK' if os.path.isdir(temp_file_path) else 'INVALID'}") + sys.exit(1) + + dry_run = args.test + log(f"{'Running in TEST mode' if dry_run else 'Running in FORCE mode'}") + log(f"Connecting to database: {db_url}") + log(f"File base path: {file_base_path}") + log(f"Temporary move path: {temp_file_path}") + + with psycopg2.connect(dsn=db_url) as conn: + with conn.cursor() as cur: + ensure_audit_table_exists(cur, dry_run) + log("Querying orphaned fileresource entries...") + cur.execute(SQL_FIND_ORPHANS) + rows = cur.fetchall() + + if not rows: + log("✅ No orphaned fileresource entries found.") + return + + log(f"Found {len(rows)} orphaned entries.") + count = 0 + for fileresourceid, storagekey in rows: + if not fileresourceid: + log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") + continue + + try: + matches = get_matching_document_files(storagekey, file_base_path) + if not matches: + raise Exception(f"No files found for storage_key: {storagekey}") + move_files(matches, file_base_path, temp_file_path, dry_run) + update_db(fileresourceid, cur, conn, dry_run) + except Exception as e: + log(f"❌ Aborting due to error with fileresourceid {fileresourceid}: {e}") + sys.exit(1) + count = count + 1 + log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") + +if __name__ == "__main__": + main() From 546761127482e522223d80bf4c0f3418890b7d48 Mon Sep 17 
00:00:00 2001 From: idelcano Date: Mon, 7 Jul 2025 19:03:57 +0200 Subject: [PATCH 06/23] remove duplicate old file --- .../file_garbage_remover.py | 99 ------------------- 1 file changed, 99 deletions(-) delete mode 100644 DHIS2/file_garbage_remover/file_garbage_remover.py diff --git a/DHIS2/file_garbage_remover/file_garbage_remover.py b/DHIS2/file_garbage_remover/file_garbage_remover.py deleted file mode 100644 index ff8e1fea..00000000 --- a/DHIS2/file_garbage_remover/file_garbage_remover.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 - -import subprocess -import sys -import argparse - - -LOCAL_SQL_FILE = "/tmp/find_orphan_files.sql" -LOCAL_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" -REMOTE_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" -FILE_BASE_PATH = "/DHIS2_home/files" - - -SQL_QUERY = """ - SELECT storagekey - FROM fileresource fr - WHERE NOT EXISTS ( - SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid - ) - AND fr.uid NOT IN ( - SELECT url FROM document - ) - AND fr.domain = 'DOCUMENT'; -""" - - -def run(command, check=True, capture=False): - """Run a shell command.""" - print(f"$ {' '.join(command)}") - return subprocess.run(command, check=check, capture_output=capture, text=True) - - -def find_container_id(name_pattern): - """Find the container ID based on a name substring. Retrieves first match only""" - result = run(["docker", "ps", "--format", "{{.ID}}", "-f", f"name={name_pattern}"], capture=True) - lines = result.stdout.strip().splitlines() - return lines[0] if lines else None - - -def slugify(instance_name): - """Convert instance name to container name slug.""" - return "d2-docker-" + instance_name.replace("/", "-").replace(":", "-").replace(".", "-") - - -def main(): - parser = argparse.ArgumentParser(description="Remove orphaned DHIS2 file resources from Core container.") - parser.add_argument("-i", "--instance", required=True, help="d2-docker instance name (e.g. /usr/bin/python3 file_garbage_remover.py -i docker.eyeseetea.com/widpit/dhis2-data:2.42-widp-preprod-cont-indiv)") - args = parser.parse_args() - - instance_name = args.instance - container_slug = slugify(instance_name) - core_container_match = container_slug.replace("dhis2-data-", "") + "-core-1" - - print("Writing SQL file...") - with open(LOCAL_SQL_FILE, "w") as f: - f.write(SQL_QUERY) - - print("Running SQL with d2-docker and capturing output...") - result = run(["d2-docker", "run-sql", "-i", instance_name, LOCAL_SQL_FILE], capture=True) - - print("Saving result to file...") - with open(LOCAL_LIST_FILE, "w") as f: - for line in result.stdout.strip().splitlines(): - line = line.strip() - if line and not line.lower().startswith("storagekey"): - f.write(f"{line}\n") - - print("Identifying containers...") - core_container = find_container_id(core_container_match) - print(core_container) - - print(f"Core container: {core_container}") - - print("Copying file list to Core container...") - run(["docker", "cp", LOCAL_LIST_FILE, f"{core_container}:{REMOTE_LIST_FILE}"]) - - print(f"Deleting orphaned files in Core container... {core_container}") - delete_cmd = f""" - bash -c ' - if [ ! 
-f "{REMOTE_LIST_FILE}" ]; then
-        echo "File list not found: {REMOTE_LIST_FILE}"
-        exit 1
-    fi
-
-    while IFS= read -r key; do
-        fullpath="{FILE_BASE_PATH}/$key"
-        echo "Deleting files: $fullpath*"
-        rm -v "$fullpath"* 2>/dev/null || echo "Nothing to delete for $fullpath"
-    done < "{REMOTE_LIST_FILE}"
-    '
-    """
-
-    run(["docker", "exec", core_container, "bash", "-c", delete_cmd])
-
-    print("Cleanup complete.")
-
-
-if __name__ == "__main__":
-    main()

From b047fcedb6fa3aedbe34c9910899090f66e3c116 Mon Sep 17 00:00:00 2001
From: idelcano
Date: Mon, 7 Jul 2025 19:09:04 +0200
Subject: [PATCH 07/23] Added readme

---
 DHIS2/file_garbage_remover/Readme.md | 72 ++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 DHIS2/file_garbage_remover/Readme.md

diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md
new file mode 100644
index 00000000..faf1ff77
--- /dev/null
+++ b/DHIS2/file_garbage_remover/Readme.md
@@ -0,0 +1,72 @@
+# DHIS2 File Garbage Remover Scripts
+
+This folder contains two Python scripts designed to manage orphaned resources in DHIS2 instances.
+These scripts identify files with no database references and take appropriate actions depending on the environment, production (tomcat) or testing (docker).
+
+Included Scripts
+
+## file_garbage_remover_tomcat.py
+
+### Description:
+
+Designed for production environments. Identifies orphaned file resources (only documents for now), moves the files to a temporary directory, and archives corresponding database entries into a special table(fileresourcesaudit) before deleting them from the original database table.
+
+### Usage:
+
+Run the script in either test or force mode.
+
+Test Mode (dry-run):
+
+```./file_garbage_remover_tomcat.py --test --config /path/to/config.json```
+
+Force Mode (apply changes):
+
+```
+export DB_PASSWORD_FILE_G='your_password'
+./file_garbage_remover_tomcat.py --force --config /path/to/config.json
+```
+
+config.json File Requirements:
+```
+{
+    "db_host": "",
+    "db_port": "",
+    "db_name": "",
+    "db_user": "",
+    "file_base_path": "",
+    "temp_file_path": ""
+}
+```
+
+Ensure file_base_path and temp_file_path exist and are valid directories.
+
+Password should be provided through the environment variable DB_PASSWORD_FILE_G.
+
+## file_garbage_remover_docker.py
+
+### Description:
+
+Intended for d2-docker testing environments.
+Identifies orphaned files in a Dockerized DHIS2 instance and directly deletes them from the container. This script does not archive or move files, permanently removing identified resources.
+
+### Usage:
+
+Run the script specifying the Docker DHIS2 instance:
+
+```
+./file_garbage_remover_docker.py --instance docker.eyeseetea.com/project/dhis2-data:2.41-test
+```
+
+### Operation:
+
+Executes an SQL query within the DHIS2 container using d2-docker run-sql.
+
+Copies the generated file list into the container.
+
+Deletes identified files directly inside the container.
+
+# Precautions
+
+Production Environments: Always use --test mode before using --force to verify intended changes.
+
+Docker/Test Environments: Files deleted by file_garbage_remover_docker.py cannot be recovered.
\ No newline at end of file

From b98b7bb7e2f7b45ee2ff648d6ec318a5da453c69 Mon Sep 17 00:00:00 2001
From: idelcano
Date: Mon, 14 Jul 2025 17:15:36 +0200
Subject: [PATCH 08/23] fix readme

---
 DHIS2/file_garbage_remover/Readme.md | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md
index faf1ff77..8a259966 100644
--- a/DHIS2/file_garbage_remover/Readme.md
+++ b/DHIS2/file_garbage_remover/Readme.md
@@ -17,7 +17,10 @@ Run the script in either test or force mode.
 Test Mode (dry-run):
 
-```./file_garbage_remover_tomcat.py --test --config /path/to/config.json```
+```
+export DB_PASSWORD_FILE_G='your_password'
+./file_garbage_remover_tomcat.py --test --config /path/to/config.json
+```
 
 Force Mode (apply changes):
 
 ```
@@ -42,6 +45,19 @@ Ensure file_base_path and temp_file_path exist and are valid directories.
 
 Password should be provided through the environment variable DB_PASSWORD_FILE_G.
 
+Bash wrapper example:
+```
+#!/bin/bash
+MODE="${1:-test}"
+[[ "$MODE" != "test" && "$MODE" != "force" ]] && echo "Invalid mode." && exit 1
+
+DB_PASSWORD_FILE_G=db_password
+
+python3 /path/to/script/bin/file_garbage_remover/file_garbage_remover_tomcat.py --config /path/to/script/bin/file_garbage_remover/config.json --$MODE 2>&1 | tee -a /path/to/logs/orphan_cleanup.log
+
+unset DB_PASSWORD_FILE_G
+```
+
 ## file_garbage_remover_docker.py
 
 ### Description:

From 3c6062ad8b7e06319ae124337f5cda6e1607ab5e Mon Sep 17 00:00:00 2001
From: idelcano
Date: Mon, 14 Jul 2025 17:25:45 +0200
Subject: [PATCH 09/23] Refactoring to remove magic values and fix log comment

---
 .../file_garbage_remover_tomcat.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
index a1e03f5e..5268f053 100644
--- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
+++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
@@ -29,6 +29,10 @@
     DELETE FROM fileresource WHERE fileresourceid = %(fid)s;
 """
 
+SQL_CREATE_TABLE_IF_NOT_EXIST = """
+    CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA;
+"""
+
 def log(message):
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     if isinstance(message, str):
@@ -59,8 +63,10 @@ def get_matching_document_files(storage_key, base_dir):
 def update_db(fileresourceid, cursor, conn, dry_run=False):
     if dry_run:
         log("[DRY RUN] Would execute:")
-        log(f" {SQL_INSERT_AUDIT.strip()} with fileresourceid = '{fileresourceid}'")
-        log(f" {SQL_DELETE_ORIGINAL.strip()} with fileresourceid = '{fileresourceid}'")
+        log(f" {SQL_INSERT_AUDIT.strip()}")
+        log(f" with parameters: {{'fid': '{fileresourceid}'}}")
+        log(f" {SQL_DELETE_ORIGINAL.strip()} ")
+        log(f" with parameters: {{'fid': '{fileresourceid}'}}")
     else:
         try:
             cursor.execute(SQL_INSERT_AUDIT, {"fid": fileresourceid})
             cursor.execute(SQL_DELETE_ORIGINAL, {"fid": fileresourceid})
             conn.commit()
             log(f"Archived and deleted fileresourceid {fileresourceid}")
         except Exception as e:
             log(f"❌ Failed DB update for fileresourceid {fileresourceid}: {e}")
             conn.rollback()
             raise
@@ -102,12 +108,10 @@ def move_files(file_list, file_base_path, temp_file_path, dry_run=False):
 def ensure_audit_table_exists(cursor, dry_run=False):
     if dry_run:
         log("[DRY RUN] Would execute:")
-        log(" CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA;")
+        log(SQL_CREATE_TABLE_IF_NOT_EXIST)
     else:
         log("Ensuring 'fileresourcesaudit' table exists...")
-        cursor.execute("""
-            CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA;
-        """)
+        cursor.execute(SQL_CREATE_TABLE_IF_NOT_EXIST)
         log("'fileresourcesaudit' table ready.")

From 0de1af1273b3a65068d31a83b2d98fa31de0939c Mon Sep 17 00:00:00 2001
From: idelcano
Date: Mon, 14 Jul 2025 17:27:56 +0200
Subject: [PATCH 10/23] change db variable name

---
 DHIS2/file_garbage_remover/Readme.md | 10 +++++-----
 .../file_garbage_remover_tomcat.py   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md
index 8a259966..a71416cf 100644
--- a/DHIS2/file_garbage_remover/Readme.md
+++ b/DHIS2/file_garbage_remover/Readme.md
@@ -18,14 +18,14 @@ Run the script in either test or force mode.
 Test Mode (dry-run):
 
 ```
-export DB_PASSWORD_FILE_G='your_password'
+export DB_PASSWORD_FG='your_password'
 ./file_garbage_remover_tomcat.py --test --config /path/to/config.json
 ```
 
 Force Mode (apply changes):
 
 ```
-export DB_PASSWORD_FILE_G='your_password'
+export DB_PASSWORD_FG='your_password'
 ./file_garbage_remover_tomcat.py --force --config /path/to/config.json
 ```
@@ -43,7 +43,7 @@ config.json File Requirements:
 
 Ensure file_base_path and temp_file_path exist and are valid directories.
 
-Password should be provided through the environment variable DB_PASSWORD_FILE_G.
+Password should be provided through the environment variable DB_PASSWORD_FG.
 
 Bash wrapper example:
 ```
@@ -51,11 +51,11 @@ MODE="${1:-test}"
 [[ "$MODE" != "test" && "$MODE" != "force" ]] && echo "Invalid mode." && exit 1
 
-DB_PASSWORD_FILE_G=db_password
+DB_PASSWORD_FG=db_password
 
 python3 /path/to/script/bin/file_garbage_remover/file_garbage_remover_tomcat.py --config /path/to/script/bin/file_garbage_remover/config.json --$MODE 2>&1 | tee -a /path/to/logs/orphan_cleanup.log
 
-unset DB_PASSWORD_FILE_G
+unset DB_PASSWORD_FG
 ```
 
 ## file_garbage_remover_docker.py

diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
index 5268f053..2e064281 100644
--- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
+++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
@@ -130,7 +130,7 @@ def main():
         log(f"❌ Missing required config keys: {', '.join(missing_keys)}")
         sys.exit(1)
 
-    db_password = os.environ.get("DB_PASSWORD_FILE_G")
+    db_password = os.environ.get("DB_PASSWORD_FG")
     if not db_password:
         log("❌ Missing required environment variable: DB_PASSWORD")
         sys.exit(1)

From 9ebf3e31107a8a9525aa08fcd631ff1f14bcd2eb Mon Sep 17 00:00:00 2001
From: Carlos Garcia Bautista
Date: Thu, 17 Jul 2025 15:03:54 +0000
Subject: [PATCH 11/23] Edited error reported to the user to reflect the correct name of the environmental variable

---
 DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
index 2e064281..b813c8d0 100644
--- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
+++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
@@ -132,7 +132,7 @@ def main():
     db_password = os.environ.get("DB_PASSWORD_FG")
     if not db_password:
-        log("❌ Missing required environment variable: DB_PASSWORD")
+        log("❌ Missing required environment variable: DB_PASSWORD_FG")
         sys.exit(1)
 
     db_url = f"postgresql://{config['db_user']}:{db_password}@{config['db_host']}:{config['db_port']}/{config['db_name']}"
     file_base_path = config["file_base_path"]

From 8048428f4e6b172f2570264ce173bc8dfb639ba0 Mon Sep 17 00:00:00 2001
From: idelcano
Date: Fri, 25 Jul 2025 16:12:54 +0200
Subject: [PATCH 12/23] Update script to clean
data value file resources excluding actives in event, datavalue, trackedentityattribute values. --- DHIS2/file_garbage_remover/Readme.md | 2 +- .../file_garbage_remover_tomcat.py | 177 ++++++++++++++---- 2 files changed, 138 insertions(+), 41 deletions(-) diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md index a71416cf..886a8f5c 100644 --- a/DHIS2/file_garbage_remover/Readme.md +++ b/DHIS2/file_garbage_remover/Readme.md @@ -9,7 +9,7 @@ Included Scripts ### Description: -Designed for production environments. Identifies orphaned file resources (only documents for now), moves the files to a temporary directory, and archives corresponding database entries into a special table(fileresourcesaudit) before deleting them from the original database table. +Designed for production environments. Identifies orphaned file resources (documents or datavalues (files or images attached to a value)), moves the files to a temporary directory, and archives corresponding database entries into a special table(fileresourcesaudit) before deleting them from the original database table. ### Usage: diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index b813c8d0..97660547 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -1,15 +1,15 @@ #!/usr/bin/env python3 -import re - -import psycopg2 -import shutil -import os import argparse import json +import os +import re +import shutil import sys from datetime import datetime -SQL_FIND_ORPHANS = """ +import psycopg2 + +SQL_FIND_ORPHANS_DOCUMENTS = """ SELECT fileresourceid, storagekey FROM fileresource fr WHERE NOT EXISTS ( @@ -21,6 +21,11 @@ AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; """ +SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ + SELECT fileresourceid, uid, storagekey + FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; +""" + SQL_INSERT_AUDIT = """ INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; """ @@ -33,6 +38,25 @@ CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; """ +SQL_EVENT_FILE_UIDS = """ + SELECT eventdatavalues + FROM event + WHERE programstageid + IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid + IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; +""" + +SQL_DATA_VALUE_UIDS = """ + SELECT dv.value + FROM datavalue dv + WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') +""" + +SQL_TRACKER_ATTRIBUTE_UIDS = """ +select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); +""" + + def log(message): timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") if isinstance(message, str): @@ -44,13 +68,45 @@ def log(message): ) print(f"[{timestamp}] {message}") + def load_config(path): with open(path, "r") as f: return json.load(f) -def get_matching_document_files(storage_key, base_dir): - storage_key = storage_key.replace("document/", "") - base_dir = os.path.join(base_dir, "document") + +def get_events(cursor): + cursor.execute(SQL_EVENT_FILE_UIDS) + return [row[0] for row in cursor.fetchall() if row[0]] + + +def get_event_blob(cursor): + event_texts = get_events(cursor) + return "\n".join(json.dumps(e) for e in event_texts) + + +def 
get_data_value_file_resources(cursor): + cursor.execute(""" + SELECT fileresourceid, uid, storagekey + FROM fileresource + WHERE domain = 'DATA_VALUE' + """) + return cursor.fetchall() + + +def get_all_datavalue_uids(cursor): + cursor.execute(SQL_DATA_VALUE_UIDS) + return set(row[0] for row in cursor.fetchall() if row[0]) + + +def get_all_tracker_uids(cursor): + cursor.execute(SQL_TRACKER_ATTRIBUTE_UIDS) + return set(row[0] for row in cursor.fetchall() if row[0]) + + +def get_matching_files(storage_key, base_dir, folder): + # Folder (document or dataValue) + storage_key = storage_key.replace(folder + "/", "") + base_dir = os.path.join(base_dir, folder) matching = [] for filename in os.listdir(base_dir): @@ -60,6 +116,7 @@ def get_matching_document_files(storage_key, base_dir): matching.append(fullpath) return matching + def update_db(fileresourceid, cursor, conn, dry_run=False): if dry_run: log("[DRY RUN] Would execute:") @@ -78,9 +135,10 @@ def update_db(fileresourceid, cursor, conn, dry_run=False): conn.rollback() raise -def move_files(file_list, file_base_path, temp_file_path, dry_run=False): - base_dir = os.path.join(file_base_path, "document") - dest_dir = os.path.join(temp_file_path, "document") + +def move_files(file_list, file_base_path, temp_file_path, folder, dry_run): + base_dir = os.path.join(file_base_path, folder) + dest_dir = os.path.join(temp_file_path, folder) for src in file_list: rel_path = os.path.relpath(src, base_dir) log(rel_path) @@ -103,8 +161,6 @@ def move_files(file_list, file_base_path, temp_file_path, dry_run=False): raise - - def ensure_audit_table_exists(cursor, dry_run=False): if dry_run: log("[DRY RUN] Would execute:") @@ -114,6 +170,7 @@ def ensure_audit_table_exists(cursor, dry_run=False): cursor.execute(SQL_CREATE_TABLE_IF_NOT_EXIST) log("'fileresourcesaudit' table ready.") + def main(): parser = argparse.ArgumentParser(description="Move orphaned DHIS2 file resources and archive DB entries.") group = parser.add_mutually_exclusive_group(required=True) @@ -154,32 +211,72 @@ def main(): with psycopg2.connect(dsn=db_url) as conn: with conn.cursor() as cur: ensure_audit_table_exists(cur, dry_run) - log("Querying orphaned fileresource entries...") - cur.execute(SQL_FIND_ORPHANS) - rows = cur.fetchall() - - if not rows: - log("✅ No orphaned fileresource entries found.") - return - - log(f"Found {len(rows)} orphaned entries.") - count = 0 - for fileresourceid, storagekey in rows: - if not fileresourceid: - log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") - continue - - try: - matches = get_matching_document_files(storagekey, file_base_path) - if not matches: - raise Exception(f"No files found for storage_key: {storagekey}") - move_files(matches, file_base_path, temp_file_path, dry_run) - update_db(fileresourceid, cur, conn, dry_run) - except Exception as e: - log(f"❌ Aborting due to error with fileresourceid {fileresourceid}: {e}") - sys.exit(1) - count = count + 1 - log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") + remove_documents(file_base_path, temp_file_path, dry_run, cur, conn) + remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn) + + +def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn): + log("Querying orphaned fileresource entries...") + cur.execute(SQL_FIND_ORPHANS_DOCUMENTS) + rows = cur.fetchall() + + if not rows: + log("✅ No orphaned fileresource entries found.") + return + + log(f"Found {len(rows)} \"document\" orphaned entries.") + 
process_orphan_files(rows, file_base_path, temp_file_path, dry_run, cur, conn, "document") + + +def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn): + log("Querying orphaned fileresource entries...") + cur.execute(SQL_FIND_DATA_VALUES_FILE_RESOURCES) + rows = cur.fetchall() + + # get all file resource datavalues + datavalue_uids = get_all_datavalue_uids(cur) + # get all events with file dataelement values + event_blob = get_event_blob(cur) + + tracker_uids = get_all_tracker_uids(cur) + # 3. Filter out referenced file resources + orphan_data_values = [] + for fileresourceid, uid, storagekey in rows: + if uid in datavalue_uids: + continue # Referenced in datavalue + if uid in event_blob: + continue # Referenced in event + if uid in tracker_uids: + continue # referenced in a trackerentityattribute + orphan_data_values.append((fileresourceid, storagekey)) + + if not orphan_data_values: + log("✅ No fileResources entries found.") + return + + log(f"Found {len(orphan_data_values)} \"data_value\" orphaned entries.") + process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, "dataValue") + + +def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, folder): + count = 0 + for fileresourceid, storagekey in orphan_data_values: + if not fileresourceid: + log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") + continue + try: + matches = get_matching_files(storagekey, file_base_path, "dataValue") + if not matches: + log(f"No files found for storage_key: {storagekey}") + continue + move_files(matches, file_base_path, temp_file_path, folder, dry_run) + update_db(fileresourceid, cur, conn, dry_run) + except Exception as e: + log(f"❌ Aborting due to error with fileresourceid {fileresourceid}: {e}") + sys.exit(1) + count = count + 1 + log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") + if __name__ == "__main__": main() From 274d5d58e9d5b7c0b0b537169bb06e04224f63df Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 24 Nov 2025 18:52:07 +0100 Subject: [PATCH 13/23] update to add notifications and refactor --- DHIS2/file_garbage_remover/cleaner.py | 301 ++++++++++++++++++ DHIS2/file_garbage_remover/csv_utils.py | 85 +++++ .../file_garbage_remover_tomcat.py | 120 +++++-- DHIS2/file_garbage_remover/main.py | 71 +++++ DHIS2/file_garbage_remover/notifier.py | 68 ++++ 5 files changed, 625 insertions(+), 20 deletions(-) create mode 100644 DHIS2/file_garbage_remover/cleaner.py create mode 100644 DHIS2/file_garbage_remover/csv_utils.py create mode 100644 DHIS2/file_garbage_remover/main.py create mode 100644 DHIS2/file_garbage_remover/notifier.py diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py new file mode 100644 index 00000000..2e7ccd37 --- /dev/null +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -0,0 +1,301 @@ +#!/usr/bin/env python3 +import json +import os +import re +import shutil +import sys +from datetime import datetime + +import subprocess +import psycopg2 + +from csv_utils import append_items, deduplicate_items + +SQL_FIND_ORPHANS_DOCUMENTS = """ + SELECT fileresourceid, storagekey, name, created + FROM fileresource fr + WHERE NOT EXISTS ( + SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid + ) + AND fr.uid NOT IN ( + SELECT url FROM document + ) + AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; +""" + +SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ + SELECT fileresourceid, uid, 
storagekey, name, created + FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; +""" + +SQL_INSERT_AUDIT = """ + INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +SQL_DELETE_ORIGINAL = """ + DELETE FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +SQL_CREATE_TABLE_IF_NOT_EXIST = """ + CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; +""" + +SQL_EVENT_FILE_UIDS = """ + SELECT eventdatavalues + FROM event + WHERE programstageid + IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid + IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; +""" + +SQL_DATA_VALUE_UIDS = """ + SELECT dv.value + FROM datavalue dv + WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') +""" + +SQL_TRACKER_ATTRIBUTE_UIDS = """ +select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); +""" + + +def log(message): + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if isinstance(message, str): + message = re.sub( + r"(postgresql://[^:]+:)([^@]+)(@)", + r"\1*****\3", + message + ) + print(f"[{timestamp}] {message}") + + +def load_config(path): + with open(path, "r") as f: + return json.load(f) + + +def get_events(cursor): + cursor.execute(SQL_EVENT_FILE_UIDS) + return [row[0] for row in cursor.fetchall() if row[0]] + + +def get_event_blob(cursor): + event_texts = get_events(cursor) + return "\n".join(json.dumps(e) for e in event_texts) + + +def get_all_datavalue_uids(cursor): + cursor.execute(SQL_DATA_VALUE_UIDS) + return set(row[0] for row in cursor.fetchall() if row[0]) + + +def get_all_tracker_uids(cursor): + cursor.execute(SQL_TRACKER_ATTRIBUTE_UIDS) + return set(row[0] for row in cursor.fetchall() if row[0]) + + +def get_matching_files(storage_key, base_dir, folder): + storage_key = storage_key.replace(folder + "/", "") + base_dir = os.path.join(base_dir, folder) + if not os.path.isdir(base_dir): + log(f"⚠️ Skipping search: base directory does not exist: {base_dir}") + return [] + matching = [] + for filename in os.listdir(base_dir): + if filename.startswith(storage_key): + fullpath = os.path.join(base_dir, filename) + if os.path.isfile(fullpath): + matching.append(fullpath) + return matching + + +def update_db(fileresourceid, cursor, conn, dry_run=False): + if dry_run: + log("[DRY RUN] Would execute:") + log(f" {SQL_INSERT_AUDIT.strip()}") + log(f" with parameters: {{'fid': '{fileresourceid}'}}") + log(f" {SQL_DELETE_ORIGINAL.strip()} ") + log(f" with parameters: {{'fid': '{fileresourceid}'}}") + else: + try: + cursor.execute(SQL_INSERT_AUDIT, {"fid": fileresourceid}) + cursor.execute(SQL_DELETE_ORIGINAL, {"fid": fileresourceid}) + conn.commit() + log(f"Archived and deleted fileresourceid {fileresourceid}") + except Exception as e: + log(f"❌ Failed DB update for fileresourceid {fileresourceid}: {e}") + conn.rollback() + raise + + +def move_files(file_list, file_base_path, temp_file_path, folder, dry_run): + base_dir = os.path.join(file_base_path, folder) + dest_dir = os.path.join(temp_file_path, folder) + for src in file_list: + rel_path = os.path.relpath(src, base_dir) + dst = os.path.join(dest_dir, rel_path) + dst_dir = os.path.dirname(dst) + os.makedirs(dst_dir, exist_ok=True) + log(f"{'[DRY RUN] ' if dry_run else ''}Moving {src} -> {dst}") + if not dry_run: + shutil.move(src, dst) + + +def 
ensure_audit_table_exists(cursor, dry_run=False): + if dry_run: + log("[DRY RUN] Would execute:") + log(SQL_CREATE_TABLE_IF_NOT_EXIST) + else: + log("Ensuring 'fileresourcesaudit' table exists...") + cursor.execute(SQL_CREATE_TABLE_IF_NOT_EXIST) + log("'fileresourcesaudit' table ready.") + + +def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn, summary): + log("Querying orphaned fileresource entries (documents)...") + cur.execute(SQL_FIND_ORPHANS_DOCUMENTS) + rows = cur.fetchall() + if not rows: + log("✅ No orphaned fileresource entries found (documents).") + return + log(f"Found {len(rows)} \"document\" orphaned entries.") + process_orphan_files(rows, file_base_path, temp_file_path, dry_run, cur, conn, "document", summary) + + +def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary): + log("Querying orphaned fileresource entries (data values)...") + cur.execute(SQL_FIND_DATA_VALUES_FILE_RESOURCES) + rows = cur.fetchall() + + datavalue_uids = get_all_datavalue_uids(cur) + event_blob = get_event_blob(cur) + tracker_uids = get_all_tracker_uids(cur) + + orphan_data_values = [] + for fileresourceid, uid, storagekey, name, created in rows: + if uid in datavalue_uids: + continue # Referenced in datavalue + if uid in event_blob: + continue # Referenced in event + if uid in tracker_uids: + continue # Referenced in a tracker attribute + orphan_data_values.append((fileresourceid, storagekey, name, created)) + + if not orphan_data_values: + log("✅ No fileResources entries found (data values).") + return + + log(f"Found {len(orphan_data_values)} \"data_value\" orphaned entries.") + process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, "dataValue", summary) + + +def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, folder, summary): + count = 0 + for entry in orphan_data_values: + fileresourceid, storagekey = entry[0], entry[1] + name = entry[2] if len(entry) > 2 else "" + created = entry[3] if len(entry) > 3 else None + if not fileresourceid: + log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") + continue + matches = get_matching_files(storagekey, file_base_path, folder) + if not matches: + log(f"No files found for storage_key: {storagekey} (folder={folder})") + else: + move_files(matches, file_base_path, temp_file_path, folder, dry_run) + update_db(fileresourceid, cur, conn, dry_run) + rel_matches = [os.path.join(folder, os.path.relpath(m, os.path.join(file_base_path, folder))) for m in matches] + summary["items"].append({ + "id": fileresourceid, + "name": name, + "storagekey": storagekey, + "folder": folder, + "files": rel_matches, + "created": created.isoformat() if hasattr(created, "isoformat") else created, + "detection_date": datetime.utcnow().isoformat(), + "action": "would_move_and_delete" if dry_run else "moved_and_deleted", + "notified": False + }) + count += 1 + log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") + + +def emit_summary(summary): + mode = summary.get("mode", "TEST") + items = summary.get("items", []) + log(f"Summary ({mode}): {len(items)} fileresource entries processed.") + for item in items: + files = item.get("files") or [""] + log(f" - id={item.get('id')} name=\"{item.get('name')}\" storagekey={item.get('storagekey')} action={item.get('action')}") + for f in files: + log(f" file: {f}") + + +def run_cleanup(args): + if args.mode == "docker": + run_docker_cleanup(args) + return + + 
config = load_config(args.config) + required_keys = ["db_host", "db_port", "db_name", "db_user", "file_base_path", "temp_file_path"] + missing_keys = [k for k in required_keys if not config.get(k)] + if missing_keys: + log(f"❌ Missing required config keys: {', '.join(missing_keys)}") + sys.exit(1) + + db_password = os.environ.get("DB_PASSWORD_FG") + if not db_password: + log("❌ Missing required environment variable: DB_PASSWORD_FG") + sys.exit(1) + + db_url = f"postgresql://{config['db_user']}:{db_password}@{config['db_host']}:{config['db_port']}/{config['db_name']}" + file_base_path = config["file_base_path"] + temp_file_path = config["temp_file_path"] + + if not os.path.isdir(file_base_path) or not os.path.isdir(temp_file_path): + log("❌ Both 'file_base_path' and 'temp_file_path' must exist and be directories.") + log(f" file_base_path: {file_base_path} -> {'OK' if os.path.isdir(file_base_path) else 'INVALID'}") + log(f" temp_file_path: {temp_file_path} -> {'OK' if os.path.isdir(temp_file_path) else 'INVALID'}") + sys.exit(1) + + if args.force and args.test: + log("❌ Choose either --force or --test, not both.") + sys.exit(1) + + dry_run = not args.force + log(f"{'Running in TEST mode' if dry_run else 'Running in FORCE mode'}") + log(f"Connecting to database: {db_url}") + log(f"File base path: {file_base_path}") + log(f"Temporary move path: {temp_file_path}") + + summary = {"mode": "TEST" if dry_run else "FORCE", "items": []} + + with psycopg2.connect(dsn=db_url) as conn: + with conn.cursor() as cur: + ensure_audit_table_exists(cur, dry_run) + remove_documents(file_base_path, temp_file_path, dry_run, cur, conn, summary) + remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary) + + if args.csv_path: + overwrite_csv = bool(args.force) and not args.maintain_csv + items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) + append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) + emit_summary(summary) + + +def run_docker_cleanup(args): + if not args.docker_instance: + log("❌ --docker-instance is required when --mode=docker") + sys.exit(1) + script_path = os.path.join(os.path.dirname(__file__), "file_garbage_remover_docker.py") + if not os.path.isfile(script_path): + log(f"❌ Docker cleanup script not found at {script_path}") + sys.exit(1) + cmd = [sys.executable, script_path, "--instance", args.docker_instance] + log(f"Running docker cleanup via: {' '.join(cmd)}") + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as e: + log(f"❌ Docker cleanup failed: {e}") + sys.exit(e.returncode) diff --git a/DHIS2/file_garbage_remover/csv_utils.py b/DHIS2/file_garbage_remover/csv_utils.py new file mode 100644 index 00000000..12b1cd8b --- /dev/null +++ b/DHIS2/file_garbage_remover/csv_utils.py @@ -0,0 +1,85 @@ +import csv +import os + +DEFAULT_FIELDNAMES = ["id", "name", "created", "detection_date", "storagekey", "folder", "action", "files", "notified"] + + +def serialize_item(item): + return { + "id": item.get("id"), + "name": item.get("name"), + "created": item.get("created"), + "detection_date": item.get("detection_date"), + "storagekey": item.get("storagekey"), + "folder": item.get("folder"), + "action": item.get("action"), + "files": "|".join(item.get("files") or []), + "notified": str(item.get("notified", False)).lower(), + } + + +def append_items(csv_path, items, overwrite=False, log=None): + """ + Append serialized items to CSV. If overwrite=True, start fresh. 
+ """ + mode = "w" if overwrite else "a" + file_exists = os.path.isfile(csv_path) + want_header = overwrite or not file_exists + try: + with open(csv_path, mode, newline="") as f: + writer = csv.DictWriter(f, fieldnames=DEFAULT_FIELDNAMES) + if want_header: + writer.writeheader() + for item in items: + writer.writerow(serialize_item(item)) + if log: + log(f"CSV summary {'overwritten' if overwrite else 'appended'} at {csv_path}") + except Exception as e: + if log: + log(f"❌ Failed to write CSV {csv_path}: {e}") + else: + raise + + +def read_rows(csv_path): + with open(csv_path, newline="") as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames or DEFAULT_FIELDNAMES + rows = list(reader) + return rows, fieldnames + + +def write_rows(csv_path, rows, fieldnames=None): + fieldnames = fieldnames or DEFAULT_FIELDNAMES + with open(csv_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + +def deduplicate_items(csv_path, new_items, unique_keys=("id",), overwrite=False, log=None): + """ + Returns new_items filtered to avoid duplicates with existing CSV based on unique_keys. + If overwrite=True, no filtering is applied (CSV will be rewritten). + """ + if overwrite or not os.path.isfile(csv_path): + return new_items + + existing_keys = set() + try: + rows, _ = read_rows(csv_path) + for row in rows: + key = tuple((row.get(k) or "").strip() for k in unique_keys) + existing_keys.add(key) + except Exception as e: + if log: + log(f"⚠️ Could not read existing CSV for deduplication: {e}") + + filtered = [] + for item in new_items: + key = tuple((str(item.get(k)) or "").strip() for k in unique_keys) + if key in existing_keys: + continue + existing_keys.add(key) + filtered.append(item) + return filtered diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index 97660547..ced18f32 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import argparse +import csv import json import os import re @@ -10,7 +11,7 @@ import psycopg2 SQL_FIND_ORPHANS_DOCUMENTS = """ - SELECT fileresourceid, storagekey + SELECT fileresourceid, storagekey, name, created FROM fileresource fr WHERE NOT EXISTS ( SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid @@ -22,7 +23,7 @@ """ SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ - SELECT fileresourceid, uid, storagekey + SELECT fileresourceid, uid, storagekey, name, created FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; """ @@ -173,10 +174,11 @@ def ensure_audit_table_exists(cursor, dry_run=False): def main(): parser = argparse.ArgumentParser(description="Move orphaned DHIS2 file resources and archive DB entries.") - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument("--test", action="store_true", help="Run in dry-run mode (no changes).") - group.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") + parser.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") + parser.add_argument("--test", action="store_true", help="Run in dry-run mode (no changes). 
Default if --force not provided.") parser.add_argument("--config", required=True, help="Path to config.json file.") + parser.add_argument("--csv-path", help="Optional CSV file to record processed entries. In --force mode the file is rewritten unless --maintain-csv.") + parser.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") args = parser.parse_args() config = load_config(args.config) @@ -202,20 +204,31 @@ def main(): log(f" temp_file_path: {temp_file_path} -> {'OK' if os.path.isdir(temp_file_path) else 'INVALID'}") sys.exit(1) - dry_run = args.test + if args.force and args.test: + log("❌ Choose either --force or --test, not both.") + sys.exit(1) + + dry_run = not args.force # default to test mode unless --force is provided log(f"{'Running in TEST mode' if dry_run else 'Running in FORCE mode'}") log(f"Connecting to database: {db_url}") log(f"File base path: {file_base_path}") log(f"Temporary move path: {temp_file_path}") + summary = {"mode": "TEST" if dry_run else "FORCE", "items": []} + with psycopg2.connect(dsn=db_url) as conn: with conn.cursor() as cur: ensure_audit_table_exists(cur, dry_run) - remove_documents(file_base_path, temp_file_path, dry_run, cur, conn) - remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn) + remove_documents(file_base_path, temp_file_path, dry_run, cur, conn, summary) + remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary) + if args.csv_path: + overwrite_csv = bool(args.force) and not args.maintain_csv + write_csv(args.csv_path, summary, overwrite=overwrite_csv) + emit_summary(summary) -def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn): + +def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn, summary): log("Querying orphaned fileresource entries...") cur.execute(SQL_FIND_ORPHANS_DOCUMENTS) rows = cur.fetchall() @@ -225,10 +238,10 @@ def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn): return log(f"Found {len(rows)} \"document\" orphaned entries.") - process_orphan_files(rows, file_base_path, temp_file_path, dry_run, cur, conn, "document") + process_orphan_files(rows, file_base_path, temp_file_path, dry_run, cur, conn, "document", summary) -def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn): +def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary): log("Querying orphaned fileresource entries...") cur.execute(SQL_FIND_DATA_VALUES_FILE_RESOURCES) rows = cur.fetchall() @@ -241,36 +254,52 @@ def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn): tracker_uids = get_all_tracker_uids(cur) # 3. 
Filter out referenced file resources orphan_data_values = [] - for fileresourceid, uid, storagekey in rows: + for fileresourceid, uid, storagekey, name, created in rows: if uid in datavalue_uids: continue # Referenced in datavalue if uid in event_blob: continue # Referenced in event if uid in tracker_uids: continue # referenced in a trackerentityattribute - orphan_data_values.append((fileresourceid, storagekey)) + orphan_data_values.append((fileresourceid, storagekey, name, created)) if not orphan_data_values: log("✅ No fileResources entries found.") return log(f"Found {len(orphan_data_values)} \"data_value\" orphaned entries.") - process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, "dataValue") + process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, "dataValue", summary) -def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, folder): +def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, folder, summary): count = 0 - for fileresourceid, storagekey in orphan_data_values: + for entry in orphan_data_values: + # entries may come as (id, storagekey, name) + fileresourceid, storagekey = entry[0], entry[1] + name = entry[2] if len(entry) > 2 else "" + created = entry[3] if len(entry) > 3 else None if not fileresourceid: log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") continue try: - matches = get_matching_files(storagekey, file_base_path, "dataValue") + matches = get_matching_files(storagekey, file_base_path, folder) if not matches: - log(f"No files found for storage_key: {storagekey}") - continue - move_files(matches, file_base_path, temp_file_path, folder, dry_run) + log(f"No files found for storage_key: {storagekey} (folder={folder})") + else: + move_files(matches, file_base_path, temp_file_path, folder, dry_run) + # Always update the db also if the files not existed on disk update_db(fileresourceid, cur, conn, dry_run) + rel_matches = [os.path.join(folder, os.path.relpath(m, os.path.join(file_base_path, folder))) for m in matches] + summary["items"].append({ + "id": fileresourceid, + "name": name, + "storagekey": storagekey, + "folder": folder, + "files": rel_matches, + "created": created.isoformat() if hasattr(created, "isoformat") else created, + "action": "would_move_and_delete" if dry_run else "moved_and_deleted", + "notified": False + }) except Exception as e: log(f"❌ Aborting due to error with fileresourceid {fileresourceid}: {e}") sys.exit(1) @@ -278,5 +307,56 @@ def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") +def emit_summary(summary): + mode = summary.get("mode", "TEST") + items = summary.get("items", []) + log(f"Summary ({mode}): {len(items)} fileresource entries processed.") + for item in items: + files = item.get("files") or [""] + log(f" - id={item.get('id')} name=\"{item.get('name')}\" storagekey={item.get('storagekey')} action={item.get('action')}") + for f in files: + log(f" file: {f}") + + +def write_csv(csv_path, summary, overwrite=False): + if not csv_path: + return + file_exists = os.path.isfile(csv_path) + mode = "w" if overwrite else "a" + want_header = overwrite or not file_exists + fieldnames = ["id", "name", "created", "storagekey", "folder", "action", "files", "notified"] + existing_ids = set() + if not overwrite and file_exists: + try: + with 
open(csv_path, "r", newline="") as f: + reader = csv.DictReader(f) + for row in reader: + if "id" in row and row["id"]: + existing_ids.add(str(row["id"])) + except Exception as e: + log(f"⚠️ Could not read existing CSV for deduplication: {e}") + try: + with open(csv_path, mode, newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + if want_header: + writer.writeheader() + for item in summary.get("items", []): + if str(item.get("id")) in existing_ids: + continue + writer.writerow({ + "id": item.get("id"), + "name": item.get("name"), + "created": item.get("created"), + "storagekey": item.get("storagekey"), + "folder": item.get("folder"), + "action": item.get("action"), + "files": "|".join(item.get("files") or []), + "notified": str(item.get("notified", False)).lower(), + }) + log(f"CSV summary {'overwritten' if overwrite else 'appended'} at {csv_path}") + except Exception as e: + log(f"❌ Failed to write CSV {csv_path}: {e}") + + if __name__ == "__main__": main() diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py new file mode 100644 index 00000000..f44e2233 --- /dev/null +++ b/DHIS2/file_garbage_remover/main.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +import argparse +import sys + +import cleaner +from notifier import mark_notified, pending_from_csv, send_notification + + +def build_parser(): + parser = argparse.ArgumentParser(description="DHIS2 file garbage remover and notifier.") + subparsers = parser.add_subparsers(dest="command", required=True) + + cleanup = subparsers.add_parser("cleanup", help="Move/cleanup orphaned file resources and archive DB entries.") + cleanup.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") + cleanup.add_argument("--test", action="store_true", help="Run in dry-run mode (default unless --force).") + cleanup.add_argument("--config", required=True, help="Path to config.json file.") + cleanup.add_argument("--csv-path", help="Optional CSV file to record processed entries.") + cleanup.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") + cleanup.add_argument("--mode", choices=["tomcat", "docker"], default="tomcat", help="Cleanup mode: tomcat (default) or docker.") + cleanup.add_argument("--docker-instance", help="d2-docker instance name (required for --mode=docker).") + + notify = subparsers.add_parser("notify", help="Print recent, non-notified filenames from CSV and mark them notified.") + notify.add_argument("--csv-path", required=True, help="Path to the CSV generated by cleanup.") + notify.add_argument("--webhook-url", help="Webhook URL to send the notification.") + notify.add_argument("--title", help="Notification title (required if --webhook-url).") + notify.add_argument("--http-proxy", help="HTTP proxy (optional).") + notify.add_argument("--https-proxy", help="HTTPS proxy (optional).") + + return parser + + +def main(): + parser = build_parser() + args = parser.parse_args() + + if args.command == "cleanup": + cleaner.run_cleanup(args) + return + + if args.command == "notify": + names, ids, fieldnames, rows = pending_from_csv(args.csv_path) + if not names: + sys.exit(1) + content = "\n".join(names) + if args.webhook_url: + if not args.title: + print("❌ --title is required when using --webhook-url", file=sys.stderr) + sys.exit(1) + try: + send_notification( + webhook_url=args.webhook_url, + title=args.title, + content=content, + http_proxy=args.http_proxy, + https_proxy=args.https_proxy, + ) + except 
Exception as e: + print(f"❌ Failed to send notification: {e}", file=sys.stderr) + sys.exit(1) + else: + for name in names: + print(name) + mark_notified(args.csv_path, rows, fieldnames, ids) + sys.exit(0) + + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/DHIS2/file_garbage_remover/notifier.py b/DHIS2/file_garbage_remover/notifier.py new file mode 100644 index 00000000..870f0f52 --- /dev/null +++ b/DHIS2/file_garbage_remover/notifier.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +import os +import sys +import requests + +from csv_utils import read_rows, write_rows + + +def pending_from_csv(csv_path): + rows, fieldnames = read_rows(csv_path) + + notify_ids = set() + names_to_print = [] + for row in rows: + notified = str(row.get("notified", "")).lower() == "true" + if notified: + continue + name = row.get("name", "") + if name: + names_to_print.append(name) + notify_ids.add(str(row.get("id"))) + + return names_to_print, notify_ids, fieldnames, rows + + +def mark_notified(csv_path, rows, fieldnames, notify_ids): + if not notify_ids: + return + for row in rows: + if str(row.get("id")) in notify_ids: + row["notified"] = "true" + write_rows(csv_path, rows, fieldnames=fieldnames) + + +def send_notification(webhook_url, title, content, http_proxy=None, https_proxy=None): + proxies = { + "http": http_proxy or os.getenv("http_proxy", ""), + "https": https_proxy or os.getenv("https_proxy", "") + } + payload = {"text": f"**{title}**\n{content}"} + response = requests.post( + webhook_url, + json=payload, + headers={"Content-Type": "application/json"}, + proxies=proxies, + verify=True, + timeout=10, + ) + response.raise_for_status() + return response.status_code + + +def main(): + if len(sys.argv) < 2: + sys.exit(1) + csv_path = sys.argv[1] + days = int(sys.argv[2]) if len(sys.argv) > 2 else None + names, ids, fieldnames, rows = pending_from_csv(csv_path, days) + if not names: + sys.exit(1) + for name in names: + print(name) + mark_notified(csv_path, rows, fieldnames, ids) + sys.exit(0) + + +if __name__ == "__main__": + main() From 1b712281dc5bd939baf17d7a47db2dbb7e13fe05 Mon Sep 17 00:00:00 2001 From: idelcano Date: Wed, 26 Nov 2025 11:29:37 +0100 Subject: [PATCH 14/23] improve script fixing error, adding notify, improve workflow and fix docker remover --- DHIS2/file_garbage_remover/Readme.md | 32 ++- DHIS2/file_garbage_remover/cleaner.py | 6 + .../file_garbage_remover_docker.py | 259 +++++++++++++----- DHIS2/file_garbage_remover/main.py | 137 ++++++--- 4 files changed, 325 insertions(+), 109 deletions(-) diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md index 886a8f5c..0c2c627d 100644 --- a/DHIS2/file_garbage_remover/Readme.md +++ b/DHIS2/file_garbage_remover/Readme.md @@ -5,6 +5,36 @@ These scripts identify files with no database references and take appropriate ac Included Scripts +## main.py (recommended entry point) + +Run cleanup (tomcat or docker) and optionally send a notification in one command. Dry-run is the default unless you add `--force`. + +### Example (docker cleanup + notify using webhook from config.json) +``` +python3 main.py \ + --mode docker \ + --docker-instance docker.eyeseetea.com/widpit/dhis2-data:2.41-widp-dev-test \ + --csv-path /tmp/fg_docker.csv \ + --config /path/to/config.json \ + --notify-title "Orphans widp-dev-test" +``` +- Add `--force` to delete/move for real (tomcat) or delete in container (docker). +- Add `--notify-test` to print the payload instead of sending. 
+- To notify only (no cleanup): `python3 main.py --notify-only --csv-path /tmp/fg.csv --config /path/to/config.json --notify-title "..." [--notify-test]` + +config.json needs the cleanup settings plus the webhook (if you want notifications): +``` +{ + "db_host": "", + "db_port": "", + "db_name": "", + "db_user": "", + "file_base_path": "", + "temp_file_path": "", + "webhook-url": "https://your.webhook.url" +} +``` + ## file_garbage_remover_tomcat.py ### Description: @@ -85,4 +115,4 @@ Deletes identified files directly inside the container. Production Environments: Always use --test mode before using --force to verify intended changes. -Docker/Test Environments: Files deleted by file_garbage_remover_docker.py cannot be recovered. \ No newline at end of file +Docker/Test Environments: Files deleted by file_garbage_remover_docker.py cannot be recovered. diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index 2e7ccd37..eeb86436 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -293,6 +293,12 @@ def run_docker_cleanup(args): log(f"❌ Docker cleanup script not found at {script_path}") sys.exit(1) cmd = [sys.executable, script_path, "--instance", args.docker_instance] + if args.csv_path: + cmd.extend(["--csv-path", args.csv_path]) + if args.maintain_csv: + cmd.append("--maintain-csv") + if args.force: + cmd.append("--force") log(f"Running docker cleanup via: {' '.join(cmd)}") try: subprocess.run(cmd, check=True) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index ff8e1fea..3075e232 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -1,18 +1,16 @@ #!/usr/bin/env python3 +import argparse +import os import subprocess import sys -import argparse - +import tempfile +from datetime import datetime -LOCAL_SQL_FILE = "/tmp/find_orphan_files.sql" -LOCAL_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" -REMOTE_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" -FILE_BASE_PATH = "/DHIS2_home/files" +from csv_utils import append_items, deduplicate_items - -SQL_QUERY = """ - SELECT storagekey +SQL_FIND_ORPHANS_DOCUMENTS = """ + SELECT fileresourceid, storagekey, name, created FROM fileresource fr WHERE NOT EXISTS ( SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid @@ -20,79 +18,206 @@ AND fr.uid NOT IN ( SELECT url FROM document ) - AND fr.domain = 'DOCUMENT'; + AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; """ +SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ + SELECT fileresourceid, uid, storagekey, name, created + FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; +""" -def run(command, check=True, capture=False): - """Run a shell command.""" - print(f"$ {' '.join(command)}") - return subprocess.run(command, check=check, capture_output=capture, text=True) - - -def find_container_id(name_pattern): - """Find the container ID based on a name substring. 
Retrieves first match only""" - result = run(["docker", "ps", "--format", "{{.ID}}", "-f", f"name={name_pattern}"], capture=True) - lines = result.stdout.strip().splitlines() - return lines[0] if lines else None - - -def slugify(instance_name): - """Convert instance name to container name slug.""" - return "d2-docker-" + instance_name.replace("/", "-").replace(":", "-").replace(".", "-") - +SQL_EVENT_FILE_UIDS = """ + SELECT eventdatavalues + FROM event + WHERE programstageid + IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid + IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; +""" -def main(): - parser = argparse.ArgumentParser(description="Remove orphaned DHIS2 file resources from Core container.") - parser.add_argument("-i", "--instance", required=True, help="d2-docker instance name (e.g. /usr/bin/python3 file_garbage_remover.py -i docker.eyeseetea.com/widpit/dhis2-data:2.42-widp-preprod-cont-indiv)") - args = parser.parse_args() +SQL_DATA_VALUE_UIDS = """ + SELECT dv.value + FROM datavalue dv + WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') +""" - instance_name = args.instance - container_slug = slugify(instance_name) - core_container_match = container_slug.replace("dhis2-data-", "") + "-core-1" +SQL_TRACKER_ATTRIBUTE_UIDS = """ +select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); +""" - print("Writing SQL file...") - with open(LOCAL_SQL_FILE, "w") as f: - f.write(SQL_QUERY) +SQL_INSERT_AUDIT = """ + INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = {fid}; +""" - print("Running SQL with d2-docker and capturing output...") - result = run(["d2-docker", "run-sql", "-i", instance_name, LOCAL_SQL_FILE], capture=True) +SQL_DELETE_ORIGINAL = """ + DELETE FROM fileresource WHERE fileresourceid = {fid}; +""" - print("Saving result to file...") - with open(LOCAL_LIST_FILE, "w") as f: - for line in result.stdout.strip().splitlines(): - line = line.strip() - if line and not line.lower().startswith("storagekey"): - f.write(f"{line}\n") +SQL_CREATE_TABLE_IF_NOT_EXIST = """ + CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; +""" - print("Identifying containers...") - core_container = find_container_id(core_container_match) - print(core_container) - print(f"Core container: {core_container}") +def run(cmd, capture=False): + kwargs = {"check": True} + if capture: + kwargs["stdout"] = subprocess.PIPE + kwargs["stderr"] = subprocess.PIPE + kwargs["text"] = True + return subprocess.run(cmd, **kwargs) - print("Copying file list to Core container...") - run(["docker", "cp", LOCAL_LIST_FILE, f"{core_container}:{REMOTE_LIST_FILE}"]) - print(f"Deleting orphaned files in Core container... {core_container}") - delete_cmd = f""" - bash -c ' - if [ ! 
-f "{REMOTE_LIST_FILE}" ]; then - echo "File list not found: {REMOTE_LIST_FILE}" - exit 1 - fi +def find_container_id(instance): + slug = "d2-docker-" + instance.replace("/", "-").replace(":", "-").replace(".", "-") + core = slug.replace("dhis2-data-", "") + "-core-1" + result = run(["docker", "ps", "--format", "{{.ID}}", "-f", f"name={core}"], capture=True) + lines = result.stdout.strip().splitlines() + if not lines: + return None + return lines[0] + + +def run_sql(instance, sql): + with tempfile.NamedTemporaryFile("w", delete=False) as f: + f.write(sql) + tmp_path = f.name + try: + result = run(["d2-docker", "run-sql", "-i", instance, tmp_path], capture=True) + return result.stdout + finally: + try: + os.remove(tmp_path) + except OSError: + pass + + +def parse_tab_rows(output, expected_cols): + rows = [] + for line in output.strip().splitlines(): + if not line or line.lower().startswith("fileresourceid") or line.lower().startswith("value") or line.lower().startswith("eventdatavalues"): + continue + if line.startswith("(") and "row" in line: + continue + parts = [p.strip() for p in line.split("|")] + if len(parts) < expected_cols: + continue + rows.append(parts) + return rows + + +def get_orphan_documents(instance): + out = run_sql(instance, SQL_FIND_ORPHANS_DOCUMENTS) + rows = [] + for fid, storagekey, name, created in parse_tab_rows(out, 4): + rows.append({ + "id": fid, + "name": name, + "storagekey": storagekey, + "folder": "document", + "files": [], + "created": created, + "detection_date": datetime.utcnow().isoformat(), + "action": "would_move_and_delete", + "notified": False, + }) + return rows + + +def get_orphan_datavalues(instance): + data_rows = parse_tab_rows(run_sql(instance, SQL_FIND_DATA_VALUES_FILE_RESOURCES), 5) + datavalue_uids = set(r[0] for r in parse_tab_rows(run_sql(instance, SQL_DATA_VALUE_UIDS), 1)) + tracker_uids = set(r[0] for r in parse_tab_rows(run_sql(instance, SQL_TRACKER_ATTRIBUTE_UIDS), 1)) + event_blob = "\n".join(r[0] for r in parse_tab_rows(run_sql(instance, SQL_EVENT_FILE_UIDS), 1)) + + orphans = [] + for fid, uid, storagekey, name, created in data_rows: + if uid in datavalue_uids: + continue + if uid in event_blob: + continue + if uid in tracker_uids: + continue + orphans.append({ + "id": fid, + "name": name, + "storagekey": storagekey, + "folder": "dataValue", + "files": [], + "created": created, + "detection_date": datetime.utcnow().isoformat(), + "action": "would_move_and_delete", + "notified": False, + }) + return orphans + + +def delete_files(container_id, row, dry_run): + base = "/DHIS2_home/files" + storagekey = row["storagekey"] + folder = row["folder"] + prefix = os.path.join(base, folder, storagekey.replace(f"{folder}/", "")) + cmd = ["docker", "exec", container_id, "bash", "-c", f"rm -v {prefix}*"] + if dry_run: + print(f"[DRY RUN] {' '.join(cmd)}") + else: + try: + run(cmd) + except subprocess.CalledProcessError as e: + print(f"⚠️ File deletion failed (likely missing): {' '.join(cmd)} -> {e}") + # Continue with DB cleanup and CSV logging even if file is already gone. 
+ + +def ensure_audit_table(instance): + run_sql(instance, SQL_CREATE_TABLE_IF_NOT_EXIST) + + +def archive_and_delete(instance, fileresourceid): + fid = int(fileresourceid) + sql = SQL_INSERT_AUDIT.format(fid=fid) + SQL_DELETE_ORIGINAL.format(fid=fid) + run_sql(instance, sql) - while IFS= read -r key; do - fullpath="{FILE_BASE_PATH}/$key" - echo "Deleting files: $fullpath*" - rm -v "$fullpath"* 2>/dev/null || echo "Nothing to delete for $fullpath" - done < "{REMOTE_LIST_FILE}" - ' - """ - run(["docker", "exec", core_container, "bash", "-c", delete_cmd]) +def main(): + parser = argparse.ArgumentParser(description="Find/delete orphan file resources in d2-docker instance and log to CSV.") + parser.add_argument("--instance", required=True, help="d2-docker instance name (e.g. docker.eyeseetea.com/project/dhis2-data:2.41-test)") + parser.add_argument("--csv-path", help="CSV file to log orphan entries.") + parser.add_argument("--force", action="store_true", help="Actually delete files in container.") + parser.add_argument("--maintain-csv", action="store_true", help="Append to CSV in force mode (do not overwrite).") + args = parser.parse_args() - print("Cleanup complete.") + container_id = find_container_id(args.instance) + if not container_id: + print("❌ Could not find core container for instance", file=sys.stderr) + sys.exit(1) + + dry_run = not args.force + print(f"Running in {'DRY-RUN' if dry_run else 'FORCE'} mode against container {container_id}") + + ensure_audit_table(args.instance) + + rows = [] + rows.extend(get_orphan_documents(args.instance)) + rows.extend(get_orphan_datavalues(args.instance)) + print(f"Found {len(rows)} orphan entries") + + for row in rows: + delete_files(container_id, row, dry_run) + if dry_run: + print(f"[DRY RUN] Skipping DB archive/delete for {row['id']} ({row.get('name', '')}) action={row.get('action')}") + continue + try: + archive_and_delete(args.instance, row["id"]) + row["action"] = "moved_and_deleted" + except Exception as e: + print(f"❌ Failed DB archive/delete for {row['id']}: {e}", file=sys.stderr) + if not dry_run: + sys.exit(1) + + if args.csv_path: + overwrite_csv = bool(args.force) and not args.maintain_csv + items = deduplicate_items(args.csv_path, rows, unique_keys=("id", "name"), overwrite=overwrite_csv) + append_items(args.csv_path, items, overwrite=overwrite_csv) + + print(f"Summary: processed {len(rows)} orphan entries") if __name__ == "__main__": diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py index f44e2233..09896b83 100644 --- a/DHIS2/file_garbage_remover/main.py +++ b/DHIS2/file_garbage_remover/main.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import argparse +import json import sys import cleaner @@ -8,63 +9,117 @@ def build_parser(): parser = argparse.ArgumentParser(description="DHIS2 file garbage remover and notifier.") - subparsers = parser.add_subparsers(dest="command", required=True) - - cleanup = subparsers.add_parser("cleanup", help="Move/cleanup orphaned file resources and archive DB entries.") - cleanup.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") - cleanup.add_argument("--test", action="store_true", help="Run in dry-run mode (default unless --force).") - cleanup.add_argument("--config", required=True, help="Path to config.json file.") - cleanup.add_argument("--csv-path", help="Optional CSV file to record processed entries.") - cleanup.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") - 
cleanup.add_argument("--mode", choices=["tomcat", "docker"], default="tomcat", help="Cleanup mode: tomcat (default) or docker.") - cleanup.add_argument("--docker-instance", help="d2-docker instance name (required for --mode=docker).") - - notify = subparsers.add_parser("notify", help="Print recent, non-notified filenames from CSV and mark them notified.") - notify.add_argument("--csv-path", required=True, help="Path to the CSV generated by cleanup.") - notify.add_argument("--webhook-url", help="Webhook URL to send the notification.") - notify.add_argument("--title", help="Notification title (required if --webhook-url).") - notify.add_argument("--http-proxy", help="HTTP proxy (optional).") - notify.add_argument("--https-proxy", help="HTTPS proxy (optional).") + # Cleanup options + parser.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") + parser.add_argument("--test", action="store_true", help="Run in dry-run mode (default unless --force).") + parser.add_argument("--config", help="Path to config.json file. Required unless --notify-only.") + parser.add_argument("--csv-path", help="CSV file to record processed entries or to read notifications from.") + parser.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") + parser.add_argument("--mode", choices=["tomcat", "docker"], default="tomcat", help="Cleanup mode: tomcat (default) or docker.") + parser.add_argument("--docker-instance", help="d2-docker instance name (required for --mode=docker).") + + # Notification options + parser.add_argument("--notify-only", action="store_true", help="Skip cleanup and only send notifications from CSV.") + parser.add_argument("--notify-webhook-url", help="Webhook URL to send the notification. 
If not provided, falls back to config.json key 'webhook-url'.") + parser.add_argument("--notify-title", help="Notification title (required if using --notify-webhook-url).") + parser.add_argument("--notify-http-proxy", help="HTTP proxy (optional).") + parser.add_argument("--notify-https-proxy", help="HTTPS proxy (optional).") + parser.add_argument("--notify-test", action="store_true", help="Dry-run notification: print payload instead of sending.") return parser -def main(): - parser = build_parser() - args = parser.parse_args() +def resolve_webhook(config_path, cli_webhook): + if cli_webhook: + return cli_webhook + if not config_path: + return None + try: + cfg = cleaner.load_config(config_path) + except Exception as e: + print(f"❌ Failed to load config file: {e}", file=sys.stderr) + sys.exit(1) + return cfg.get("webhook-url") or cfg.get("webhook_url") - if args.command == "cleanup": - cleaner.run_cleanup(args) - return - if args.command == "notify": - names, ids, fieldnames, rows = pending_from_csv(args.csv_path) - if not names: +def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, http_proxy=None, https_proxy=None, notify_test=False): + names, ids, fieldnames, rows = pending_from_csv(csv_path) + if not names: + return + content = "\n".join(names) + webhook = resolve_webhook(config_path, webhook_url) + if notify_test: + print(f"[TEST] Would send notification to {webhook or ''}") + print(f"[TEST] Title: {title or ''}") + print(f"[TEST] Content:\n{content}") + if webhook: + if not title: + print("❌ --notify-title is required when using --notify-webhook-url", file=sys.stderr) sys.exit(1) - content = "\n".join(names) - if args.webhook_url: - if not args.title: - print("❌ --title is required when using --webhook-url", file=sys.stderr) - sys.exit(1) + if not notify_test: try: send_notification( - webhook_url=args.webhook_url, - title=args.title, + webhook_url=webhook, + title=title, content=content, - http_proxy=args.http_proxy, - https_proxy=args.https_proxy, + http_proxy=http_proxy, + https_proxy=https_proxy, ) except Exception as e: print(f"❌ Failed to send notification: {e}", file=sys.stderr) sys.exit(1) - else: - for name in names: - print(name) - mark_notified(args.csv_path, rows, fieldnames, ids) + else: + for name in names: + print(name) + mark_notified(csv_path, rows, fieldnames, ids) + + +def main(): + parser = build_parser() + args = parser.parse_args() + + # Notify-only path + if args.notify_only: + if not args.csv_path: + print("❌ --csv-path is required for notify-only mode", file=sys.stderr) + sys.exit(1) + run_notify_flow( + csv_path=args.csv_path, + config_path=args.config, + webhook_url=args.notify_webhook_url, + title=args.notify_title, + http_proxy=args.notify_http_proxy, + https_proxy=args.notify_https_proxy, + notify_test=args.notify_test, + ) sys.exit(0) - parser.print_help() - sys.exit(1) + # Cleanup path (default) + if not args.config: + print("❌ --config is required to run cleanup", file=sys.stderr) + sys.exit(1) + cleaner.run_cleanup(args) + + wants_notify = any([ + args.notify_webhook_url, + args.notify_title, + args.notify_http_proxy, + args.notify_https_proxy, + args.notify_test, + ]) + if wants_notify: + if not args.csv_path: + print("❌ --csv-path is required to send notifications after cleanup", file=sys.stderr) + sys.exit(1) + run_notify_flow( + csv_path=args.csv_path, + config_path=args.config, + webhook_url=args.notify_webhook_url, + title=args.notify_title, + http_proxy=args.notify_http_proxy, + https_proxy=args.notify_https_proxy, + 
notify_test=args.notify_test, + ) if __name__ == "__main__": From 4a4a365b5608af1c00b12141f4d44da83436b638 Mon Sep 17 00:00:00 2001 From: idelcano Date: Wed, 26 Nov 2025 12:26:15 +0100 Subject: [PATCH 15/23] refactor sql --- DHIS2/file_garbage_remover/cleaner.py | 57 ++++-------------- .../file_garbage_remover_docker.py | 59 ++++--------------- DHIS2/file_garbage_remover/sql_queries.py | 46 +++++++++++++++ 3 files changed, 67 insertions(+), 95 deletions(-) create mode 100644 DHIS2/file_garbage_remover/sql_queries.py diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index eeb86436..fb4a16dd 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -10,53 +10,16 @@ import psycopg2 from csv_utils import append_items, deduplicate_items - -SQL_FIND_ORPHANS_DOCUMENTS = """ - SELECT fileresourceid, storagekey, name, created - FROM fileresource fr - WHERE NOT EXISTS ( - SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid - ) - AND fr.uid NOT IN ( - SELECT url FROM document - ) - AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; -""" - -SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ - SELECT fileresourceid, uid, storagekey, name, created - FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; -""" - -SQL_INSERT_AUDIT = """ - INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; -""" - -SQL_DELETE_ORIGINAL = """ - DELETE FROM fileresource WHERE fileresourceid = %(fid)s; -""" - -SQL_CREATE_TABLE_IF_NOT_EXIST = """ - CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; -""" - -SQL_EVENT_FILE_UIDS = """ - SELECT eventdatavalues - FROM event - WHERE programstageid - IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid - IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; -""" - -SQL_DATA_VALUE_UIDS = """ - SELECT dv.value - FROM datavalue dv - WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') -""" - -SQL_TRACKER_ATTRIBUTE_UIDS = """ -select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); -""" +from sql_queries import ( + SQL_CREATE_TABLE_IF_NOT_EXIST, + SQL_DATA_VALUE_UIDS, + SQL_DELETE_ORIGINAL, + SQL_EVENT_FILE_UIDS, + SQL_FIND_DATA_VALUES_FILE_RESOURCES, + SQL_FIND_ORPHANS_DOCUMENTS, + SQL_INSERT_AUDIT, + SQL_TRACKER_ATTRIBUTE_UIDS, +) def log(message): diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index 3075e232..8c4e47b7 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -8,53 +8,16 @@ from datetime import datetime from csv_utils import append_items, deduplicate_items - -SQL_FIND_ORPHANS_DOCUMENTS = """ - SELECT fileresourceid, storagekey, name, created - FROM fileresource fr - WHERE NOT EXISTS ( - SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid - ) - AND fr.uid NOT IN ( - SELECT url FROM document - ) - AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; -""" - -SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ - SELECT fileresourceid, uid, storagekey, name, created - FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; -""" - -SQL_EVENT_FILE_UIDS = """ - SELECT eventdatavalues - FROM event - WHERE 
programstageid - IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid - IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; -""" - -SQL_DATA_VALUE_UIDS = """ - SELECT dv.value - FROM datavalue dv - WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') -""" - -SQL_TRACKER_ATTRIBUTE_UIDS = """ -select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); -""" - -SQL_INSERT_AUDIT = """ - INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = {fid}; -""" - -SQL_DELETE_ORIGINAL = """ - DELETE FROM fileresource WHERE fileresourceid = {fid}; -""" - -SQL_CREATE_TABLE_IF_NOT_EXIST = """ - CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; -""" +from sql_queries import ( + SQL_CREATE_TABLE_IF_NOT_EXIST, + SQL_DATA_VALUE_UIDS, + SQL_DELETE_ORIGINAL, + SQL_EVENT_FILE_UIDS, + SQL_FIND_DATA_VALUES_FILE_RESOURCES, + SQL_FIND_ORPHANS_DOCUMENTS, + SQL_INSERT_AUDIT, + SQL_TRACKER_ATTRIBUTE_UIDS, +) def run(cmd, capture=False): @@ -172,7 +135,7 @@ def ensure_audit_table(instance): def archive_and_delete(instance, fileresourceid): fid = int(fileresourceid) - sql = SQL_INSERT_AUDIT.format(fid=fid) + SQL_DELETE_ORIGINAL.format(fid=fid) + sql = (SQL_INSERT_AUDIT % {"fid": fid}) + (SQL_DELETE_ORIGINAL % {"fid": fid}) run_sql(instance, sql) diff --git a/DHIS2/file_garbage_remover/sql_queries.py b/DHIS2/file_garbage_remover/sql_queries.py new file mode 100644 index 00000000..bb6e82e5 --- /dev/null +++ b/DHIS2/file_garbage_remover/sql_queries.py @@ -0,0 +1,46 @@ +SQL_FIND_ORPHANS_DOCUMENTS = """ + SELECT fileresourceid, storagekey, name, created + FROM fileresource fr + WHERE NOT EXISTS ( + SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid + ) + AND fr.uid NOT IN ( + SELECT url FROM document + ) + AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; +""" + +SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ + SELECT fileresourceid, uid, storagekey, name, created + FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; +""" + +SQL_INSERT_AUDIT = """ + INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +SQL_DELETE_ORIGINAL = """ + DELETE FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +SQL_CREATE_TABLE_IF_NOT_EXIST = """ + CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; +""" + +SQL_EVENT_FILE_UIDS = """ + SELECT eventdatavalues + FROM event + WHERE programstageid + IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid + IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; +""" + +SQL_DATA_VALUE_UIDS = """ + SELECT dv.value + FROM datavalue dv + WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') +""" + +SQL_TRACKER_ATTRIBUTE_UIDS = """ +select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); +""" From a96e57153cc9c2a0b0f9c937389e9ee56dbb90d7 Mon Sep 17 00:00:00 2001 From: idelcano Date: Wed, 26 Nov 2025 13:00:09 +0100 Subject: [PATCH 16/23] read proxy from config file --- DHIS2/file_garbage_remover/main.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py index 09896b83..8a1c87f3 100644 --- 
a/DHIS2/file_garbage_remover/main.py +++ b/DHIS2/file_garbage_remover/main.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 import argparse -import json import sys import cleaner @@ -30,8 +29,10 @@ def build_parser(): def resolve_webhook(config_path, cli_webhook): - if cli_webhook: - return cli_webhook + return cli_webhook or _from_config(config_path, ["webhook-url", "webhook_url"]) + + +def _from_config(config_path, keys): if not config_path: return None try: @@ -39,7 +40,14 @@ def resolve_webhook(config_path, cli_webhook): except Exception as e: print(f"❌ Failed to load config file: {e}", file=sys.stderr) sys.exit(1) - return cfg.get("webhook-url") or cfg.get("webhook_url") + for key in keys: + if key in cfg: + return cfg.get(key) + return None + + +def resolve_proxy(config_path, cli_value, keys): + return cli_value or _from_config(config_path, keys) def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, http_proxy=None, https_proxy=None, notify_test=False): @@ -48,6 +56,8 @@ def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, ht return content = "\n".join(names) webhook = resolve_webhook(config_path, webhook_url) + http_proxy = resolve_proxy(config_path, http_proxy, ["notify-http-proxy", "notify_http_proxy", "http-proxy", "http_proxy"]) + https_proxy = resolve_proxy(config_path, https_proxy, ["notify-https-proxy", "notify_https_proxy", "https-proxy", "https_proxy"]) if notify_test: print(f"[TEST] Would send notification to {webhook or ''}") print(f"[TEST] Title: {title or ''}") From 4ddd6212a2a030841bfd7204db4c890de121c5d6 Mon Sep 17 00:00:00 2001 From: idelcano Date: Thu, 27 Nov 2025 13:40:25 +0100 Subject: [PATCH 17/23] added store files in csv as already notified parameter and truncate messages --- DHIS2/file_garbage_remover/Readme.md | 14 +++++++++- DHIS2/file_garbage_remover/cleaner.py | 3 +++ .../file_garbage_remover_docker.py | 5 ++++ .../file_garbage_remover_tomcat.py | 4 +++ DHIS2/file_garbage_remover/main.py | 26 +++++++++++++------ 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md index 0c2c627d..2b344ba1 100644 --- a/DHIS2/file_garbage_remover/Readme.md +++ b/DHIS2/file_garbage_remover/Readme.md @@ -21,6 +21,16 @@ python3 main.py \ - Add `--force` to delete/move for real (tomcat) or delete in container (docker). - Add `--notify-test` to print the payload instead of sending. - To notify only (no cleanup): `python3 main.py --notify-only --csv-path /tmp/fg.csv --config /path/to/config.json --notify-title "..." [--notify-test]` +- To skip sending and save in CSV as notified: `--save-all-as-notified`. + +### Example (tomcat cleanup, dry-run, save as nnotified) +``` +DB_PASSWORD_FG=your_password python3 main.py \ + --config /path/to/config.json \ + --csv-path /tmp/fg_tomcat.csv \ + --save-all-as-notified +``` +Add `--force` to move/delete files and files in DB. 
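+
+Each processed fileresource is recorded in the CSV given by `--csv-path`. The exact columns come from `DEFAULT_FIELDNAMES` in `csv_utils.py`; a typical row looks roughly like this (values below are illustrative only):
+```
+id,name,created,storagekey,folder,action,files,notified
+4711,report.pdf,2025-01-15T09:30:00,document/ab12cd34ef5,document,would_move_and_delete,document/ab12cd34ef5.pdf,false
+```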
config.json needs the cleanup settings plus the webhook (if you want notifications): ``` @@ -31,7 +41,9 @@ config.json needs the cleanup settings plus the webhook (if you want notificatio "db_user": "", "file_base_path": "", "temp_file_path": "", - "webhook-url": "https://your.webhook.url" + "webhook-url": "https://your.webhook.url", + "notify-http-proxy": "http://openproxy.who.int:8080", + "notify-https-proxy": "http://openproxy.who.int:8080" } ``` diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index fb4a16dd..7975f86a 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -241,6 +241,9 @@ def run_cleanup(args): remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary) if args.csv_path: + if getattr(args, "save_all_as_notified", False): + for item in summary.get("items", []): + item["notified"] = True overwrite_csv = bool(args.force) and not args.maintain_csv items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index 8c4e47b7..b7f73dfb 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -145,6 +145,7 @@ def main(): parser.add_argument("--csv-path", help="CSV file to log orphan entries.") parser.add_argument("--force", action="store_true", help="Actually delete files in container.") parser.add_argument("--maintain-csv", action="store_true", help="Append to CSV in force mode (do not overwrite).") + parser.add_argument("--save-all-as-notified", action="store_true", help="Store CSV entries with notified=true even if not sent.") args = parser.parse_args() container_id = find_container_id(args.instance) @@ -162,6 +163,10 @@ def main(): rows.extend(get_orphan_datavalues(args.instance)) print(f"Found {len(rows)} orphan entries") + if args.save_all_as_notified: + for row in rows: + row["notified"] = True + for row in rows: delete_files(container_id, row, dry_run) if dry_run: diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index ced18f32..a1475370 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -179,6 +179,7 @@ def main(): parser.add_argument("--config", required=True, help="Path to config.json file.") parser.add_argument("--csv-path", help="Optional CSV file to record processed entries. 
In --force mode the file is rewritten unless --maintain-csv.") parser.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") + parser.add_argument("--save-all-as-notified", action="store_true", help="Store CSV entries with notified=true even if not sent.") args = parser.parse_args() config = load_config(args.config) @@ -223,6 +224,9 @@ def main(): remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary) if args.csv_path: + if args.save_all_as_notified: + for item in summary.get("items", []): + item["notified"] = True overwrite_csv = bool(args.force) and not args.maintain_csv write_csv(args.csv_path, summary, overwrite=overwrite_csv) emit_summary(summary) diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py index 8a1c87f3..342bef73 100644 --- a/DHIS2/file_garbage_remover/main.py +++ b/DHIS2/file_garbage_remover/main.py @@ -24,6 +24,8 @@ def build_parser(): parser.add_argument("--notify-http-proxy", help="HTTP proxy (optional).") parser.add_argument("--notify-https-proxy", help="HTTPS proxy (optional).") parser.add_argument("--notify-test", action="store_true", help="Dry-run notification: print payload instead of sending.") + parser.add_argument("--save-all-as-notified", action="store_true", help="Mark CSV entries as notified even if no notification is sent.") + parser.add_argument("--notify-max-lines", type=int, default=100, help="Maximum number of lines to include in notification content (default 100).") return parser @@ -50,11 +52,15 @@ def resolve_proxy(config_path, cli_value, keys): return cli_value or _from_config(config_path, keys) -def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, http_proxy=None, https_proxy=None, notify_test=False): +def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, http_proxy=None, https_proxy=None, notify_test=False, notify_max_lines=50): names, ids, fieldnames, rows = pending_from_csv(csv_path) - if not names: + if not names and not save_all_as_notified: return - content = "\n".join(names) + max_lines = notify_max_lines if notify_max_lines and notify_max_lines > 0 else None + truncated = names[:max_lines] if max_lines else names + content = "\n".join(truncated) + if max_lines and len(names) > max_lines: + content += f"\n... 
(truncated {len(names) - max_lines} more)" webhook = resolve_webhook(config_path, webhook_url) http_proxy = resolve_proxy(config_path, http_proxy, ["notify-http-proxy", "notify_http_proxy", "http-proxy", "http_proxy"]) https_proxy = resolve_proxy(config_path, https_proxy, ["notify-https-proxy", "notify_https_proxy", "https-proxy", "https_proxy"]) @@ -78,10 +84,10 @@ def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, ht except Exception as e: print(f"❌ Failed to send notification: {e}", file=sys.stderr) sys.exit(1) - else: - for name in names: - print(name) - mark_notified(csv_path, rows, fieldnames, ids) + # mark entries as notified + ids_to_mark = ids if not save_all_as_notified else set(str(row.get("id")) for row in rows) + if ids_to_mark: + mark_notified(csv_path, rows, fieldnames, ids_to_mark) def main(): @@ -101,6 +107,7 @@ def main(): http_proxy=args.notify_http_proxy, https_proxy=args.notify_https_proxy, notify_test=args.notify_test, + notify_max_lines=args.notify_max_lines, ) sys.exit(0) @@ -117,7 +124,9 @@ def main(): args.notify_https_proxy, args.notify_test, ]) - if wants_notify: + if args.save_all_as_notified: + return #nothing to report + elif wants_notify: if not args.csv_path: print("❌ --csv-path is required to send notifications after cleanup", file=sys.stderr) sys.exit(1) @@ -129,6 +138,7 @@ def main(): http_proxy=args.notify_http_proxy, https_proxy=args.notify_https_proxy, notify_test=args.notify_test, + notify_max_lines=args.notify_max_lines, ) From 926c0f09af3a00b41a90e80749165d426120759c Mon Sep 17 00:00:00 2001 From: idelcano Date: Thu, 27 Nov 2025 13:53:28 +0100 Subject: [PATCH 18/23] refactor duplicate code --- DHIS2/file_garbage_remover/cleaner.py | 20 +-- .../file_garbage_remover_docker.py | 4 +- .../file_garbage_remover_tomcat.py | 125 +++--------------- 3 files changed, 19 insertions(+), 130 deletions(-) diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index 7975f86a..153d4056 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -9,6 +9,7 @@ import subprocess import psycopg2 +from common import load_config, log from csv_utils import append_items, deduplicate_items from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, @@ -22,22 +23,6 @@ ) -def log(message): - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - if isinstance(message, str): - message = re.sub( - r"(postgresql://[^:]+:)([^@]+)(@)", - r"\1*****\3", - message - ) - print(f"[{timestamp}] {message}") - - -def load_config(path): - with open(path, "r") as f: - return json.load(f) - - def get_events(cursor): cursor.execute(SQL_EVENT_FILE_UIDS) return [row[0] for row in cursor.fetchall() if row[0]] @@ -242,8 +227,7 @@ def run_cleanup(args): if args.csv_path: if getattr(args, "save_all_as_notified", False): - for item in summary.get("items", []): - item["notified"] = True + mark_items_notified(summary.get("items", [])) overwrite_csv = bool(args.force) and not args.maintain_csv items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index b7f73dfb..3217241f 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -8,6 +8,7 @@ from datetime 
import datetime from csv_utils import append_items, deduplicate_items +from common import mark_items_notified from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, SQL_DATA_VALUE_UIDS, @@ -164,8 +165,7 @@ def main(): print(f"Found {len(rows)} orphan entries") if args.save_all_as_notified: - for row in rows: - row["notified"] = True + mark_items_notified(rows) for row in rows: delete_files(container_id, row, dry_run) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index a1475370..b13605ff 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -1,78 +1,24 @@ #!/usr/bin/env python3 import argparse -import csv -import json import os -import re import shutil import sys from datetime import datetime import psycopg2 -SQL_FIND_ORPHANS_DOCUMENTS = """ - SELECT fileresourceid, storagekey, name, created - FROM fileresource fr - WHERE NOT EXISTS ( - SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid - ) - AND fr.uid NOT IN ( - SELECT url FROM document - ) - AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; -""" - -SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ - SELECT fileresourceid, uid, storagekey, name, created - FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; -""" - -SQL_INSERT_AUDIT = """ - INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; -""" - -SQL_DELETE_ORIGINAL = """ - DELETE FROM fileresource WHERE fileresourceid = %(fid)s; -""" - -SQL_CREATE_TABLE_IF_NOT_EXIST = """ - CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; -""" - -SQL_EVENT_FILE_UIDS = """ - SELECT eventdatavalues - FROM event - WHERE programstageid - IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid - IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; -""" - -SQL_DATA_VALUE_UIDS = """ - SELECT dv.value - FROM datavalue dv - WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') -""" - -SQL_TRACKER_ATTRIBUTE_UIDS = """ -select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); -""" - - -def log(message): - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - if isinstance(message, str): - # Detect pattern like: postgresql://user:password@host - message = re.sub( - r"(postgresql://[^:]+:)([^@]+)(@)", - r"\1*****\3", - message - ) - print(f"[{timestamp}] {message}") - - -def load_config(path): - with open(path, "r") as f: - return json.load(f) +from common import load_config, log, mark_items_notified +from csv_utils import deduplicate_items, append_items +from sql_queries import ( + SQL_CREATE_TABLE_IF_NOT_EXIST, + SQL_DATA_VALUE_UIDS, + SQL_DELETE_ORIGINAL, + SQL_EVENT_FILE_UIDS, + SQL_FIND_DATA_VALUES_FILE_RESOURCES, + SQL_FIND_ORPHANS_DOCUMENTS, + SQL_INSERT_AUDIT, + SQL_TRACKER_ATTRIBUTE_UIDS, +) def get_events(cursor): @@ -109,7 +55,6 @@ def get_matching_files(storage_key, base_dir, folder): storage_key = storage_key.replace(folder + "/", "") base_dir = os.path.join(base_dir, folder) matching = [] - for filename in os.listdir(base_dir): if filename.startswith(storage_key): fullpath = os.path.join(base_dir, filename) @@ -225,10 +170,10 @@ def main(): if args.csv_path: if args.save_all_as_notified: - for item in summary.get("items", []): - 
item["notified"] = True + mark_items_notified(summary.get("items", [])) overwrite_csv = bool(args.force) and not args.maintain_csv - write_csv(args.csv_path, summary, overwrite=overwrite_csv) + items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) + append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) emit_summary(summary) @@ -322,45 +267,5 @@ def emit_summary(summary): log(f" file: {f}") -def write_csv(csv_path, summary, overwrite=False): - if not csv_path: - return - file_exists = os.path.isfile(csv_path) - mode = "w" if overwrite else "a" - want_header = overwrite or not file_exists - fieldnames = ["id", "name", "created", "storagekey", "folder", "action", "files", "notified"] - existing_ids = set() - if not overwrite and file_exists: - try: - with open(csv_path, "r", newline="") as f: - reader = csv.DictReader(f) - for row in reader: - if "id" in row and row["id"]: - existing_ids.add(str(row["id"])) - except Exception as e: - log(f"⚠️ Could not read existing CSV for deduplication: {e}") - try: - with open(csv_path, mode, newline="") as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - if want_header: - writer.writeheader() - for item in summary.get("items", []): - if str(item.get("id")) in existing_ids: - continue - writer.writerow({ - "id": item.get("id"), - "name": item.get("name"), - "created": item.get("created"), - "storagekey": item.get("storagekey"), - "folder": item.get("folder"), - "action": item.get("action"), - "files": "|".join(item.get("files") or []), - "notified": str(item.get("notified", False)).lower(), - }) - log(f"CSV summary {'overwritten' if overwrite else 'appended'} at {csv_path}") - except Exception as e: - log(f"❌ Failed to write CSV {csv_path}: {e}") - - if __name__ == "__main__": main() From 4f5eaccbf515e48c9425d1379e8580e038706bcc Mon Sep 17 00:00:00 2001 From: idelcano Date: Thu, 27 Nov 2025 19:12:15 +0100 Subject: [PATCH 19/23] refactor --- DHIS2/file_garbage_remover/common.py | 79 +++++++++++++++++ .../file_garbage_remover_docker.py | 47 +++------- .../file_garbage_remover_tomcat.py | 87 +++++++------------ DHIS2/file_garbage_remover/main.py | 9 +- 4 files changed, 122 insertions(+), 100 deletions(-) create mode 100644 DHIS2/file_garbage_remover/common.py diff --git a/DHIS2/file_garbage_remover/common.py b/DHIS2/file_garbage_remover/common.py new file mode 100644 index 00000000..8e2d9ae4 --- /dev/null +++ b/DHIS2/file_garbage_remover/common.py @@ -0,0 +1,79 @@ +import json +import os +import re +from datetime import datetime + + +def log(message): + """Simple logger with timestamp and DB URL password masking.""" + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if isinstance(message, str): + message = re.sub( + r"(postgresql://[^:]+:)([^@]+)(@)", + r"\1*****\3", + message + ) + print(f"[{timestamp}] {message}") + + +def load_config(path): + with open(path, "r") as f: + return json.load(f) + + +def mark_items_notified(items): + for item in items: + item["notified"] = True + + +def emit_summary(items, mode, log_fn=print): + log_fn(f"Summary ({mode}): {len(items)} fileresource entries processed.") + for item in items: + files = item.get("files") or [""] + log_fn(f" - id={item.get('id')} name=\"{item.get('name')}\" storagekey={item.get('storagekey')} action={item.get('action')}") + for f in files: + log_fn(f" file: {f}") + + +def _iso(value): + return value.isoformat() if hasattr(value, "isoformat") else value + + +def build_document_items(raw_rows): + 
items = [] + for fid, storagekey, name, created in raw_rows: + items.append({ + "id": fid, + "name": name, + "storagekey": storagekey, + "folder": "document", + "files": [], + "created": _iso(created), + "detection_date": datetime.utcnow().isoformat(), + "action": "would_move_and_delete", + "notified": False, + }) + return items + + +def build_datavalue_items(raw_rows, datavalue_uids, tracker_uids, event_blob): + items = [] + for fid, uid, storagekey, name, created in raw_rows: + if uid in datavalue_uids: + continue + if uid in event_blob: + continue + if uid in tracker_uids: + continue + items.append({ + "id": fid, + "name": name, + "storagekey": storagekey, + "folder": "dataValue", + "files": [], + "created": _iso(created), + "detection_date": datetime.utcnow().isoformat(), + "action": "would_move_and_delete", + "notified": False, + }) + return items diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index 3217241f..b4b541ae 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -8,7 +8,12 @@ from datetime import datetime from csv_utils import append_items, deduplicate_items -from common import mark_items_notified +from common import ( + build_datavalue_items, + build_document_items, + emit_summary, + mark_items_notified, +) from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, SQL_DATA_VALUE_UIDS, @@ -70,20 +75,8 @@ def parse_tab_rows(output, expected_cols): def get_orphan_documents(instance): out = run_sql(instance, SQL_FIND_ORPHANS_DOCUMENTS) - rows = [] - for fid, storagekey, name, created in parse_tab_rows(out, 4): - rows.append({ - "id": fid, - "name": name, - "storagekey": storagekey, - "folder": "document", - "files": [], - "created": created, - "detection_date": datetime.utcnow().isoformat(), - "action": "would_move_and_delete", - "notified": False, - }) - return rows + raw_rows = parse_tab_rows(out, 4) + return build_document_items(raw_rows) def get_orphan_datavalues(instance): @@ -91,27 +84,7 @@ def get_orphan_datavalues(instance): datavalue_uids = set(r[0] for r in parse_tab_rows(run_sql(instance, SQL_DATA_VALUE_UIDS), 1)) tracker_uids = set(r[0] for r in parse_tab_rows(run_sql(instance, SQL_TRACKER_ATTRIBUTE_UIDS), 1)) event_blob = "\n".join(r[0] for r in parse_tab_rows(run_sql(instance, SQL_EVENT_FILE_UIDS), 1)) - - orphans = [] - for fid, uid, storagekey, name, created in data_rows: - if uid in datavalue_uids: - continue - if uid in event_blob: - continue - if uid in tracker_uids: - continue - orphans.append({ - "id": fid, - "name": name, - "storagekey": storagekey, - "folder": "dataValue", - "files": [], - "created": created, - "detection_date": datetime.utcnow().isoformat(), - "action": "would_move_and_delete", - "notified": False, - }) - return orphans + return build_datavalue_items(data_rows, datavalue_uids, tracker_uids, event_blob) def delete_files(container_id, row, dry_run): @@ -185,7 +158,7 @@ def main(): items = deduplicate_items(args.csv_path, rows, unique_keys=("id", "name"), overwrite=overwrite_csv) append_items(args.csv_path, items, overwrite=overwrite_csv) - print(f"Summary: processed {len(rows)} orphan entries") + emit_summary(rows, "DRY-RUN" if dry_run else "FORCE", log_fn=print) if __name__ == "__main__": diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index b13605ff..9a0f6906 100644 --- 
a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -7,7 +7,14 @@ import psycopg2 -from common import load_config, log, mark_items_notified +from common import ( + build_datavalue_items, + build_document_items, + emit_summary, + load_config, + log, + mark_items_notified, +) from csv_utils import deduplicate_items, append_items from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, @@ -31,15 +38,6 @@ def get_event_blob(cursor): return "\n".join(json.dumps(e) for e in event_texts) -def get_data_value_file_resources(cursor): - cursor.execute(""" - SELECT fileresourceid, uid, storagekey - FROM fileresource - WHERE domain = 'DATA_VALUE' - """) - return cursor.fetchall() - - def get_all_datavalue_uids(cursor): cursor.execute(SQL_DATA_VALUE_UIDS) return set(row[0] for row in cursor.fetchall() if row[0]) @@ -174,59 +172,50 @@ def main(): overwrite_csv = bool(args.force) and not args.maintain_csv items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) - emit_summary(summary) + emit_summary(summary.get("items", []), summary.get("mode", "TEST"), log_fn=log) def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn, summary): log("Querying orphaned fileresource entries...") cur.execute(SQL_FIND_ORPHANS_DOCUMENTS) - rows = cur.fetchall() + raw_rows = cur.fetchall() + items = build_document_items(raw_rows) - if not rows: + if not items: log("✅ No orphaned fileresource entries found.") return - log(f"Found {len(rows)} \"document\" orphaned entries.") - process_orphan_files(rows, file_base_path, temp_file_path, dry_run, cur, conn, "document", summary) + log(f"Found {len(items)} \"document\" orphaned entries.") + process_orphan_items(items, file_base_path, temp_file_path, dry_run, cur, conn, summary) def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary): log("Querying orphaned fileresource entries...") cur.execute(SQL_FIND_DATA_VALUES_FILE_RESOURCES) - rows = cur.fetchall() - + raw_rows = cur.fetchall() # get all file resource datavalues datavalue_uids = get_all_datavalue_uids(cur) # get all events with file dataelement values event_blob = get_event_blob(cur) tracker_uids = get_all_tracker_uids(cur) - # 3. 
Filter out referenced file resources - orphan_data_values = [] - for fileresourceid, uid, storagekey, name, created in rows: - if uid in datavalue_uids: - continue # Referenced in datavalue - if uid in event_blob: - continue # Referenced in event - if uid in tracker_uids: - continue # referenced in a trackerentityattribute - orphan_data_values.append((fileresourceid, storagekey, name, created)) - - if not orphan_data_values: + items = build_datavalue_items(raw_rows, datavalue_uids, tracker_uids, event_blob) + + if not items: log("✅ No fileResources entries found.") return - log(f"Found {len(orphan_data_values)} \"data_value\" orphaned entries.") - process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, "dataValue", summary) + log(f"Found {len(items)} \"data_value\" orphaned entries.") + process_orphan_items(items, file_base_path, temp_file_path, dry_run, cur, conn, summary) -def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, folder, summary): +def process_orphan_items(orphan_items, file_base_path, temp_file_path, dry_run, cur, conn, summary): count = 0 - for entry in orphan_data_values: - # entries may come as (id, storagekey, name) - fileresourceid, storagekey = entry[0], entry[1] - name = entry[2] if len(entry) > 2 else "" - created = entry[3] if len(entry) > 3 else None + for item in orphan_items: + fileresourceid = item.get("id") + storagekey = item.get("storagekey", "") + name = item.get("name", "") + folder = item.get("folder", "") if not fileresourceid: log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") continue @@ -239,16 +228,9 @@ def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry # Always update the db also if the files not existed on disk update_db(fileresourceid, cur, conn, dry_run) rel_matches = [os.path.join(folder, os.path.relpath(m, os.path.join(file_base_path, folder))) for m in matches] - summary["items"].append({ - "id": fileresourceid, - "name": name, - "storagekey": storagekey, - "folder": folder, - "files": rel_matches, - "created": created.isoformat() if hasattr(created, "isoformat") else created, - "action": "would_move_and_delete" if dry_run else "moved_and_deleted", - "notified": False - }) + item["files"] = rel_matches + item["action"] = "would_move_and_delete" if dry_run else "moved_and_deleted" + summary["items"].append(item) except Exception as e: log(f"❌ Aborting due to error with fileresourceid {fileresourceid}: {e}") sys.exit(1) @@ -256,16 +238,5 @@ def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") -def emit_summary(summary): - mode = summary.get("mode", "TEST") - items = summary.get("items", []) - log(f"Summary ({mode}): {len(items)} fileresource entries processed.") - for item in items: - files = item.get("files") or [""] - log(f" - id={item.get('id')} name=\"{item.get('name')}\" storagekey={item.get('storagekey')} action={item.get('action')}") - for f in files: - log(f" file: {f}") - - if __name__ == "__main__": main() diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py index 342bef73..e02a92ef 100644 --- a/DHIS2/file_garbage_remover/main.py +++ b/DHIS2/file_garbage_remover/main.py @@ -52,9 +52,9 @@ def resolve_proxy(config_path, cli_value, keys): return cli_value or _from_config(config_path, keys) -def run_notify_flow(csv_path, config_path=None, webhook_url=None, 
title=None, http_proxy=None, https_proxy=None, notify_test=False, notify_max_lines=50): +def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, http_proxy=None, https_proxy=None, notify_test=False, notify_max_lines=100): names, ids, fieldnames, rows = pending_from_csv(csv_path) - if not names and not save_all_as_notified: + if not names: return max_lines = notify_max_lines if notify_max_lines and notify_max_lines > 0 else None truncated = names[:max_lines] if max_lines else names @@ -85,9 +85,8 @@ def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, ht print(f"❌ Failed to send notification: {e}", file=sys.stderr) sys.exit(1) # mark entries as notified - ids_to_mark = ids if not save_all_as_notified else set(str(row.get("id")) for row in rows) - if ids_to_mark: - mark_notified(csv_path, rows, fieldnames, ids_to_mark) + if ids: + mark_notified(csv_path, rows, fieldnames, ids) def main(): From a7721fdf4636b41c2a6a0f39b745d1ac491462a5 Mon Sep 17 00:00:00 2001 From: idelcano Date: Fri, 28 Nov 2025 10:40:11 +0100 Subject: [PATCH 20/23] simplify parameters and workflod, improve unique row detection(id+uid), and add save-all-as-notified --- DHIS2/file_garbage_remover/cleaner.py | 13 ++++---- DHIS2/file_garbage_remover/common.py | 4 ++- DHIS2/file_garbage_remover/csv_utils.py | 31 +++++++++++++++++-- .../file_garbage_remover_docker.py | 13 ++++---- .../file_garbage_remover_tomcat.py | 10 +++--- DHIS2/file_garbage_remover/main.py | 1 - DHIS2/file_garbage_remover/sql_queries.py | 2 +- 7 files changed, 52 insertions(+), 22 deletions(-) diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index 153d4056..c5a3074c 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -10,7 +10,7 @@ import psycopg2 from common import load_config, log -from csv_utils import append_items, deduplicate_items +from csv_utils import append_items, deduplicate_items, merge_notified_flags from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, SQL_DATA_VALUE_UIDS, @@ -228,9 +228,10 @@ def run_cleanup(args): if args.csv_path: if getattr(args, "save_all_as_notified", False): mark_items_notified(summary.get("items", [])) - overwrite_csv = bool(args.force) and not args.maintain_csv - items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) - append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) + else: + merge_notified_flags(args.csv_path, summary.get("items", []), unique_keys=("id", "uid"), log=log) + items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "uid"), overwrite=False, log=log) + append_items(args.csv_path, items, overwrite=False, log=log) emit_summary(summary) @@ -245,8 +246,8 @@ def run_docker_cleanup(args): cmd = [sys.executable, script_path, "--instance", args.docker_instance] if args.csv_path: cmd.extend(["--csv-path", args.csv_path]) - if args.maintain_csv: - cmd.append("--maintain-csv") + if getattr(args, "save_all_as_notified", False): + cmd.append("--save-all-as-notified") if args.force: cmd.append("--force") log(f"Running docker cleanup via: {' '.join(cmd)}") diff --git a/DHIS2/file_garbage_remover/common.py b/DHIS2/file_garbage_remover/common.py index 8e2d9ae4..8483d941 100644 --- a/DHIS2/file_garbage_remover/common.py +++ b/DHIS2/file_garbage_remover/common.py @@ -41,9 +41,10 @@ def _iso(value): def build_document_items(raw_rows): items = [] - for fid, storagekey, 
name, created in raw_rows: + for fid, uid, storagekey, name, created in raw_rows: items.append({ "id": fid, + "uid": uid, "name": name, "storagekey": storagekey, "folder": "document", @@ -67,6 +68,7 @@ def build_datavalue_items(raw_rows, datavalue_uids, tracker_uids, event_blob): continue items.append({ "id": fid, + "uid": uid, "name": name, "storagekey": storagekey, "folder": "dataValue", diff --git a/DHIS2/file_garbage_remover/csv_utils.py b/DHIS2/file_garbage_remover/csv_utils.py index 12b1cd8b..a365c16e 100644 --- a/DHIS2/file_garbage_remover/csv_utils.py +++ b/DHIS2/file_garbage_remover/csv_utils.py @@ -1,12 +1,13 @@ import csv import os -DEFAULT_FIELDNAMES = ["id", "name", "created", "detection_date", "storagekey", "folder", "action", "files", "notified"] +DEFAULT_FIELDNAMES = ["id", "uid", "name", "created", "detection_date", "storagekey", "folder", "action", "files", "notified"] def serialize_item(item): return { "id": item.get("id"), + "uid": item.get("uid"), "name": item.get("name"), "created": item.get("created"), "detection_date": item.get("detection_date"), @@ -57,7 +58,7 @@ def write_rows(csv_path, rows, fieldnames=None): writer.writerows(rows) -def deduplicate_items(csv_path, new_items, unique_keys=("id",), overwrite=False, log=None): +def deduplicate_items(csv_path, new_items, unique_keys=("id", "uid"), overwrite=False, log=None): """ Returns new_items filtered to avoid duplicates with existing CSV based on unique_keys. If overwrite=True, no filtering is applied (CSV will be rewritten). @@ -83,3 +84,29 @@ def deduplicate_items(csv_path, new_items, unique_keys=("id",), overwrite=False, existing_keys.add(key) filtered.append(item) return filtered + + +def merge_notified_flags(csv_path, items, unique_keys=("id", "uid"), log=None): + """ + For each item, if an existing CSV has the same unique key with notified=true, + copy that notified flag into the item (unless already true). + """ + if not os.path.isfile(csv_path): + return items + try: + rows, _ = read_rows(csv_path) + except Exception as e: + if log: + log(f"⚠️ Could not read existing CSV for notified merge: {e}") + return items + + notified_map = {} + for row in rows: + key = tuple((row.get(k) or "").strip() for k in unique_keys) + notified_map[key] = str(row.get("notified", "")).lower() == "true" + + for item in items: + key = tuple((str(item.get(k)) or "").strip() for k in unique_keys) + if notified_map.get(key): + item["notified"] = True + return items diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index b4b541ae..2d506e54 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -7,7 +7,7 @@ import tempfile from datetime import datetime -from csv_utils import append_items, deduplicate_items +from csv_utils import append_items, deduplicate_items, merge_notified_flags from common import ( build_datavalue_items, build_document_items, @@ -75,7 +75,7 @@ def parse_tab_rows(output, expected_cols): def get_orphan_documents(instance): out = run_sql(instance, SQL_FIND_ORPHANS_DOCUMENTS) - raw_rows = parse_tab_rows(out, 4) + raw_rows = parse_tab_rows(out, 5) return build_document_items(raw_rows) @@ -118,7 +118,6 @@ def main(): parser.add_argument("--instance", required=True, help="d2-docker instance name (e.g. 
docker.eyeseetea.com/project/dhis2-data:2.41-test)") parser.add_argument("--csv-path", help="CSV file to log orphan entries.") parser.add_argument("--force", action="store_true", help="Actually delete files in container.") - parser.add_argument("--maintain-csv", action="store_true", help="Append to CSV in force mode (do not overwrite).") parser.add_argument("--save-all-as-notified", action="store_true", help="Store CSV entries with notified=true even if not sent.") args = parser.parse_args() @@ -139,6 +138,9 @@ def main(): if args.save_all_as_notified: mark_items_notified(rows) + else: + if args.csv_path: + merge_notified_flags(args.csv_path, rows, unique_keys=("id", "uid")) for row in rows: delete_files(container_id, row, dry_run) @@ -154,9 +156,8 @@ def main(): sys.exit(1) if args.csv_path: - overwrite_csv = bool(args.force) and not args.maintain_csv - items = deduplicate_items(args.csv_path, rows, unique_keys=("id", "name"), overwrite=overwrite_csv) - append_items(args.csv_path, items, overwrite=overwrite_csv) + items = deduplicate_items(args.csv_path, rows, unique_keys=("id", "uid"), overwrite=False) + append_items(args.csv_path, items, overwrite=False) emit_summary(rows, "DRY-RUN" if dry_run else "FORCE", log_fn=print) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index 9a0f6906..43e1d1d7 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -120,8 +120,7 @@ def main(): parser.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") parser.add_argument("--test", action="store_true", help="Run in dry-run mode (no changes). Default if --force not provided.") parser.add_argument("--config", required=True, help="Path to config.json file.") - parser.add_argument("--csv-path", help="Optional CSV file to record processed entries. 
In --force mode the file is rewritten unless --maintain-csv.") - parser.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") + parser.add_argument("--csv-path", help="Optional CSV file to record processed entries.") parser.add_argument("--save-all-as-notified", action="store_true", help="Store CSV entries with notified=true even if not sent.") args = parser.parse_args() @@ -169,9 +168,10 @@ def main(): if args.csv_path: if args.save_all_as_notified: mark_items_notified(summary.get("items", [])) - overwrite_csv = bool(args.force) and not args.maintain_csv - items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) - append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) + else: + merge_notified_flags(args.csv_path, summary.get("items", []), unique_keys=("id", "uid"), log=log) + items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "uid"), overwrite=False, log=log) + append_items(args.csv_path, items, overwrite=False, log=log) emit_summary(summary.get("items", []), summary.get("mode", "TEST"), log_fn=log) diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py index e02a92ef..64d6a35e 100644 --- a/DHIS2/file_garbage_remover/main.py +++ b/DHIS2/file_garbage_remover/main.py @@ -13,7 +13,6 @@ def build_parser(): parser.add_argument("--test", action="store_true", help="Run in dry-run mode (default unless --force).") parser.add_argument("--config", help="Path to config.json file. Required unless --notify-only.") parser.add_argument("--csv-path", help="CSV file to record processed entries or to read notifications from.") - parser.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") parser.add_argument("--mode", choices=["tomcat", "docker"], default="tomcat", help="Cleanup mode: tomcat (default) or docker.") parser.add_argument("--docker-instance", help="d2-docker instance name (required for --mode=docker).") diff --git a/DHIS2/file_garbage_remover/sql_queries.py b/DHIS2/file_garbage_remover/sql_queries.py index bb6e82e5..c64523e8 100644 --- a/DHIS2/file_garbage_remover/sql_queries.py +++ b/DHIS2/file_garbage_remover/sql_queries.py @@ -1,5 +1,5 @@ SQL_FIND_ORPHANS_DOCUMENTS = """ - SELECT fileresourceid, storagekey, name, created + SELECT fileresourceid, uid, storagekey, name, created FROM fileresource fr WHERE NOT EXISTS ( SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid From f241b0f183a1f65764970eebdaf644bbe1a19787 Mon Sep 17 00:00:00 2001 From: idelcano Date: Fri, 28 Nov 2025 13:10:56 +0100 Subject: [PATCH 21/23] import missing dependency --- DHIS2/file_garbage_remover/cleaner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index c5a3074c..47b03d42 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -9,7 +9,7 @@ import subprocess import psycopg2 -from common import load_config, log +from common import load_config, log, mark_items_notified from csv_utils import append_items, deduplicate_items, merge_notified_flags from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, From d2b8cdc9106124d60b63911f793fec538eddb126 Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 22 Dec 2025 18:27:57 +0100 Subject: [PATCH 22/23] avoid identify as orphan files created/updated in the last 24 
--- DHIS2/file_garbage_remover/sql_queries.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/DHIS2/file_garbage_remover/sql_queries.py b/DHIS2/file_garbage_remover/sql_queries.py index c64523e8..5eadcba1 100644 --- a/DHIS2/file_garbage_remover/sql_queries.py +++ b/DHIS2/file_garbage_remover/sql_queries.py @@ -7,12 +7,15 @@ AND fr.uid NOT IN ( SELECT url FROM document ) - AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; + AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%' + AND COALESCE(fr.lastupdated, fr.created, NOW()) < (NOW() - INTERVAL '24 hours'); """ SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ SELECT fileresourceid, uid, storagekey, name, created - FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; + FROM fileresource fr + WHERE fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%' + AND COALESCE(fr.lastupdated, fr.created, NOW()) < (NOW() - INTERVAL '24 hours'); """ SQL_INSERT_AUDIT = """ From 83f0bf0eac9e6df03173546a9930ae770494e10d Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 22 Dec 2025 18:44:33 +0100 Subject: [PATCH 23/23] recovery instructions --- DHIS2/file_garbage_remover/Readme.md | 47 ++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md index 2b344ba1..e1394aa2 100644 --- a/DHIS2/file_garbage_remover/Readme.md +++ b/DHIS2/file_garbage_remover/Readme.md @@ -128,3 +128,50 @@ Deletes identified files directly inside the container. Production Environments: Always use --test mode before using --force to verify intended changes. Docker/Test Environments: Files deleted by file_garbage_remover_docker.py cannot be recovered. + +## Restoring fileresource entries (tomcat) + +Manual steps if a resource was moved by mistake: +1) Restore the row into `fileresource` from `fileresourcesaudit` using a **new** `fileresourceid` to avoid collisions. +Column list (stable): `uid, code, created, lastupdated, name, contenttype, contentlength, contentmd5, storagekey, isassigned, domain, userid, lastupdatedby, hasmultiplestoragefiles, fileresourceowner` +``` +INSERT INTO fileresource ( + fileresourceid, uid, code, created, lastupdated, name, contenttype, + contentlength, contentmd5, storagekey, isassigned, domain, userid, + lastupdatedby, hasmultiplestoragefiles, fileresourceowner +) +SELECT + nextval('fileresource_fileresourceid_seq'), uid, code, created, lastupdated, + name, contenttype, contentlength, contentmd5, storagekey, isassigned, + domain, userid, lastupdatedby, hasmultiplestoragefiles, fileresourceowner +FROM fileresourcesaudit +WHERE fileresourceid = ; +``` +Run inside a transaction and verify before commit. + +2) Move the file back from `temp_file_path` to `file_base_path`, keeping the folder (`document` or `dataValue`). 
Use a wildcard to catch image variants (e.g., `..._max`, `..._min`):
+```
+mv "<temp_file_path>/<folder>/<storagekey>"* "<file_base_path>/<folder>/"
+```
+
+Example to restore entries moved in the last 24 hours:
+```
+-- 1) Reinsert rows with new IDs
+INSERT INTO fileresource (
+    fileresourceid, uid, code, created, lastupdated, name, contenttype,
+    contentlength, contentmd5, storagekey, isassigned, domain, userid,
+    lastupdatedby, hasmultiplestoragefiles, fileresourceowner
+)
+SELECT
+    nextval('fileresource_fileresourceid_seq'), uid, code, created, lastupdated,
+    name, contenttype, contentlength, contentmd5, storagekey, isassigned,
+    domain, userid, lastupdatedby, hasmultiplestoragefiles, fileresourceowner
+FROM fileresourcesaudit
+WHERE COALESCE(lastupdated, created, NOW()) >= (NOW() - INTERVAL '24 hours');
+
+-- 2) Move files back (adjust <temp_file_path> and <file_base_path> to the configured paths)
+find "<temp_file_path>" -type f -mtime -1 -print0 | while IFS= read -r -d '' f; do
+  rel="${f#<temp_file_path>/}"
+  mv "$f" "<file_base_path>/$rel"
+done
+```
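The Readme above says to run the reinsert inside a transaction and verify before commit, but does not show that wrapper. A minimal sketch of the pattern follows, assuming a hypothetical `<fileresourceid_to_restore>` placeholder for the audited row (substitute the real id before running):

```
-- Sketch only: reinsert one audited row inside a transaction and verify it
-- before committing. <fileresourceid_to_restore> is a placeholder, not literal SQL.
BEGIN;

INSERT INTO fileresource (
    fileresourceid, uid, code, created, lastupdated, name, contenttype,
    contentlength, contentmd5, storagekey, isassigned, domain, userid,
    lastupdatedby, hasmultiplestoragefiles, fileresourceowner
)
SELECT
    nextval('fileresource_fileresourceid_seq'), uid, code, created, lastupdated,
    name, contenttype, contentlength, contentmd5, storagekey, isassigned,
    domain, userid, lastupdatedby, hasmultiplestoragefiles, fileresourceowner
FROM fileresourcesaudit
WHERE fileresourceid = <fileresourceid_to_restore>;

-- Verify: the restored row gets a new fileresourceid but keeps its uid and storagekey
SELECT fileresourceid, uid, name, storagekey, domain
FROM fileresource
WHERE uid = (SELECT uid FROM fileresourcesaudit WHERE fileresourceid = <fileresourceid_to_restore>);

COMMIT;  -- or ROLLBACK; if the verification does not return the expected row
```

Drawing the new id from `nextval('fileresource_fileresourceid_seq')` rather than reusing the audited id avoids a primary-key collision if that id has since been reassigned.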