From 1c91ce611d7d76bf13148275bda57e200aad31b2 Mon Sep 17 00:00:00 2001 From: idelcano Date: Wed, 21 May 2025 13:36:01 +0200 Subject: [PATCH 01/23] added documents marked as to be removed clean script --- .../file_garbage_remover.py | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 DHIS2/post_clone_scripts/file_garbage_remover.py diff --git a/DHIS2/post_clone_scripts/file_garbage_remover.py b/DHIS2/post_clone_scripts/file_garbage_remover.py new file mode 100644 index 00000000..cf7cb107 --- /dev/null +++ b/DHIS2/post_clone_scripts/file_garbage_remover.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 + +import subprocess +import sys +import argparse + + +LOCAL_SQL_FILE = "/tmp/find_orphan_files.sql" +LOCAL_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" +REMOTE_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" +FILE_BASE_PATH = "/DHIS2_home/files" + + +SQL_QUERY = """ + SELECT storagekey + FROM fileresource fr + WHERE NOT EXISTS ( + SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid + ) + AND fr.uid NOT IN ( + SELECT url FROM document + ) + AND fr.domain = 'DOCUMENT'; +""" + + +def run(command, check=True, capture=False): + """Run a shell command.""" + print(f"$ {' '.join(command)}") + return subprocess.run(command, check=check, capture_output=capture, text=True) + + +def find_container_id(name_pattern): + """Find the container ID based on a name substring.""" + result = run(["docker", "ps", "--format", "{{.ID}}\t{{.Names}}"], capture=True) + for line in result.stdout.strip().splitlines(): + cid, name = line.split("\t") + if name_pattern in name: + return cid + return None + + +def slugify(instance_name): + """Convert instance name to container name slug.""" + return "d2-docker-" + instance_name.replace("/", "-").replace(":", "-").replace(".", "-") + + +def main(): + parser = argparse.ArgumentParser(description="Remove orphaned DHIS2 file resources from Core container.") + parser.add_argument("-i", "--instance", required=True, help="d2-docker instance name (e.g. /usr/bin/python3 file_garbage_remover.py -i docker.eyeseetea.com/widpit/dhis2-data:2.42-widp-preprod-cont-indiv)") + args = parser.parse_args() + + instance_name = args.instance + container_slug = slugify(instance_name) + db_container_match = container_slug + "-db-1" + core_container_match = container_slug + "-core-1" + + print("Writing SQL file...") + with open(LOCAL_SQL_FILE, "w") as f: + f.write(SQL_QUERY) + + print("Running SQL with d2-docker and capturing output...") + result = run(["d2-docker", "run-sql", "-i", instance_name, LOCAL_SQL_FILE], capture=True) + + print("Saving result to file...") + with open(LOCAL_LIST_FILE, "w") as f: + for line in result.stdout.strip().splitlines(): + line = line.strip() + if line and not line.lower().startswith("storagekey"): + f.write(f"{line}\n") + + print("Identifying containers...") + db_container = find_container_id(db_container_match.replace("dhis2-data-", "")) + core_container = find_container_id(core_container_match.replace("dhis2-data-","")) + print(db_container) + print(core_container) + if not db_container or not core_container: + print("Could not find DB or Core container.") + print(f"Looked for: {db_container_match}, {core_container_match}") + sys.exit(1) + + print(f"DB container: {db_container}") + print(f"Core container: {core_container}") + + print("Copying file list to Core container...") + run(["docker", "cp", LOCAL_LIST_FILE, f"{core_container}:{REMOTE_LIST_FILE}"]) + + print(f"Deleting orphaned files in Core container... 
{core_container} {db_container_match}") + delete_cmd = f""" + bash -c ' + if [ ! -f "{REMOTE_LIST_FILE}" ]; then + echo "File list not found: {REMOTE_LIST_FILE}" + exit 1 + fi + + while IFS= read -r key; do + fullpath="{FILE_BASE_PATH}/$key" + echo "Deleting files: $fullpath*" + rm -v "$fullpath"* 2>/dev/null || echo "Nothing to delete for $fullpath" + done < "{REMOTE_LIST_FILE}" + ' + """ + + run(["docker", "exec", core_container, "bash", "-c", delete_cmd]) + + print("Cleanup complete.") + + +if __name__ == "__main__": + main() From fe1ce44b170fc6700d1126e47127d9f254e57712 Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 7 Jul 2025 18:59:00 +0200 Subject: [PATCH 02/23] rename folder --- .../file_garbage_remover.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename DHIS2/{post_clone_scripts => file_garbage_remover}/file_garbage_remover.py (100%) diff --git a/DHIS2/post_clone_scripts/file_garbage_remover.py b/DHIS2/file_garbage_remover/file_garbage_remover.py similarity index 100% rename from DHIS2/post_clone_scripts/file_garbage_remover.py rename to DHIS2/file_garbage_remover/file_garbage_remover.py From 776c7efe9ddac9221442b95b4592967dfe15f213 Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 7 Jul 2025 18:59:38 +0200 Subject: [PATCH 03/23] remove unnecesary code and simplify script --- .../file_garbage_remover.py | 25 ++++++------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover.py b/DHIS2/file_garbage_remover/file_garbage_remover.py index cf7cb107..ff8e1fea 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover.py @@ -31,13 +31,10 @@ def run(command, check=True, capture=False): def find_container_id(name_pattern): - """Find the container ID based on a name substring.""" - result = run(["docker", "ps", "--format", "{{.ID}}\t{{.Names}}"], capture=True) - for line in result.stdout.strip().splitlines(): - cid, name = line.split("\t") - if name_pattern in name: - return cid - return None + """Find the container ID based on a name substring. Retrieves first match only""" + result = run(["docker", "ps", "--format", "{{.ID}}", "-f", f"name={name_pattern}"], capture=True) + lines = result.stdout.strip().splitlines() + return lines[0] if lines else None def slugify(instance_name): @@ -52,8 +49,7 @@ def main(): instance_name = args.instance container_slug = slugify(instance_name) - db_container_match = container_slug + "-db-1" - core_container_match = container_slug + "-core-1" + core_container_match = container_slug.replace("dhis2-data-", "") + "-core-1" print("Writing SQL file...") with open(LOCAL_SQL_FILE, "w") as f: @@ -70,22 +66,15 @@ def main(): f.write(f"{line}\n") print("Identifying containers...") - db_container = find_container_id(db_container_match.replace("dhis2-data-", "")) - core_container = find_container_id(core_container_match.replace("dhis2-data-","")) - print(db_container) + core_container = find_container_id(core_container_match) print(core_container) - if not db_container or not core_container: - print("Could not find DB or Core container.") - print(f"Looked for: {db_container_match}, {core_container_match}") - sys.exit(1) - print(f"DB container: {db_container}") print(f"Core container: {core_container}") print("Copying file list to Core container...") run(["docker", "cp", LOCAL_LIST_FILE, f"{core_container}:{REMOTE_LIST_FILE}"]) - print(f"Deleting orphaned files in Core container... 
{core_container} {db_container_match}") + print(f"Deleting orphaned files in Core container... {core_container}") delete_cmd = f""" bash -c ' if [ ! -f "{REMOTE_LIST_FILE}" ]; then From bdf37f41c6cd5056150bdf7aada823af0377df41 Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 7 Jul 2025 19:00:05 +0200 Subject: [PATCH 04/23] rename file --- .../file_garbage_remover_docker.py | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 DHIS2/file_garbage_remover/file_garbage_remover_docker.py diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py new file mode 100644 index 00000000..ff8e1fea --- /dev/null +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +import subprocess +import sys +import argparse + + +LOCAL_SQL_FILE = "/tmp/find_orphan_files.sql" +LOCAL_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" +REMOTE_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" +FILE_BASE_PATH = "/DHIS2_home/files" + + +SQL_QUERY = """ + SELECT storagekey + FROM fileresource fr + WHERE NOT EXISTS ( + SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid + ) + AND fr.uid NOT IN ( + SELECT url FROM document + ) + AND fr.domain = 'DOCUMENT'; +""" + + +def run(command, check=True, capture=False): + """Run a shell command.""" + print(f"$ {' '.join(command)}") + return subprocess.run(command, check=check, capture_output=capture, text=True) + + +def find_container_id(name_pattern): + """Find the container ID based on a name substring. Retrieves first match only""" + result = run(["docker", "ps", "--format", "{{.ID}}", "-f", f"name={name_pattern}"], capture=True) + lines = result.stdout.strip().splitlines() + return lines[0] if lines else None + + +def slugify(instance_name): + """Convert instance name to container name slug.""" + return "d2-docker-" + instance_name.replace("/", "-").replace(":", "-").replace(".", "-") + + +def main(): + parser = argparse.ArgumentParser(description="Remove orphaned DHIS2 file resources from Core container.") + parser.add_argument("-i", "--instance", required=True, help="d2-docker instance name (e.g. /usr/bin/python3 file_garbage_remover.py -i docker.eyeseetea.com/widpit/dhis2-data:2.42-widp-preprod-cont-indiv)") + args = parser.parse_args() + + instance_name = args.instance + container_slug = slugify(instance_name) + core_container_match = container_slug.replace("dhis2-data-", "") + "-core-1" + + print("Writing SQL file...") + with open(LOCAL_SQL_FILE, "w") as f: + f.write(SQL_QUERY) + + print("Running SQL with d2-docker and capturing output...") + result = run(["d2-docker", "run-sql", "-i", instance_name, LOCAL_SQL_FILE], capture=True) + + print("Saving result to file...") + with open(LOCAL_LIST_FILE, "w") as f: + for line in result.stdout.strip().splitlines(): + line = line.strip() + if line and not line.lower().startswith("storagekey"): + f.write(f"{line}\n") + + print("Identifying containers...") + core_container = find_container_id(core_container_match) + print(core_container) + + print(f"Core container: {core_container}") + + print("Copying file list to Core container...") + run(["docker", "cp", LOCAL_LIST_FILE, f"{core_container}:{REMOTE_LIST_FILE}"]) + + print(f"Deleting orphaned files in Core container... {core_container}") + delete_cmd = f""" + bash -c ' + if [ ! 
-f "{REMOTE_LIST_FILE}" ]; then + echo "File list not found: {REMOTE_LIST_FILE}" + exit 1 + fi + + while IFS= read -r key; do + fullpath="{FILE_BASE_PATH}/$key" + echo "Deleting files: $fullpath*" + rm -v "$fullpath"* 2>/dev/null || echo "Nothing to delete for $fullpath" + done < "{REMOTE_LIST_FILE}" + ' + """ + + run(["docker", "exec", core_container, "bash", "-c", delete_cmd]) + + print("Cleanup complete.") + + +if __name__ == "__main__": + main() From 6f7613be830722bd477041890850b84c181852a4 Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 7 Jul 2025 19:03:43 +0200 Subject: [PATCH 05/23] Added tomcat version --- .../file_garbage_remover_tomcat.py | 181 ++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py new file mode 100644 index 00000000..a1e03f5e --- /dev/null +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +import re + +import psycopg2 +import shutil +import os +import argparse +import json +import sys +from datetime import datetime + +SQL_FIND_ORPHANS = """ + SELECT fileresourceid, storagekey + FROM fileresource fr + WHERE NOT EXISTS ( + SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid + ) + AND fr.uid NOT IN ( + SELECT url FROM document + ) + AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; +""" + +SQL_INSERT_AUDIT = """ + INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +SQL_DELETE_ORIGINAL = """ + DELETE FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +def log(message): + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if isinstance(message, str): + # Detect pattern like: postgresql://user:password@host + message = re.sub( + r"(postgresql://[^:]+:)([^@]+)(@)", + r"\1*****\3", + message + ) + print(f"[{timestamp}] {message}") + +def load_config(path): + with open(path, "r") as f: + return json.load(f) + +def get_matching_document_files(storage_key, base_dir): + storage_key = storage_key.replace("document/", "") + base_dir = os.path.join(base_dir, "document") + matching = [] + + for filename in os.listdir(base_dir): + if filename.startswith(storage_key): + fullpath = os.path.join(base_dir, filename) + if os.path.isfile(fullpath): + matching.append(fullpath) + return matching + +def update_db(fileresourceid, cursor, conn, dry_run=False): + if dry_run: + log("[DRY RUN] Would execute:") + log(f" {SQL_INSERT_AUDIT.strip()} with fileresourceid = '{fileresourceid}'") + log(f" {SQL_DELETE_ORIGINAL.strip()} with fileresourceid = '{fileresourceid}'") + else: + try: + cursor.execute(SQL_INSERT_AUDIT, {"fid": fileresourceid}) + cursor.execute(SQL_DELETE_ORIGINAL, {"fid": fileresourceid}) + conn.commit() + log(f"Archived and deleted fileresourceid {fileresourceid}") + except Exception as e: + log(f"❌ Failed DB update for fileresourceid {fileresourceid}: {e}") + conn.rollback() + raise + +def move_files(file_list, file_base_path, temp_file_path, dry_run=False): + base_dir = os.path.join(file_base_path, "document") + dest_dir = os.path.join(temp_file_path, "document") + for src in file_list: + rel_path = os.path.relpath(src, base_dir) + log(rel_path) + dst = os.path.join(dest_dir, rel_path) + log(dst) + dst_dir = os.path.dirname(dst) + log(dst_dir) + try: + os.makedirs(dst_dir, exist_ok=True) + except Exception as e: + log(f"❌ Failed 
to create directory {dst_dir}: {e}") + raise + + log(f"{'[DRY RUN] ' if dry_run else ''}Moving {src} -> {dst}") + if not dry_run: + try: + shutil.move(src, dst) + except Exception as e: + log(f"❌ Failed to move {src}: {e}") + raise + + + + +def ensure_audit_table_exists(cursor, dry_run=False): + if dry_run: + log("[DRY RUN] Would execute:") + log(" CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA;") + else: + log("Ensuring 'fileresourcesaudit' table exists...") + cursor.execute(""" + CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; + """) + log("'fileresourcesaudit' table ready.") + +def main(): + parser = argparse.ArgumentParser(description="Move orphaned DHIS2 file resources and archive DB entries.") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("--test", action="store_true", help="Run in dry-run mode (no changes).") + group.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") + parser.add_argument("--config", required=True, help="Path to config.json file.") + args = parser.parse_args() + + config = load_config(args.config) + required_keys = ["db_host", "db_port", "db_name", "db_user", "file_base_path", "temp_file_path"] + missing_keys = [k for k in required_keys if not config.get(k)] + + if missing_keys: + log(f"❌ Missing required config keys: {', '.join(missing_keys)}") + sys.exit(1) + + db_password = os.environ.get("DB_PASSWORD_FILE_G") + if not db_password: + log("❌ Missing required environment variable: DB_PASSWORD") + sys.exit(1) + + db_url = f"postgresql://{config['db_user']}:{db_password}@{config['db_host']}:{config['db_port']}/{config['db_name']}" + file_base_path = config["file_base_path"] + temp_file_path = config["temp_file_path"] + + if not os.path.isdir(file_base_path) or not os.path.isdir(temp_file_path): + log("❌ Both 'file_base_path' and 'temp_file_path' must exist and be directories.") + log(f" file_base_path: {file_base_path} -> {'OK' if os.path.isdir(file_base_path) else 'INVALID'}") + log(f" temp_file_path: {temp_file_path} -> {'OK' if os.path.isdir(temp_file_path) else 'INVALID'}") + sys.exit(1) + + dry_run = args.test + log(f"{'Running in TEST mode' if dry_run else 'Running in FORCE mode'}") + log(f"Connecting to database: {db_url}") + log(f"File base path: {file_base_path}") + log(f"Temporary move path: {temp_file_path}") + + with psycopg2.connect(dsn=db_url) as conn: + with conn.cursor() as cur: + ensure_audit_table_exists(cur, dry_run) + log("Querying orphaned fileresource entries...") + cur.execute(SQL_FIND_ORPHANS) + rows = cur.fetchall() + + if not rows: + log("✅ No orphaned fileresource entries found.") + return + + log(f"Found {len(rows)} orphaned entries.") + count = 0 + for fileresourceid, storagekey in rows: + if not fileresourceid: + log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") + continue + + try: + matches = get_matching_document_files(storagekey, file_base_path) + if not matches: + raise Exception(f"No files found for storage_key: {storagekey}") + move_files(matches, file_base_path, temp_file_path, dry_run) + update_db(fileresourceid, cur, conn, dry_run) + except Exception as e: + log(f"❌ Aborting due to error with fileresourceid {fileresourceid}: {e}") + sys.exit(1) + count = count + 1 + log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") + +if __name__ == "__main__": + main() From 546761127482e522223d80bf4c0f3418890b7d48 Mon Sep 17 
00:00:00 2001 From: idelcano Date: Mon, 7 Jul 2025 19:03:57 +0200 Subject: [PATCH 06/23] remove duplicate old file --- .../file_garbage_remover.py | 99 ------------------- 1 file changed, 99 deletions(-) delete mode 100644 DHIS2/file_garbage_remover/file_garbage_remover.py diff --git a/DHIS2/file_garbage_remover/file_garbage_remover.py b/DHIS2/file_garbage_remover/file_garbage_remover.py deleted file mode 100644 index ff8e1fea..00000000 --- a/DHIS2/file_garbage_remover/file_garbage_remover.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 - -import subprocess -import sys -import argparse - - -LOCAL_SQL_FILE = "/tmp/find_orphan_files.sql" -LOCAL_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" -REMOTE_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" -FILE_BASE_PATH = "/DHIS2_home/files" - - -SQL_QUERY = """ - SELECT storagekey - FROM fileresource fr - WHERE NOT EXISTS ( - SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid - ) - AND fr.uid NOT IN ( - SELECT url FROM document - ) - AND fr.domain = 'DOCUMENT'; -""" - - -def run(command, check=True, capture=False): - """Run a shell command.""" - print(f"$ {' '.join(command)}") - return subprocess.run(command, check=check, capture_output=capture, text=True) - - -def find_container_id(name_pattern): - """Find the container ID based on a name substring. Retrieves first match only""" - result = run(["docker", "ps", "--format", "{{.ID}}", "-f", f"name={name_pattern}"], capture=True) - lines = result.stdout.strip().splitlines() - return lines[0] if lines else None - - -def slugify(instance_name): - """Convert instance name to container name slug.""" - return "d2-docker-" + instance_name.replace("/", "-").replace(":", "-").replace(".", "-") - - -def main(): - parser = argparse.ArgumentParser(description="Remove orphaned DHIS2 file resources from Core container.") - parser.add_argument("-i", "--instance", required=True, help="d2-docker instance name (e.g. /usr/bin/python3 file_garbage_remover.py -i docker.eyeseetea.com/widpit/dhis2-data:2.42-widp-preprod-cont-indiv)") - args = parser.parse_args() - - instance_name = args.instance - container_slug = slugify(instance_name) - core_container_match = container_slug.replace("dhis2-data-", "") + "-core-1" - - print("Writing SQL file...") - with open(LOCAL_SQL_FILE, "w") as f: - f.write(SQL_QUERY) - - print("Running SQL with d2-docker and capturing output...") - result = run(["d2-docker", "run-sql", "-i", instance_name, LOCAL_SQL_FILE], capture=True) - - print("Saving result to file...") - with open(LOCAL_LIST_FILE, "w") as f: - for line in result.stdout.strip().splitlines(): - line = line.strip() - if line and not line.lower().startswith("storagekey"): - f.write(f"{line}\n") - - print("Identifying containers...") - core_container = find_container_id(core_container_match) - print(core_container) - - print(f"Core container: {core_container}") - - print("Copying file list to Core container...") - run(["docker", "cp", LOCAL_LIST_FILE, f"{core_container}:{REMOTE_LIST_FILE}"]) - - print(f"Deleting orphaned files in Core container... {core_container}") - delete_cmd = f""" - bash -c ' - if [ ! 
-f "{REMOTE_LIST_FILE}" ]; then
-        echo "File list not found: {REMOTE_LIST_FILE}"
-        exit 1
-    fi
-
-    while IFS= read -r key; do
-        fullpath="{FILE_BASE_PATH}/$key"
-        echo "Deleting files: $fullpath*"
-        rm -v "$fullpath"* 2>/dev/null || echo "Nothing to delete for $fullpath"
-    done < "{REMOTE_LIST_FILE}"
-    '
-    """
-
-    run(["docker", "exec", core_container, "bash", "-c", delete_cmd])
-
-    print("Cleanup complete.")
-
-
-if __name__ == "__main__":
-    main()

From b047fcedb6fa3aedbe34c9910899090f66e3c116 Mon Sep 17 00:00:00 2001
From: idelcano
Date: Mon, 7 Jul 2025 19:09:04 +0200
Subject: [PATCH 07/23] Added readme

---
 DHIS2/file_garbage_remover/Readme.md | 72 ++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 DHIS2/file_garbage_remover/Readme.md

diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md
new file mode 100644
index 00000000..faf1ff77
--- /dev/null
+++ b/DHIS2/file_garbage_remover/Readme.md
@@ -0,0 +1,72 @@
+# DHIS2 File Garbage Remover Scripts
+
+This folder contains two Python scripts designed to manage orphaned resources in DHIS2 instances.
+These scripts identify files with no database references and take appropriate actions depending on the environment, production (tomcat) or testing (docker).
+
+Included Scripts
+
+## file_garbage_remover_tomcat.py
+
+### Description:
+
+Designed for production environments. Identifies orphaned file resources (only documents for now), moves the files to a temporary directory, and archives corresponding database entries into a special table(fileresourcesaudit) before deleting them from the original database table.
+
+### Usage:
+
+Run the script in either test or force mode.
+
+Test Mode (dry-run):
+
+```./file_garbage_remover_tomcat.py --test --config /path/to/config.json```
+
+Force Mode (apply changes):
+
+```
+export DB_PASSWORD_FILE_G='your_password'
+./file_garbage_remover_tomcat.py --force --config /path/to/config.json
+```
+
+config.json File Requirements:
+```
+{
+    "db_host": "",
+    "db_port": "",
+    "db_name": "",
+    "db_user": "",
+    "file_base_path": "",
+    "temp_file_path": ""
+}
+```
+
+Ensure file_base_path and temp_file_path exist and are valid directories.
+
+Password should be provided through the environment variable DB_PASSWORD_FILE_G.
+
+## file_garbage_remover_docker.py
+
+### Description:
+
+Intended for d2-docker testing environments.
+Identifies orphaned files in a Dockerized DHIS2 instance and directly deletes them from the container. This script does not archive or move files, permanently removing identified resources.
+
+### Usage:
+
+Run the script specifying the Docker DHIS2 instance:
+
+```
+./file_garbage_remover_docker.py --instance docker.eyeseetea.com/project/dhis2-data:2.41-test
+```
+
+### Operation:
+
+Executes an SQL query within the DHIS2 container using d2-docker run-sql.
+
+Copies the generated file list into the container.
+
+Deletes identified files directly inside the container.
+
+# Precautions
+
+Production Environments: Always use --test mode before using --force to verify intended changes.
+
+Docker/Test Environments: Files deleted by file_garbage_remover_docker.py cannot be recovered.
\ No newline at end of file

From b98b7bb7e2f7b45ee2ff648d6ec318a5da453c69 Mon Sep 17 00:00:00 2001
From: idelcano
Date: Mon, 14 Jul 2025 17:15:36 +0200
Subject: [PATCH 08/23] fix readme

---
 DHIS2/file_garbage_remover/Readme.md | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md
index faf1ff77..8a259966 100644
--- a/DHIS2/file_garbage_remover/Readme.md
+++ b/DHIS2/file_garbage_remover/Readme.md
@@ -17,7 +17,10 @@ Run the script in either test or force mode.
 Test Mode (dry-run):
 
-```./file_garbage_remover_tomcat.py --test --config /path/to/config.json```
+```
+export DB_PASSWORD_FILE_G='your_password'
+./file_garbage_remover_tomcat.py --test --config /path/to/config.json
+```
 
 Force Mode (apply changes):
 
 ```
@@ -42,6 +45,19 @@ Ensure file_base_path and temp_file_path exist and are valid directories.
 
 Password should be provided through the environment variable DB_PASSWORD_FILE_G.
 
+Bash wrapper example:
+```
+#!/bin/bash
+MODE="${1:-test}"
+[[ "$MODE" != "test" && "$MODE" != "force" ]] && echo "Invalid mode." && exit 1
+
+DB_PASSWORD_FILE_G=db_password
+
+python3 /path/to/script/bin/file_garbage_remover/file_garbage_remover_tomcat.py --config /path/to/script/bin/file_garbage_remover/config.json --$MODE 2>&1 | tee -a /path/to/logs/orphan_cleanup.log
+
+unset DB_PASSWORD_FILE_G
+```
+
 ## file_garbage_remover_docker.py
 
 ### Description:

From 3c6062ad8b7e06319ae124337f5cda6e1607ab5e Mon Sep 17 00:00:00 2001
From: idelcano
Date: Mon, 14 Jul 2025 17:25:45 +0200
Subject: [PATCH 09/23] Refactoring to remove magic values and fix log comment

---
 .../file_garbage_remover_tomcat.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
index a1e03f5e..5268f053 100644
--- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
+++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
@@ -29,6 +29,10 @@
     DELETE FROM fileresource WHERE fileresourceid = %(fid)s;
 """
 
+SQL_CREATE_TABLE_IF_NOT_EXIST = """
+    CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA;
+"""
+
 def log(message):
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     if isinstance(message, str):
@@ -59,8 +63,10 @@ def get_matching_document_files(storage_key, base_dir):
 def update_db(fileresourceid, cursor, conn, dry_run=False):
     if dry_run:
         log("[DRY RUN] Would execute:")
-        log(f" {SQL_INSERT_AUDIT.strip()} with fileresourceid = '{fileresourceid}'")
-        log(f" {SQL_DELETE_ORIGINAL.strip()} with fileresourceid = '{fileresourceid}'")
+        log(f" {SQL_INSERT_AUDIT.strip()}")
+        log(f" with parameters: {{'fid': '{fileresourceid}'}}")
+        log(f" {SQL_DELETE_ORIGINAL.strip()} ")
+        log(f" with parameters: {{'fid': '{fileresourceid}'}}")
     else:
         try:
             cursor.execute(SQL_INSERT_AUDIT, {"fid": fileresourceid})
             cursor.execute(SQL_DELETE_ORIGINAL, {"fid": fileresourceid})
             conn.commit()
             log(f"Archived and deleted fileresourceid {fileresourceid}")
         except Exception as e:
             log(f"❌ Failed DB update for fileresourceid {fileresourceid}: {e}")
             conn.rollback()
             raise
@@ -102,12 +108,10 @@ def move_files(file_list, file_base_path, temp_file_path, dry_run=False):
 def ensure_audit_table_exists(cursor, dry_run=False):
     if dry_run:
         log("[DRY RUN] Would execute:")
-        log(" CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA;")
+        log(SQL_CREATE_TABLE_IF_NOT_EXIST)
     else:
         log("Ensuring 'fileresourcesaudit' table exists...")
-        cursor.execute("""
-            CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA;
-        """)
+        cursor.execute(SQL_CREATE_TABLE_IF_NOT_EXIST)
         log("'fileresourcesaudit' table ready.")

From 0de1af1273b3a65068d31a83b2d98fa31de0939c Mon Sep 17 00:00:00 2001
From: idelcano
Date: Mon, 14 Jul 2025 17:27:56 +0200
Subject: [PATCH 10/23] change db variable name

---
 DHIS2/file_garbage_remover/Readme.md | 10 +++++-----
 .../file_garbage_remover_tomcat.py   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md
index 8a259966..a71416cf 100644
--- a/DHIS2/file_garbage_remover/Readme.md
+++ b/DHIS2/file_garbage_remover/Readme.md
@@ -18,14 +18,14 @@ Run the script in either test or force mode.
 Test Mode (dry-run):
 
 ```
-export DB_PASSWORD_FILE_G='your_password'
+export DB_PASSWORD_FG='your_password'
 ./file_garbage_remover_tomcat.py --test --config /path/to/config.json
 ```
 
 Force Mode (apply changes):
 
 ```
-export DB_PASSWORD_FILE_G='your_password'
+export DB_PASSWORD_FG='your_password'
 ./file_garbage_remover_tomcat.py --force --config /path/to/config.json
 ```
@@ -43,7 +43,7 @@ config.json File Requirements:
 
 Ensure file_base_path and temp_file_path exist and are valid directories.
 
-Password should be provided through the environment variable DB_PASSWORD_FILE_G.
+Password should be provided through the environment variable DB_PASSWORD_FG.
 
 Bash wrapper example:
 ```
@@ -51,11 +51,11 @@ MODE="${1:-test}"
 [[ "$MODE" != "test" && "$MODE" != "force" ]] && echo "Invalid mode." && exit 1
 
-DB_PASSWORD_FILE_G=db_password
+DB_PASSWORD_FG=db_password
 
 python3 /path/to/script/bin/file_garbage_remover/file_garbage_remover_tomcat.py --config /path/to/script/bin/file_garbage_remover/config.json --$MODE 2>&1 | tee -a /path/to/logs/orphan_cleanup.log
 
-unset DB_PASSWORD_FILE_G
+unset DB_PASSWORD_FG
 ```
 
 ## file_garbage_remover_docker.py

diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
index 5268f053..2e064281 100644
--- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
+++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
@@ -130,7 +130,7 @@ def main():
         log(f"❌ Missing required config keys: {', '.join(missing_keys)}")
         sys.exit(1)
 
-    db_password = os.environ.get("DB_PASSWORD_FILE_G")
+    db_password = os.environ.get("DB_PASSWORD_FG")
     if not db_password:
         log("❌ Missing required environment variable: DB_PASSWORD")
         sys.exit(1)

From 9ebf3e31107a8a9525aa08fcd631ff1f14bcd2eb Mon Sep 17 00:00:00 2001
From: Carlos Garcia Bautista
Date: Thu, 17 Jul 2025 15:03:54 +0000
Subject: [PATCH 11/23] Edited error reported to the user to reflect the correct name of the environmental variable

---
 DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
index 2e064281..b813c8d0 100644
--- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
+++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py
@@ -132,7 +132,7 @@ def main():
     db_password = os.environ.get("DB_PASSWORD_FG")
     if not db_password:
-        log("❌ Missing required environment variable: DB_PASSWORD")
+        log("❌ Missing required environment variable: DB_PASSWORD_FG")
         sys.exit(1)
 
     db_url = f"postgresql://{config['db_user']}:{db_password}@{config['db_host']}:{config['db_port']}/{config['db_name']}"
     file_base_path = config["file_base_path"]

From 8048428f4e6b172f2570264ce173bc8dfb639ba0 Mon Sep 17 00:00:00 2001
From: idelcano
Date: Fri, 25 Jul 2025 16:12:54 +0200
Subject: [PATCH 12/23] Update script to clean
data value file resources excluding actives in event, datavalue, trackedentityattribute values. --- DHIS2/file_garbage_remover/Readme.md | 2 +- .../file_garbage_remover_tomcat.py | 177 ++++++++++++++---- 2 files changed, 138 insertions(+), 41 deletions(-) diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md index a71416cf..886a8f5c 100644 --- a/DHIS2/file_garbage_remover/Readme.md +++ b/DHIS2/file_garbage_remover/Readme.md @@ -9,7 +9,7 @@ Included Scripts ### Description: -Designed for production environments. Identifies orphaned file resources (only documents for now), moves the files to a temporary directory, and archives corresponding database entries into a special table(fileresourcesaudit) before deleting them from the original database table. +Designed for production environments. Identifies orphaned file resources (documents or datavalues (files or images attached to a value)), moves the files to a temporary directory, and archives corresponding database entries into a special table(fileresourcesaudit) before deleting them from the original database table. ### Usage: diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index b813c8d0..97660547 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -1,15 +1,15 @@ #!/usr/bin/env python3 -import re - -import psycopg2 -import shutil -import os import argparse import json +import os +import re +import shutil import sys from datetime import datetime -SQL_FIND_ORPHANS = """ +import psycopg2 + +SQL_FIND_ORPHANS_DOCUMENTS = """ SELECT fileresourceid, storagekey FROM fileresource fr WHERE NOT EXISTS ( @@ -21,6 +21,11 @@ AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; """ +SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ + SELECT fileresourceid, uid, storagekey + FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; +""" + SQL_INSERT_AUDIT = """ INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; """ @@ -33,6 +38,25 @@ CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; """ +SQL_EVENT_FILE_UIDS = """ + SELECT eventdatavalues + FROM event + WHERE programstageid + IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid + IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; +""" + +SQL_DATA_VALUE_UIDS = """ + SELECT dv.value + FROM datavalue dv + WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') +""" + +SQL_TRACKER_ATTRIBUTE_UIDS = """ +select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); +""" + + def log(message): timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") if isinstance(message, str): @@ -44,13 +68,45 @@ def log(message): ) print(f"[{timestamp}] {message}") + def load_config(path): with open(path, "r") as f: return json.load(f) -def get_matching_document_files(storage_key, base_dir): - storage_key = storage_key.replace("document/", "") - base_dir = os.path.join(base_dir, "document") + +def get_events(cursor): + cursor.execute(SQL_EVENT_FILE_UIDS) + return [row[0] for row in cursor.fetchall() if row[0]] + + +def get_event_blob(cursor): + event_texts = get_events(cursor) + return "\n".join(json.dumps(e) for e in event_texts) + + +def 
get_data_value_file_resources(cursor): + cursor.execute(""" + SELECT fileresourceid, uid, storagekey + FROM fileresource + WHERE domain = 'DATA_VALUE' + """) + return cursor.fetchall() + + +def get_all_datavalue_uids(cursor): + cursor.execute(SQL_DATA_VALUE_UIDS) + return set(row[0] for row in cursor.fetchall() if row[0]) + + +def get_all_tracker_uids(cursor): + cursor.execute(SQL_TRACKER_ATTRIBUTE_UIDS) + return set(row[0] for row in cursor.fetchall() if row[0]) + + +def get_matching_files(storage_key, base_dir, folder): + # Folder (document or dataValue) + storage_key = storage_key.replace(folder + "/", "") + base_dir = os.path.join(base_dir, folder) matching = [] for filename in os.listdir(base_dir): @@ -60,6 +116,7 @@ def get_matching_document_files(storage_key, base_dir): matching.append(fullpath) return matching + def update_db(fileresourceid, cursor, conn, dry_run=False): if dry_run: log("[DRY RUN] Would execute:") @@ -78,9 +135,10 @@ def update_db(fileresourceid, cursor, conn, dry_run=False): conn.rollback() raise -def move_files(file_list, file_base_path, temp_file_path, dry_run=False): - base_dir = os.path.join(file_base_path, "document") - dest_dir = os.path.join(temp_file_path, "document") + +def move_files(file_list, file_base_path, temp_file_path, folder, dry_run): + base_dir = os.path.join(file_base_path, folder) + dest_dir = os.path.join(temp_file_path, folder) for src in file_list: rel_path = os.path.relpath(src, base_dir) log(rel_path) @@ -103,8 +161,6 @@ def move_files(file_list, file_base_path, temp_file_path, dry_run=False): raise - - def ensure_audit_table_exists(cursor, dry_run=False): if dry_run: log("[DRY RUN] Would execute:") @@ -114,6 +170,7 @@ def ensure_audit_table_exists(cursor, dry_run=False): cursor.execute(SQL_CREATE_TABLE_IF_NOT_EXIST) log("'fileresourcesaudit' table ready.") + def main(): parser = argparse.ArgumentParser(description="Move orphaned DHIS2 file resources and archive DB entries.") group = parser.add_mutually_exclusive_group(required=True) @@ -154,32 +211,72 @@ def main(): with psycopg2.connect(dsn=db_url) as conn: with conn.cursor() as cur: ensure_audit_table_exists(cur, dry_run) - log("Querying orphaned fileresource entries...") - cur.execute(SQL_FIND_ORPHANS) - rows = cur.fetchall() - - if not rows: - log("✅ No orphaned fileresource entries found.") - return - - log(f"Found {len(rows)} orphaned entries.") - count = 0 - for fileresourceid, storagekey in rows: - if not fileresourceid: - log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") - continue - - try: - matches = get_matching_document_files(storagekey, file_base_path) - if not matches: - raise Exception(f"No files found for storage_key: {storagekey}") - move_files(matches, file_base_path, temp_file_path, dry_run) - update_db(fileresourceid, cur, conn, dry_run) - except Exception as e: - log(f"❌ Aborting due to error with fileresourceid {fileresourceid}: {e}") - sys.exit(1) - count = count + 1 - log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") + remove_documents(file_base_path, temp_file_path, dry_run, cur, conn) + remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn) + + +def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn): + log("Querying orphaned fileresource entries...") + cur.execute(SQL_FIND_ORPHANS_DOCUMENTS) + rows = cur.fetchall() + + if not rows: + log("✅ No orphaned fileresource entries found.") + return + + log(f"Found {len(rows)} \"document\" orphaned entries.") + 
process_orphan_files(rows, file_base_path, temp_file_path, dry_run, cur, conn, "document") + + +def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn): + log("Querying orphaned fileresource entries...") + cur.execute(SQL_FIND_DATA_VALUES_FILE_RESOURCES) + rows = cur.fetchall() + + # get all file resource datavalues + datavalue_uids = get_all_datavalue_uids(cur) + # get all events with file dataelement values + event_blob = get_event_blob(cur) + + tracker_uids = get_all_tracker_uids(cur) + # 3. Filter out referenced file resources + orphan_data_values = [] + for fileresourceid, uid, storagekey in rows: + if uid in datavalue_uids: + continue # Referenced in datavalue + if uid in event_blob: + continue # Referenced in event + if uid in tracker_uids: + continue # referenced in a trackerentityattribute + orphan_data_values.append((fileresourceid, storagekey)) + + if not orphan_data_values: + log("✅ No fileResources entries found.") + return + + log(f"Found {len(orphan_data_values)} \"data_value\" orphaned entries.") + process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, "dataValue") + + +def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, folder): + count = 0 + for fileresourceid, storagekey in orphan_data_values: + if not fileresourceid: + log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") + continue + try: + matches = get_matching_files(storagekey, file_base_path, "dataValue") + if not matches: + log(f"No files found for storage_key: {storagekey}") + continue + move_files(matches, file_base_path, temp_file_path, folder, dry_run) + update_db(fileresourceid, cur, conn, dry_run) + except Exception as e: + log(f"❌ Aborting due to error with fileresourceid {fileresourceid}: {e}") + sys.exit(1) + count = count + 1 + log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") + if __name__ == "__main__": main() From 274d5d58e9d5b7c0b0b537169bb06e04224f63df Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 24 Nov 2025 18:52:07 +0100 Subject: [PATCH 13/23] update to add notifications and refactor --- DHIS2/file_garbage_remover/cleaner.py | 301 ++++++++++++++++++ DHIS2/file_garbage_remover/csv_utils.py | 85 +++++ .../file_garbage_remover_tomcat.py | 120 +++++-- DHIS2/file_garbage_remover/main.py | 71 +++++ DHIS2/file_garbage_remover/notifier.py | 68 ++++ 5 files changed, 625 insertions(+), 20 deletions(-) create mode 100644 DHIS2/file_garbage_remover/cleaner.py create mode 100644 DHIS2/file_garbage_remover/csv_utils.py create mode 100644 DHIS2/file_garbage_remover/main.py create mode 100644 DHIS2/file_garbage_remover/notifier.py diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py new file mode 100644 index 00000000..2e7ccd37 --- /dev/null +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -0,0 +1,301 @@ +#!/usr/bin/env python3 +import json +import os +import re +import shutil +import sys +from datetime import datetime + +import subprocess +import psycopg2 + +from csv_utils import append_items, deduplicate_items + +SQL_FIND_ORPHANS_DOCUMENTS = """ + SELECT fileresourceid, storagekey, name, created + FROM fileresource fr + WHERE NOT EXISTS ( + SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid + ) + AND fr.uid NOT IN ( + SELECT url FROM document + ) + AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; +""" + +SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ + SELECT fileresourceid, uid, 
storagekey, name, created + FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; +""" + +SQL_INSERT_AUDIT = """ + INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +SQL_DELETE_ORIGINAL = """ + DELETE FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +SQL_CREATE_TABLE_IF_NOT_EXIST = """ + CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; +""" + +SQL_EVENT_FILE_UIDS = """ + SELECT eventdatavalues + FROM event + WHERE programstageid + IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid + IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; +""" + +SQL_DATA_VALUE_UIDS = """ + SELECT dv.value + FROM datavalue dv + WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') +""" + +SQL_TRACKER_ATTRIBUTE_UIDS = """ +select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); +""" + + +def log(message): + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if isinstance(message, str): + message = re.sub( + r"(postgresql://[^:]+:)([^@]+)(@)", + r"\1*****\3", + message + ) + print(f"[{timestamp}] {message}") + + +def load_config(path): + with open(path, "r") as f: + return json.load(f) + + +def get_events(cursor): + cursor.execute(SQL_EVENT_FILE_UIDS) + return [row[0] for row in cursor.fetchall() if row[0]] + + +def get_event_blob(cursor): + event_texts = get_events(cursor) + return "\n".join(json.dumps(e) for e in event_texts) + + +def get_all_datavalue_uids(cursor): + cursor.execute(SQL_DATA_VALUE_UIDS) + return set(row[0] for row in cursor.fetchall() if row[0]) + + +def get_all_tracker_uids(cursor): + cursor.execute(SQL_TRACKER_ATTRIBUTE_UIDS) + return set(row[0] for row in cursor.fetchall() if row[0]) + + +def get_matching_files(storage_key, base_dir, folder): + storage_key = storage_key.replace(folder + "/", "") + base_dir = os.path.join(base_dir, folder) + if not os.path.isdir(base_dir): + log(f"⚠️ Skipping search: base directory does not exist: {base_dir}") + return [] + matching = [] + for filename in os.listdir(base_dir): + if filename.startswith(storage_key): + fullpath = os.path.join(base_dir, filename) + if os.path.isfile(fullpath): + matching.append(fullpath) + return matching + + +def update_db(fileresourceid, cursor, conn, dry_run=False): + if dry_run: + log("[DRY RUN] Would execute:") + log(f" {SQL_INSERT_AUDIT.strip()}") + log(f" with parameters: {{'fid': '{fileresourceid}'}}") + log(f" {SQL_DELETE_ORIGINAL.strip()} ") + log(f" with parameters: {{'fid': '{fileresourceid}'}}") + else: + try: + cursor.execute(SQL_INSERT_AUDIT, {"fid": fileresourceid}) + cursor.execute(SQL_DELETE_ORIGINAL, {"fid": fileresourceid}) + conn.commit() + log(f"Archived and deleted fileresourceid {fileresourceid}") + except Exception as e: + log(f"❌ Failed DB update for fileresourceid {fileresourceid}: {e}") + conn.rollback() + raise + + +def move_files(file_list, file_base_path, temp_file_path, folder, dry_run): + base_dir = os.path.join(file_base_path, folder) + dest_dir = os.path.join(temp_file_path, folder) + for src in file_list: + rel_path = os.path.relpath(src, base_dir) + dst = os.path.join(dest_dir, rel_path) + dst_dir = os.path.dirname(dst) + os.makedirs(dst_dir, exist_ok=True) + log(f"{'[DRY RUN] ' if dry_run else ''}Moving {src} -> {dst}") + if not dry_run: + shutil.move(src, dst) + + +def 
ensure_audit_table_exists(cursor, dry_run=False): + if dry_run: + log("[DRY RUN] Would execute:") + log(SQL_CREATE_TABLE_IF_NOT_EXIST) + else: + log("Ensuring 'fileresourcesaudit' table exists...") + cursor.execute(SQL_CREATE_TABLE_IF_NOT_EXIST) + log("'fileresourcesaudit' table ready.") + + +def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn, summary): + log("Querying orphaned fileresource entries (documents)...") + cur.execute(SQL_FIND_ORPHANS_DOCUMENTS) + rows = cur.fetchall() + if not rows: + log("✅ No orphaned fileresource entries found (documents).") + return + log(f"Found {len(rows)} \"document\" orphaned entries.") + process_orphan_files(rows, file_base_path, temp_file_path, dry_run, cur, conn, "document", summary) + + +def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary): + log("Querying orphaned fileresource entries (data values)...") + cur.execute(SQL_FIND_DATA_VALUES_FILE_RESOURCES) + rows = cur.fetchall() + + datavalue_uids = get_all_datavalue_uids(cur) + event_blob = get_event_blob(cur) + tracker_uids = get_all_tracker_uids(cur) + + orphan_data_values = [] + for fileresourceid, uid, storagekey, name, created in rows: + if uid in datavalue_uids: + continue # Referenced in datavalue + if uid in event_blob: + continue # Referenced in event + if uid in tracker_uids: + continue # Referenced in a tracker attribute + orphan_data_values.append((fileresourceid, storagekey, name, created)) + + if not orphan_data_values: + log("✅ No fileResources entries found (data values).") + return + + log(f"Found {len(orphan_data_values)} \"data_value\" orphaned entries.") + process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, "dataValue", summary) + + +def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, folder, summary): + count = 0 + for entry in orphan_data_values: + fileresourceid, storagekey = entry[0], entry[1] + name = entry[2] if len(entry) > 2 else "" + created = entry[3] if len(entry) > 3 else None + if not fileresourceid: + log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") + continue + matches = get_matching_files(storagekey, file_base_path, folder) + if not matches: + log(f"No files found for storage_key: {storagekey} (folder={folder})") + else: + move_files(matches, file_base_path, temp_file_path, folder, dry_run) + update_db(fileresourceid, cur, conn, dry_run) + rel_matches = [os.path.join(folder, os.path.relpath(m, os.path.join(file_base_path, folder))) for m in matches] + summary["items"].append({ + "id": fileresourceid, + "name": name, + "storagekey": storagekey, + "folder": folder, + "files": rel_matches, + "created": created.isoformat() if hasattr(created, "isoformat") else created, + "detection_date": datetime.utcnow().isoformat(), + "action": "would_move_and_delete" if dry_run else "moved_and_deleted", + "notified": False + }) + count += 1 + log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") + + +def emit_summary(summary): + mode = summary.get("mode", "TEST") + items = summary.get("items", []) + log(f"Summary ({mode}): {len(items)} fileresource entries processed.") + for item in items: + files = item.get("files") or [""] + log(f" - id={item.get('id')} name=\"{item.get('name')}\" storagekey={item.get('storagekey')} action={item.get('action')}") + for f in files: + log(f" file: {f}") + + +def run_cleanup(args): + if args.mode == "docker": + run_docker_cleanup(args) + return + + 
config = load_config(args.config) + required_keys = ["db_host", "db_port", "db_name", "db_user", "file_base_path", "temp_file_path"] + missing_keys = [k for k in required_keys if not config.get(k)] + if missing_keys: + log(f"❌ Missing required config keys: {', '.join(missing_keys)}") + sys.exit(1) + + db_password = os.environ.get("DB_PASSWORD_FG") + if not db_password: + log("❌ Missing required environment variable: DB_PASSWORD_FG") + sys.exit(1) + + db_url = f"postgresql://{config['db_user']}:{db_password}@{config['db_host']}:{config['db_port']}/{config['db_name']}" + file_base_path = config["file_base_path"] + temp_file_path = config["temp_file_path"] + + if not os.path.isdir(file_base_path) or not os.path.isdir(temp_file_path): + log("❌ Both 'file_base_path' and 'temp_file_path' must exist and be directories.") + log(f" file_base_path: {file_base_path} -> {'OK' if os.path.isdir(file_base_path) else 'INVALID'}") + log(f" temp_file_path: {temp_file_path} -> {'OK' if os.path.isdir(temp_file_path) else 'INVALID'}") + sys.exit(1) + + if args.force and args.test: + log("❌ Choose either --force or --test, not both.") + sys.exit(1) + + dry_run = not args.force + log(f"{'Running in TEST mode' if dry_run else 'Running in FORCE mode'}") + log(f"Connecting to database: {db_url}") + log(f"File base path: {file_base_path}") + log(f"Temporary move path: {temp_file_path}") + + summary = {"mode": "TEST" if dry_run else "FORCE", "items": []} + + with psycopg2.connect(dsn=db_url) as conn: + with conn.cursor() as cur: + ensure_audit_table_exists(cur, dry_run) + remove_documents(file_base_path, temp_file_path, dry_run, cur, conn, summary) + remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary) + + if args.csv_path: + overwrite_csv = bool(args.force) and not args.maintain_csv + items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) + append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) + emit_summary(summary) + + +def run_docker_cleanup(args): + if not args.docker_instance: + log("❌ --docker-instance is required when --mode=docker") + sys.exit(1) + script_path = os.path.join(os.path.dirname(__file__), "file_garbage_remover_docker.py") + if not os.path.isfile(script_path): + log(f"❌ Docker cleanup script not found at {script_path}") + sys.exit(1) + cmd = [sys.executable, script_path, "--instance", args.docker_instance] + log(f"Running docker cleanup via: {' '.join(cmd)}") + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as e: + log(f"❌ Docker cleanup failed: {e}") + sys.exit(e.returncode) diff --git a/DHIS2/file_garbage_remover/csv_utils.py b/DHIS2/file_garbage_remover/csv_utils.py new file mode 100644 index 00000000..12b1cd8b --- /dev/null +++ b/DHIS2/file_garbage_remover/csv_utils.py @@ -0,0 +1,85 @@ +import csv +import os + +DEFAULT_FIELDNAMES = ["id", "name", "created", "detection_date", "storagekey", "folder", "action", "files", "notified"] + + +def serialize_item(item): + return { + "id": item.get("id"), + "name": item.get("name"), + "created": item.get("created"), + "detection_date": item.get("detection_date"), + "storagekey": item.get("storagekey"), + "folder": item.get("folder"), + "action": item.get("action"), + "files": "|".join(item.get("files") or []), + "notified": str(item.get("notified", False)).lower(), + } + + +def append_items(csv_path, items, overwrite=False, log=None): + """ + Append serialized items to CSV. If overwrite=True, start fresh. 
+ """ + mode = "w" if overwrite else "a" + file_exists = os.path.isfile(csv_path) + want_header = overwrite or not file_exists + try: + with open(csv_path, mode, newline="") as f: + writer = csv.DictWriter(f, fieldnames=DEFAULT_FIELDNAMES) + if want_header: + writer.writeheader() + for item in items: + writer.writerow(serialize_item(item)) + if log: + log(f"CSV summary {'overwritten' if overwrite else 'appended'} at {csv_path}") + except Exception as e: + if log: + log(f"❌ Failed to write CSV {csv_path}: {e}") + else: + raise + + +def read_rows(csv_path): + with open(csv_path, newline="") as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames or DEFAULT_FIELDNAMES + rows = list(reader) + return rows, fieldnames + + +def write_rows(csv_path, rows, fieldnames=None): + fieldnames = fieldnames or DEFAULT_FIELDNAMES + with open(csv_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + +def deduplicate_items(csv_path, new_items, unique_keys=("id",), overwrite=False, log=None): + """ + Returns new_items filtered to avoid duplicates with existing CSV based on unique_keys. + If overwrite=True, no filtering is applied (CSV will be rewritten). + """ + if overwrite or not os.path.isfile(csv_path): + return new_items + + existing_keys = set() + try: + rows, _ = read_rows(csv_path) + for row in rows: + key = tuple((row.get(k) or "").strip() for k in unique_keys) + existing_keys.add(key) + except Exception as e: + if log: + log(f"⚠️ Could not read existing CSV for deduplication: {e}") + + filtered = [] + for item in new_items: + key = tuple((str(item.get(k)) or "").strip() for k in unique_keys) + if key in existing_keys: + continue + existing_keys.add(key) + filtered.append(item) + return filtered diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index 97660547..ced18f32 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import argparse +import csv import json import os import re @@ -10,7 +11,7 @@ import psycopg2 SQL_FIND_ORPHANS_DOCUMENTS = """ - SELECT fileresourceid, storagekey + SELECT fileresourceid, storagekey, name, created FROM fileresource fr WHERE NOT EXISTS ( SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid @@ -22,7 +23,7 @@ """ SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ - SELECT fileresourceid, uid, storagekey + SELECT fileresourceid, uid, storagekey, name, created FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; """ @@ -173,10 +174,11 @@ def ensure_audit_table_exists(cursor, dry_run=False): def main(): parser = argparse.ArgumentParser(description="Move orphaned DHIS2 file resources and archive DB entries.") - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument("--test", action="store_true", help="Run in dry-run mode (no changes).") - group.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") + parser.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") + parser.add_argument("--test", action="store_true", help="Run in dry-run mode (no changes). 
Default if --force not provided.") parser.add_argument("--config", required=True, help="Path to config.json file.") + parser.add_argument("--csv-path", help="Optional CSV file to record processed entries. In --force mode the file is rewritten unless --maintain-csv.") + parser.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") args = parser.parse_args() config = load_config(args.config) @@ -202,20 +204,31 @@ def main(): log(f" temp_file_path: {temp_file_path} -> {'OK' if os.path.isdir(temp_file_path) else 'INVALID'}") sys.exit(1) - dry_run = args.test + if args.force and args.test: + log("❌ Choose either --force or --test, not both.") + sys.exit(1) + + dry_run = not args.force # default to test mode unless --force is provided log(f"{'Running in TEST mode' if dry_run else 'Running in FORCE mode'}") log(f"Connecting to database: {db_url}") log(f"File base path: {file_base_path}") log(f"Temporary move path: {temp_file_path}") + summary = {"mode": "TEST" if dry_run else "FORCE", "items": []} + with psycopg2.connect(dsn=db_url) as conn: with conn.cursor() as cur: ensure_audit_table_exists(cur, dry_run) - remove_documents(file_base_path, temp_file_path, dry_run, cur, conn) - remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn) + remove_documents(file_base_path, temp_file_path, dry_run, cur, conn, summary) + remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary) + if args.csv_path: + overwrite_csv = bool(args.force) and not args.maintain_csv + write_csv(args.csv_path, summary, overwrite=overwrite_csv) + emit_summary(summary) -def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn): + +def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn, summary): log("Querying orphaned fileresource entries...") cur.execute(SQL_FIND_ORPHANS_DOCUMENTS) rows = cur.fetchall() @@ -225,10 +238,10 @@ def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn): return log(f"Found {len(rows)} \"document\" orphaned entries.") - process_orphan_files(rows, file_base_path, temp_file_path, dry_run, cur, conn, "document") + process_orphan_files(rows, file_base_path, temp_file_path, dry_run, cur, conn, "document", summary) -def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn): +def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary): log("Querying orphaned fileresource entries...") cur.execute(SQL_FIND_DATA_VALUES_FILE_RESOURCES) rows = cur.fetchall() @@ -241,36 +254,52 @@ def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn): tracker_uids = get_all_tracker_uids(cur) # 3. 
Filter out referenced file resources orphan_data_values = [] - for fileresourceid, uid, storagekey in rows: + for fileresourceid, uid, storagekey, name, created in rows: if uid in datavalue_uids: continue # Referenced in datavalue if uid in event_blob: continue # Referenced in event if uid in tracker_uids: continue # referenced in a trackerentityattribute - orphan_data_values.append((fileresourceid, storagekey)) + orphan_data_values.append((fileresourceid, storagekey, name, created)) if not orphan_data_values: log("✅ No fileResources entries found.") return log(f"Found {len(orphan_data_values)} \"data_value\" orphaned entries.") - process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, "dataValue") + process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, "dataValue", summary) -def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, folder): +def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, folder, summary): count = 0 - for fileresourceid, storagekey in orphan_data_values: + for entry in orphan_data_values: + # entries may come as (id, storagekey, name) + fileresourceid, storagekey = entry[0], entry[1] + name = entry[2] if len(entry) > 2 else "" + created = entry[3] if len(entry) > 3 else None if not fileresourceid: log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") continue try: - matches = get_matching_files(storagekey, file_base_path, "dataValue") + matches = get_matching_files(storagekey, file_base_path, folder) if not matches: - log(f"No files found for storage_key: {storagekey}") - continue - move_files(matches, file_base_path, temp_file_path, folder, dry_run) + log(f"No files found for storage_key: {storagekey} (folder={folder})") + else: + move_files(matches, file_base_path, temp_file_path, folder, dry_run) + # Always update the db also if the files not existed on disk update_db(fileresourceid, cur, conn, dry_run) + rel_matches = [os.path.join(folder, os.path.relpath(m, os.path.join(file_base_path, folder))) for m in matches] + summary["items"].append({ + "id": fileresourceid, + "name": name, + "storagekey": storagekey, + "folder": folder, + "files": rel_matches, + "created": created.isoformat() if hasattr(created, "isoformat") else created, + "action": "would_move_and_delete" if dry_run else "moved_and_deleted", + "notified": False + }) except Exception as e: log(f"❌ Aborting due to error with fileresourceid {fileresourceid}: {e}") sys.exit(1) @@ -278,5 +307,56 @@ def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") +def emit_summary(summary): + mode = summary.get("mode", "TEST") + items = summary.get("items", []) + log(f"Summary ({mode}): {len(items)} fileresource entries processed.") + for item in items: + files = item.get("files") or [""] + log(f" - id={item.get('id')} name=\"{item.get('name')}\" storagekey={item.get('storagekey')} action={item.get('action')}") + for f in files: + log(f" file: {f}") + + +def write_csv(csv_path, summary, overwrite=False): + if not csv_path: + return + file_exists = os.path.isfile(csv_path) + mode = "w" if overwrite else "a" + want_header = overwrite or not file_exists + fieldnames = ["id", "name", "created", "storagekey", "folder", "action", "files", "notified"] + existing_ids = set() + if not overwrite and file_exists: + try: + with 
open(csv_path, "r", newline="") as f: + reader = csv.DictReader(f) + for row in reader: + if "id" in row and row["id"]: + existing_ids.add(str(row["id"])) + except Exception as e: + log(f"⚠️ Could not read existing CSV for deduplication: {e}") + try: + with open(csv_path, mode, newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + if want_header: + writer.writeheader() + for item in summary.get("items", []): + if str(item.get("id")) in existing_ids: + continue + writer.writerow({ + "id": item.get("id"), + "name": item.get("name"), + "created": item.get("created"), + "storagekey": item.get("storagekey"), + "folder": item.get("folder"), + "action": item.get("action"), + "files": "|".join(item.get("files") or []), + "notified": str(item.get("notified", False)).lower(), + }) + log(f"CSV summary {'overwritten' if overwrite else 'appended'} at {csv_path}") + except Exception as e: + log(f"❌ Failed to write CSV {csv_path}: {e}") + + if __name__ == "__main__": main() diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py new file mode 100644 index 00000000..f44e2233 --- /dev/null +++ b/DHIS2/file_garbage_remover/main.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +import argparse +import sys + +import cleaner +from notifier import mark_notified, pending_from_csv, send_notification + + +def build_parser(): + parser = argparse.ArgumentParser(description="DHIS2 file garbage remover and notifier.") + subparsers = parser.add_subparsers(dest="command", required=True) + + cleanup = subparsers.add_parser("cleanup", help="Move/cleanup orphaned file resources and archive DB entries.") + cleanup.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") + cleanup.add_argument("--test", action="store_true", help="Run in dry-run mode (default unless --force).") + cleanup.add_argument("--config", required=True, help="Path to config.json file.") + cleanup.add_argument("--csv-path", help="Optional CSV file to record processed entries.") + cleanup.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") + cleanup.add_argument("--mode", choices=["tomcat", "docker"], default="tomcat", help="Cleanup mode: tomcat (default) or docker.") + cleanup.add_argument("--docker-instance", help="d2-docker instance name (required for --mode=docker).") + + notify = subparsers.add_parser("notify", help="Print recent, non-notified filenames from CSV and mark them notified.") + notify.add_argument("--csv-path", required=True, help="Path to the CSV generated by cleanup.") + notify.add_argument("--webhook-url", help="Webhook URL to send the notification.") + notify.add_argument("--title", help="Notification title (required if --webhook-url).") + notify.add_argument("--http-proxy", help="HTTP proxy (optional).") + notify.add_argument("--https-proxy", help="HTTPS proxy (optional).") + + return parser + + +def main(): + parser = build_parser() + args = parser.parse_args() + + if args.command == "cleanup": + cleaner.run_cleanup(args) + return + + if args.command == "notify": + names, ids, fieldnames, rows = pending_from_csv(args.csv_path) + if not names: + sys.exit(1) + content = "\n".join(names) + if args.webhook_url: + if not args.title: + print("❌ --title is required when using --webhook-url", file=sys.stderr) + sys.exit(1) + try: + send_notification( + webhook_url=args.webhook_url, + title=args.title, + content=content, + http_proxy=args.http_proxy, + https_proxy=args.https_proxy, + ) + except 
Exception as e: + print(f"❌ Failed to send notification: {e}", file=sys.stderr) + sys.exit(1) + else: + for name in names: + print(name) + mark_notified(args.csv_path, rows, fieldnames, ids) + sys.exit(0) + + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/DHIS2/file_garbage_remover/notifier.py b/DHIS2/file_garbage_remover/notifier.py new file mode 100644 index 00000000..870f0f52 --- /dev/null +++ b/DHIS2/file_garbage_remover/notifier.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +import os +import sys +import requests + +from csv_utils import read_rows, write_rows + + +def pending_from_csv(csv_path): + rows, fieldnames = read_rows(csv_path) + + notify_ids = set() + names_to_print = [] + for row in rows: + notified = str(row.get("notified", "")).lower() == "true" + if notified: + continue + name = row.get("name", "") + if name: + names_to_print.append(name) + notify_ids.add(str(row.get("id"))) + + return names_to_print, notify_ids, fieldnames, rows + + +def mark_notified(csv_path, rows, fieldnames, notify_ids): + if not notify_ids: + return + for row in rows: + if str(row.get("id")) in notify_ids: + row["notified"] = "true" + write_rows(csv_path, rows, fieldnames=fieldnames) + + +def send_notification(webhook_url, title, content, http_proxy=None, https_proxy=None): + proxies = { + "http": http_proxy or os.getenv("http_proxy", ""), + "https": https_proxy or os.getenv("https_proxy", "") + } + payload = {"text": f"**{title}**\n{content}"} + response = requests.post( + webhook_url, + json=payload, + headers={"Content-Type": "application/json"}, + proxies=proxies, + verify=True, + timeout=10, + ) + response.raise_for_status() + return response.status_code + + +def main(): + if len(sys.argv) < 2: + sys.exit(1) + csv_path = sys.argv[1] + days = int(sys.argv[2]) if len(sys.argv) > 2 else None + names, ids, fieldnames, rows = pending_from_csv(csv_path, days) + if not names: + sys.exit(1) + for name in names: + print(name) + mark_notified(csv_path, rows, fieldnames, ids) + sys.exit(0) + + +if __name__ == "__main__": + main() From 1b712281dc5bd939baf17d7a47db2dbb7e13fe05 Mon Sep 17 00:00:00 2001 From: idelcano Date: Wed, 26 Nov 2025 11:29:37 +0100 Subject: [PATCH 14/23] improve script fixing error, adding notify, improve workflow and fix docker remover --- DHIS2/file_garbage_remover/Readme.md | 32 ++- DHIS2/file_garbage_remover/cleaner.py | 6 + .../file_garbage_remover_docker.py | 259 +++++++++++++----- DHIS2/file_garbage_remover/main.py | 137 ++++++--- 4 files changed, 325 insertions(+), 109 deletions(-) diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md index 886a8f5c..0c2c627d 100644 --- a/DHIS2/file_garbage_remover/Readme.md +++ b/DHIS2/file_garbage_remover/Readme.md @@ -5,6 +5,36 @@ These scripts identify files with no database references and take appropriate ac Included Scripts +## main.py (recommended entry point) + +Run cleanup (tomcat or docker) and optionally send a notification in one command. Dry-run is the default unless you add `--force`. + +### Example (docker cleanup + notify using webhook from config.json) +``` +python3 main.py \ + --mode docker \ + --docker-instance docker.eyeseetea.com/widpit/dhis2-data:2.41-widp-dev-test \ + --csv-path /tmp/fg_docker.csv \ + --config /path/to/config.json \ + --notify-title "Orphans widp-dev-test" +``` +- Add `--force` to delete/move for real (tomcat) or delete in container (docker). +- Add `--notify-test` to print the payload instead of sending. 
+- To notify only (no cleanup): `python3 main.py --notify-only --csv-path /tmp/fg.csv --config /path/to/config.json --notify-title "..." [--notify-test]` + +config.json needs the cleanup settings plus the webhook (if you want notifications): +``` +{ + "db_host": "", + "db_port": "", + "db_name": "", + "db_user": "", + "file_base_path": "", + "temp_file_path": "", + "webhook-url": "https://your.webhook.url" +} +``` + ## file_garbage_remover_tomcat.py ### Description: @@ -85,4 +115,4 @@ Deletes identified files directly inside the container. Production Environments: Always use --test mode before using --force to verify intended changes. -Docker/Test Environments: Files deleted by file_garbage_remover_docker.py cannot be recovered. \ No newline at end of file +Docker/Test Environments: Files deleted by file_garbage_remover_docker.py cannot be recovered. diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index 2e7ccd37..eeb86436 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -293,6 +293,12 @@ def run_docker_cleanup(args): log(f"❌ Docker cleanup script not found at {script_path}") sys.exit(1) cmd = [sys.executable, script_path, "--instance", args.docker_instance] + if args.csv_path: + cmd.extend(["--csv-path", args.csv_path]) + if args.maintain_csv: + cmd.append("--maintain-csv") + if args.force: + cmd.append("--force") log(f"Running docker cleanup via: {' '.join(cmd)}") try: subprocess.run(cmd, check=True) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index ff8e1fea..3075e232 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -1,18 +1,16 @@ #!/usr/bin/env python3 +import argparse +import os import subprocess import sys -import argparse - +import tempfile +from datetime import datetime -LOCAL_SQL_FILE = "/tmp/find_orphan_files.sql" -LOCAL_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" -REMOTE_LIST_FILE = "/tmp/list_of_files_to_be_removed.txt" -FILE_BASE_PATH = "/DHIS2_home/files" +from csv_utils import append_items, deduplicate_items - -SQL_QUERY = """ - SELECT storagekey +SQL_FIND_ORPHANS_DOCUMENTS = """ + SELECT fileresourceid, storagekey, name, created FROM fileresource fr WHERE NOT EXISTS ( SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid @@ -20,79 +18,206 @@ AND fr.uid NOT IN ( SELECT url FROM document ) - AND fr.domain = 'DOCUMENT'; + AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; """ +SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ + SELECT fileresourceid, uid, storagekey, name, created + FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; +""" -def run(command, check=True, capture=False): - """Run a shell command.""" - print(f"$ {' '.join(command)}") - return subprocess.run(command, check=check, capture_output=capture, text=True) - - -def find_container_id(name_pattern): - """Find the container ID based on a name substring. 
Retrieves first match only""" - result = run(["docker", "ps", "--format", "{{.ID}}", "-f", f"name={name_pattern}"], capture=True) - lines = result.stdout.strip().splitlines() - return lines[0] if lines else None - - -def slugify(instance_name): - """Convert instance name to container name slug.""" - return "d2-docker-" + instance_name.replace("/", "-").replace(":", "-").replace(".", "-") - +SQL_EVENT_FILE_UIDS = """ + SELECT eventdatavalues + FROM event + WHERE programstageid + IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid + IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; +""" -def main(): - parser = argparse.ArgumentParser(description="Remove orphaned DHIS2 file resources from Core container.") - parser.add_argument("-i", "--instance", required=True, help="d2-docker instance name (e.g. /usr/bin/python3 file_garbage_remover.py -i docker.eyeseetea.com/widpit/dhis2-data:2.42-widp-preprod-cont-indiv)") - args = parser.parse_args() +SQL_DATA_VALUE_UIDS = """ + SELECT dv.value + FROM datavalue dv + WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') +""" - instance_name = args.instance - container_slug = slugify(instance_name) - core_container_match = container_slug.replace("dhis2-data-", "") + "-core-1" +SQL_TRACKER_ATTRIBUTE_UIDS = """ +select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); +""" - print("Writing SQL file...") - with open(LOCAL_SQL_FILE, "w") as f: - f.write(SQL_QUERY) +SQL_INSERT_AUDIT = """ + INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = {fid}; +""" - print("Running SQL with d2-docker and capturing output...") - result = run(["d2-docker", "run-sql", "-i", instance_name, LOCAL_SQL_FILE], capture=True) +SQL_DELETE_ORIGINAL = """ + DELETE FROM fileresource WHERE fileresourceid = {fid}; +""" - print("Saving result to file...") - with open(LOCAL_LIST_FILE, "w") as f: - for line in result.stdout.strip().splitlines(): - line = line.strip() - if line and not line.lower().startswith("storagekey"): - f.write(f"{line}\n") +SQL_CREATE_TABLE_IF_NOT_EXIST = """ + CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; +""" - print("Identifying containers...") - core_container = find_container_id(core_container_match) - print(core_container) - print(f"Core container: {core_container}") +def run(cmd, capture=False): + kwargs = {"check": True} + if capture: + kwargs["stdout"] = subprocess.PIPE + kwargs["stderr"] = subprocess.PIPE + kwargs["text"] = True + return subprocess.run(cmd, **kwargs) - print("Copying file list to Core container...") - run(["docker", "cp", LOCAL_LIST_FILE, f"{core_container}:{REMOTE_LIST_FILE}"]) - print(f"Deleting orphaned files in Core container... {core_container}") - delete_cmd = f""" - bash -c ' - if [ ! 
-f "{REMOTE_LIST_FILE}" ]; then - echo "File list not found: {REMOTE_LIST_FILE}" - exit 1 - fi +def find_container_id(instance): + slug = "d2-docker-" + instance.replace("/", "-").replace(":", "-").replace(".", "-") + core = slug.replace("dhis2-data-", "") + "-core-1" + result = run(["docker", "ps", "--format", "{{.ID}}", "-f", f"name={core}"], capture=True) + lines = result.stdout.strip().splitlines() + if not lines: + return None + return lines[0] + + +def run_sql(instance, sql): + with tempfile.NamedTemporaryFile("w", delete=False) as f: + f.write(sql) + tmp_path = f.name + try: + result = run(["d2-docker", "run-sql", "-i", instance, tmp_path], capture=True) + return result.stdout + finally: + try: + os.remove(tmp_path) + except OSError: + pass + + +def parse_tab_rows(output, expected_cols): + rows = [] + for line in output.strip().splitlines(): + if not line or line.lower().startswith("fileresourceid") or line.lower().startswith("value") or line.lower().startswith("eventdatavalues"): + continue + if line.startswith("(") and "row" in line: + continue + parts = [p.strip() for p in line.split("|")] + if len(parts) < expected_cols: + continue + rows.append(parts) + return rows + + +def get_orphan_documents(instance): + out = run_sql(instance, SQL_FIND_ORPHANS_DOCUMENTS) + rows = [] + for fid, storagekey, name, created in parse_tab_rows(out, 4): + rows.append({ + "id": fid, + "name": name, + "storagekey": storagekey, + "folder": "document", + "files": [], + "created": created, + "detection_date": datetime.utcnow().isoformat(), + "action": "would_move_and_delete", + "notified": False, + }) + return rows + + +def get_orphan_datavalues(instance): + data_rows = parse_tab_rows(run_sql(instance, SQL_FIND_DATA_VALUES_FILE_RESOURCES), 5) + datavalue_uids = set(r[0] for r in parse_tab_rows(run_sql(instance, SQL_DATA_VALUE_UIDS), 1)) + tracker_uids = set(r[0] for r in parse_tab_rows(run_sql(instance, SQL_TRACKER_ATTRIBUTE_UIDS), 1)) + event_blob = "\n".join(r[0] for r in parse_tab_rows(run_sql(instance, SQL_EVENT_FILE_UIDS), 1)) + + orphans = [] + for fid, uid, storagekey, name, created in data_rows: + if uid in datavalue_uids: + continue + if uid in event_blob: + continue + if uid in tracker_uids: + continue + orphans.append({ + "id": fid, + "name": name, + "storagekey": storagekey, + "folder": "dataValue", + "files": [], + "created": created, + "detection_date": datetime.utcnow().isoformat(), + "action": "would_move_and_delete", + "notified": False, + }) + return orphans + + +def delete_files(container_id, row, dry_run): + base = "/DHIS2_home/files" + storagekey = row["storagekey"] + folder = row["folder"] + prefix = os.path.join(base, folder, storagekey.replace(f"{folder}/", "")) + cmd = ["docker", "exec", container_id, "bash", "-c", f"rm -v {prefix}*"] + if dry_run: + print(f"[DRY RUN] {' '.join(cmd)}") + else: + try: + run(cmd) + except subprocess.CalledProcessError as e: + print(f"⚠️ File deletion failed (likely missing): {' '.join(cmd)} -> {e}") + # Continue with DB cleanup and CSV logging even if file is already gone. 
+ + +def ensure_audit_table(instance): + run_sql(instance, SQL_CREATE_TABLE_IF_NOT_EXIST) + + +def archive_and_delete(instance, fileresourceid): + fid = int(fileresourceid) + sql = SQL_INSERT_AUDIT.format(fid=fid) + SQL_DELETE_ORIGINAL.format(fid=fid) + run_sql(instance, sql) - while IFS= read -r key; do - fullpath="{FILE_BASE_PATH}/$key" - echo "Deleting files: $fullpath*" - rm -v "$fullpath"* 2>/dev/null || echo "Nothing to delete for $fullpath" - done < "{REMOTE_LIST_FILE}" - ' - """ - run(["docker", "exec", core_container, "bash", "-c", delete_cmd]) +def main(): + parser = argparse.ArgumentParser(description="Find/delete orphan file resources in d2-docker instance and log to CSV.") + parser.add_argument("--instance", required=True, help="d2-docker instance name (e.g. docker.eyeseetea.com/project/dhis2-data:2.41-test)") + parser.add_argument("--csv-path", help="CSV file to log orphan entries.") + parser.add_argument("--force", action="store_true", help="Actually delete files in container.") + parser.add_argument("--maintain-csv", action="store_true", help="Append to CSV in force mode (do not overwrite).") + args = parser.parse_args() - print("Cleanup complete.") + container_id = find_container_id(args.instance) + if not container_id: + print("❌ Could not find core container for instance", file=sys.stderr) + sys.exit(1) + + dry_run = not args.force + print(f"Running in {'DRY-RUN' if dry_run else 'FORCE'} mode against container {container_id}") + + ensure_audit_table(args.instance) + + rows = [] + rows.extend(get_orphan_documents(args.instance)) + rows.extend(get_orphan_datavalues(args.instance)) + print(f"Found {len(rows)} orphan entries") + + for row in rows: + delete_files(container_id, row, dry_run) + if dry_run: + print(f"[DRY RUN] Skipping DB archive/delete for {row['id']} ({row.get('name', '')}) action={row.get('action')}") + continue + try: + archive_and_delete(args.instance, row["id"]) + row["action"] = "moved_and_deleted" + except Exception as e: + print(f"❌ Failed DB archive/delete for {row['id']}: {e}", file=sys.stderr) + if not dry_run: + sys.exit(1) + + if args.csv_path: + overwrite_csv = bool(args.force) and not args.maintain_csv + items = deduplicate_items(args.csv_path, rows, unique_keys=("id", "name"), overwrite=overwrite_csv) + append_items(args.csv_path, items, overwrite=overwrite_csv) + + print(f"Summary: processed {len(rows)} orphan entries") if __name__ == "__main__": diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py index f44e2233..09896b83 100644 --- a/DHIS2/file_garbage_remover/main.py +++ b/DHIS2/file_garbage_remover/main.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import argparse +import json import sys import cleaner @@ -8,63 +9,117 @@ def build_parser(): parser = argparse.ArgumentParser(description="DHIS2 file garbage remover and notifier.") - subparsers = parser.add_subparsers(dest="command", required=True) - - cleanup = subparsers.add_parser("cleanup", help="Move/cleanup orphaned file resources and archive DB entries.") - cleanup.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") - cleanup.add_argument("--test", action="store_true", help="Run in dry-run mode (default unless --force).") - cleanup.add_argument("--config", required=True, help="Path to config.json file.") - cleanup.add_argument("--csv-path", help="Optional CSV file to record processed entries.") - cleanup.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") - 
cleanup.add_argument("--mode", choices=["tomcat", "docker"], default="tomcat", help="Cleanup mode: tomcat (default) or docker.") - cleanup.add_argument("--docker-instance", help="d2-docker instance name (required for --mode=docker).") - - notify = subparsers.add_parser("notify", help="Print recent, non-notified filenames from CSV and mark them notified.") - notify.add_argument("--csv-path", required=True, help="Path to the CSV generated by cleanup.") - notify.add_argument("--webhook-url", help="Webhook URL to send the notification.") - notify.add_argument("--title", help="Notification title (required if --webhook-url).") - notify.add_argument("--http-proxy", help="HTTP proxy (optional).") - notify.add_argument("--https-proxy", help="HTTPS proxy (optional).") + # Cleanup options + parser.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") + parser.add_argument("--test", action="store_true", help="Run in dry-run mode (default unless --force).") + parser.add_argument("--config", help="Path to config.json file. Required unless --notify-only.") + parser.add_argument("--csv-path", help="CSV file to record processed entries or to read notifications from.") + parser.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") + parser.add_argument("--mode", choices=["tomcat", "docker"], default="tomcat", help="Cleanup mode: tomcat (default) or docker.") + parser.add_argument("--docker-instance", help="d2-docker instance name (required for --mode=docker).") + + # Notification options + parser.add_argument("--notify-only", action="store_true", help="Skip cleanup and only send notifications from CSV.") + parser.add_argument("--notify-webhook-url", help="Webhook URL to send the notification. 
If not provided, falls back to config.json key 'webhook-url'.") + parser.add_argument("--notify-title", help="Notification title (required if using --notify-webhook-url).") + parser.add_argument("--notify-http-proxy", help="HTTP proxy (optional).") + parser.add_argument("--notify-https-proxy", help="HTTPS proxy (optional).") + parser.add_argument("--notify-test", action="store_true", help="Dry-run notification: print payload instead of sending.") return parser -def main(): - parser = build_parser() - args = parser.parse_args() +def resolve_webhook(config_path, cli_webhook): + if cli_webhook: + return cli_webhook + if not config_path: + return None + try: + cfg = cleaner.load_config(config_path) + except Exception as e: + print(f"❌ Failed to load config file: {e}", file=sys.stderr) + sys.exit(1) + return cfg.get("webhook-url") or cfg.get("webhook_url") - if args.command == "cleanup": - cleaner.run_cleanup(args) - return - if args.command == "notify": - names, ids, fieldnames, rows = pending_from_csv(args.csv_path) - if not names: +def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, http_proxy=None, https_proxy=None, notify_test=False): + names, ids, fieldnames, rows = pending_from_csv(csv_path) + if not names: + return + content = "\n".join(names) + webhook = resolve_webhook(config_path, webhook_url) + if notify_test: + print(f"[TEST] Would send notification to {webhook or ''}") + print(f"[TEST] Title: {title or ''}") + print(f"[TEST] Content:\n{content}") + if webhook: + if not title: + print("❌ --notify-title is required when using --notify-webhook-url", file=sys.stderr) sys.exit(1) - content = "\n".join(names) - if args.webhook_url: - if not args.title: - print("❌ --title is required when using --webhook-url", file=sys.stderr) - sys.exit(1) + if not notify_test: try: send_notification( - webhook_url=args.webhook_url, - title=args.title, + webhook_url=webhook, + title=title, content=content, - http_proxy=args.http_proxy, - https_proxy=args.https_proxy, + http_proxy=http_proxy, + https_proxy=https_proxy, ) except Exception as e: print(f"❌ Failed to send notification: {e}", file=sys.stderr) sys.exit(1) - else: - for name in names: - print(name) - mark_notified(args.csv_path, rows, fieldnames, ids) + else: + for name in names: + print(name) + mark_notified(csv_path, rows, fieldnames, ids) + + +def main(): + parser = build_parser() + args = parser.parse_args() + + # Notify-only path + if args.notify_only: + if not args.csv_path: + print("❌ --csv-path is required for notify-only mode", file=sys.stderr) + sys.exit(1) + run_notify_flow( + csv_path=args.csv_path, + config_path=args.config, + webhook_url=args.notify_webhook_url, + title=args.notify_title, + http_proxy=args.notify_http_proxy, + https_proxy=args.notify_https_proxy, + notify_test=args.notify_test, + ) sys.exit(0) - parser.print_help() - sys.exit(1) + # Cleanup path (default) + if not args.config: + print("❌ --config is required to run cleanup", file=sys.stderr) + sys.exit(1) + cleaner.run_cleanup(args) + + wants_notify = any([ + args.notify_webhook_url, + args.notify_title, + args.notify_http_proxy, + args.notify_https_proxy, + args.notify_test, + ]) + if wants_notify: + if not args.csv_path: + print("❌ --csv-path is required to send notifications after cleanup", file=sys.stderr) + sys.exit(1) + run_notify_flow( + csv_path=args.csv_path, + config_path=args.config, + webhook_url=args.notify_webhook_url, + title=args.notify_title, + http_proxy=args.notify_http_proxy, + https_proxy=args.notify_https_proxy, + 
notify_test=args.notify_test, + ) if __name__ == "__main__": From 4a4a365b5608af1c00b12141f4d44da83436b638 Mon Sep 17 00:00:00 2001 From: idelcano Date: Wed, 26 Nov 2025 12:26:15 +0100 Subject: [PATCH 15/23] refactor sql --- DHIS2/file_garbage_remover/cleaner.py | 57 ++++-------------- .../file_garbage_remover_docker.py | 59 ++++--------------- DHIS2/file_garbage_remover/sql_queries.py | 46 +++++++++++++++ 3 files changed, 67 insertions(+), 95 deletions(-) create mode 100644 DHIS2/file_garbage_remover/sql_queries.py diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index eeb86436..fb4a16dd 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -10,53 +10,16 @@ import psycopg2 from csv_utils import append_items, deduplicate_items - -SQL_FIND_ORPHANS_DOCUMENTS = """ - SELECT fileresourceid, storagekey, name, created - FROM fileresource fr - WHERE NOT EXISTS ( - SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid - ) - AND fr.uid NOT IN ( - SELECT url FROM document - ) - AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; -""" - -SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ - SELECT fileresourceid, uid, storagekey, name, created - FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; -""" - -SQL_INSERT_AUDIT = """ - INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; -""" - -SQL_DELETE_ORIGINAL = """ - DELETE FROM fileresource WHERE fileresourceid = %(fid)s; -""" - -SQL_CREATE_TABLE_IF_NOT_EXIST = """ - CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; -""" - -SQL_EVENT_FILE_UIDS = """ - SELECT eventdatavalues - FROM event - WHERE programstageid - IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid - IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; -""" - -SQL_DATA_VALUE_UIDS = """ - SELECT dv.value - FROM datavalue dv - WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') -""" - -SQL_TRACKER_ATTRIBUTE_UIDS = """ -select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); -""" +from sql_queries import ( + SQL_CREATE_TABLE_IF_NOT_EXIST, + SQL_DATA_VALUE_UIDS, + SQL_DELETE_ORIGINAL, + SQL_EVENT_FILE_UIDS, + SQL_FIND_DATA_VALUES_FILE_RESOURCES, + SQL_FIND_ORPHANS_DOCUMENTS, + SQL_INSERT_AUDIT, + SQL_TRACKER_ATTRIBUTE_UIDS, +) def log(message): diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index 3075e232..8c4e47b7 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -8,53 +8,16 @@ from datetime import datetime from csv_utils import append_items, deduplicate_items - -SQL_FIND_ORPHANS_DOCUMENTS = """ - SELECT fileresourceid, storagekey, name, created - FROM fileresource fr - WHERE NOT EXISTS ( - SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid - ) - AND fr.uid NOT IN ( - SELECT url FROM document - ) - AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; -""" - -SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ - SELECT fileresourceid, uid, storagekey, name, created - FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; -""" - -SQL_EVENT_FILE_UIDS = """ - SELECT eventdatavalues - FROM event - WHERE 
programstageid - IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid - IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; -""" - -SQL_DATA_VALUE_UIDS = """ - SELECT dv.value - FROM datavalue dv - WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') -""" - -SQL_TRACKER_ATTRIBUTE_UIDS = """ -select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); -""" - -SQL_INSERT_AUDIT = """ - INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = {fid}; -""" - -SQL_DELETE_ORIGINAL = """ - DELETE FROM fileresource WHERE fileresourceid = {fid}; -""" - -SQL_CREATE_TABLE_IF_NOT_EXIST = """ - CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; -""" +from sql_queries import ( + SQL_CREATE_TABLE_IF_NOT_EXIST, + SQL_DATA_VALUE_UIDS, + SQL_DELETE_ORIGINAL, + SQL_EVENT_FILE_UIDS, + SQL_FIND_DATA_VALUES_FILE_RESOURCES, + SQL_FIND_ORPHANS_DOCUMENTS, + SQL_INSERT_AUDIT, + SQL_TRACKER_ATTRIBUTE_UIDS, +) def run(cmd, capture=False): @@ -172,7 +135,7 @@ def ensure_audit_table(instance): def archive_and_delete(instance, fileresourceid): fid = int(fileresourceid) - sql = SQL_INSERT_AUDIT.format(fid=fid) + SQL_DELETE_ORIGINAL.format(fid=fid) + sql = (SQL_INSERT_AUDIT % {"fid": fid}) + (SQL_DELETE_ORIGINAL % {"fid": fid}) run_sql(instance, sql) diff --git a/DHIS2/file_garbage_remover/sql_queries.py b/DHIS2/file_garbage_remover/sql_queries.py new file mode 100644 index 00000000..bb6e82e5 --- /dev/null +++ b/DHIS2/file_garbage_remover/sql_queries.py @@ -0,0 +1,46 @@ +SQL_FIND_ORPHANS_DOCUMENTS = """ + SELECT fileresourceid, storagekey, name, created + FROM fileresource fr + WHERE NOT EXISTS ( + SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid + ) + AND fr.uid NOT IN ( + SELECT url FROM document + ) + AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; +""" + +SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ + SELECT fileresourceid, uid, storagekey, name, created + FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; +""" + +SQL_INSERT_AUDIT = """ + INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +SQL_DELETE_ORIGINAL = """ + DELETE FROM fileresource WHERE fileresourceid = %(fid)s; +""" + +SQL_CREATE_TABLE_IF_NOT_EXIST = """ + CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; +""" + +SQL_EVENT_FILE_UIDS = """ + SELECT eventdatavalues + FROM event + WHERE programstageid + IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid + IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; +""" + +SQL_DATA_VALUE_UIDS = """ + SELECT dv.value + FROM datavalue dv + WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') +""" + +SQL_TRACKER_ATTRIBUTE_UIDS = """ +select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); +""" From a96e57153cc9c2a0b0f9c937389e9ee56dbb90d7 Mon Sep 17 00:00:00 2001 From: idelcano Date: Wed, 26 Nov 2025 13:00:09 +0100 Subject: [PATCH 16/23] read proxy from config file --- DHIS2/file_garbage_remover/main.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py index 09896b83..8a1c87f3 100644 --- 
a/DHIS2/file_garbage_remover/main.py +++ b/DHIS2/file_garbage_remover/main.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 import argparse -import json import sys import cleaner @@ -30,8 +29,10 @@ def build_parser(): def resolve_webhook(config_path, cli_webhook): - if cli_webhook: - return cli_webhook + return cli_webhook or _from_config(config_path, ["webhook-url", "webhook_url"]) + + +def _from_config(config_path, keys): if not config_path: return None try: @@ -39,7 +40,14 @@ def resolve_webhook(config_path, cli_webhook): except Exception as e: print(f"❌ Failed to load config file: {e}", file=sys.stderr) sys.exit(1) - return cfg.get("webhook-url") or cfg.get("webhook_url") + for key in keys: + if key in cfg: + return cfg.get(key) + return None + + +def resolve_proxy(config_path, cli_value, keys): + return cli_value or _from_config(config_path, keys) def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, http_proxy=None, https_proxy=None, notify_test=False): @@ -48,6 +56,8 @@ def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, ht return content = "\n".join(names) webhook = resolve_webhook(config_path, webhook_url) + http_proxy = resolve_proxy(config_path, http_proxy, ["notify-http-proxy", "notify_http_proxy", "http-proxy", "http_proxy"]) + https_proxy = resolve_proxy(config_path, https_proxy, ["notify-https-proxy", "notify_https_proxy", "https-proxy", "https_proxy"]) if notify_test: print(f"[TEST] Would send notification to {webhook or ''}") print(f"[TEST] Title: {title or ''}") From 4ddd6212a2a030841bfd7204db4c890de121c5d6 Mon Sep 17 00:00:00 2001 From: idelcano Date: Thu, 27 Nov 2025 13:40:25 +0100 Subject: [PATCH 17/23] added store files in csv as already notified parameter and truncate messages --- DHIS2/file_garbage_remover/Readme.md | 14 +++++++++- DHIS2/file_garbage_remover/cleaner.py | 3 +++ .../file_garbage_remover_docker.py | 5 ++++ .../file_garbage_remover_tomcat.py | 4 +++ DHIS2/file_garbage_remover/main.py | 26 +++++++++++++------ 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md index 0c2c627d..2b344ba1 100644 --- a/DHIS2/file_garbage_remover/Readme.md +++ b/DHIS2/file_garbage_remover/Readme.md @@ -21,6 +21,16 @@ python3 main.py \ - Add `--force` to delete/move for real (tomcat) or delete in container (docker). - Add `--notify-test` to print the payload instead of sending. - To notify only (no cleanup): `python3 main.py --notify-only --csv-path /tmp/fg.csv --config /path/to/config.json --notify-title "..." [--notify-test]` +- To skip sending and save in CSV as notified: `--save-all-as-notified`. + +### Example (tomcat cleanup, dry-run, save as nnotified) +``` +DB_PASSWORD_FG=your_password python3 main.py \ + --config /path/to/config.json \ + --csv-path /tmp/fg_tomcat.csv \ + --save-all-as-notified +``` +Add `--force` to move/delete files and files in DB. 
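+
+Each processed fileresource is recorded in the CSV given by `--csv-path`. The exact columns come from `DEFAULT_FIELDNAMES` in `csv_utils.py`; a typical row looks roughly like this (values below are illustrative only):
+```
+id,name,created,storagekey,folder,action,files,notified
+4711,report.pdf,2025-01-15T09:30:00,document/ab12cd34ef5,document,would_move_and_delete,document/ab12cd34ef5.pdf,false
+```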
config.json needs the cleanup settings plus the webhook (if you want notifications): ``` @@ -31,7 +41,9 @@ config.json needs the cleanup settings plus the webhook (if you want notificatio "db_user": "", "file_base_path": "", "temp_file_path": "", - "webhook-url": "https://your.webhook.url" + "webhook-url": "https://your.webhook.url", + "notify-http-proxy": "http://openproxy.who.int:8080", + "notify-https-proxy": "http://openproxy.who.int:8080" } ``` diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index fb4a16dd..7975f86a 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -241,6 +241,9 @@ def run_cleanup(args): remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary) if args.csv_path: + if getattr(args, "save_all_as_notified", False): + for item in summary.get("items", []): + item["notified"] = True overwrite_csv = bool(args.force) and not args.maintain_csv items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index 8c4e47b7..b7f73dfb 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -145,6 +145,7 @@ def main(): parser.add_argument("--csv-path", help="CSV file to log orphan entries.") parser.add_argument("--force", action="store_true", help="Actually delete files in container.") parser.add_argument("--maintain-csv", action="store_true", help="Append to CSV in force mode (do not overwrite).") + parser.add_argument("--save-all-as-notified", action="store_true", help="Store CSV entries with notified=true even if not sent.") args = parser.parse_args() container_id = find_container_id(args.instance) @@ -162,6 +163,10 @@ def main(): rows.extend(get_orphan_datavalues(args.instance)) print(f"Found {len(rows)} orphan entries") + if args.save_all_as_notified: + for row in rows: + row["notified"] = True + for row in rows: delete_files(container_id, row, dry_run) if dry_run: diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index ced18f32..a1475370 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -179,6 +179,7 @@ def main(): parser.add_argument("--config", required=True, help="Path to config.json file.") parser.add_argument("--csv-path", help="Optional CSV file to record processed entries. 
In --force mode the file is rewritten unless --maintain-csv.") parser.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") + parser.add_argument("--save-all-as-notified", action="store_true", help="Store CSV entries with notified=true even if not sent.") args = parser.parse_args() config = load_config(args.config) @@ -223,6 +224,9 @@ def main(): remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary) if args.csv_path: + if args.save_all_as_notified: + for item in summary.get("items", []): + item["notified"] = True overwrite_csv = bool(args.force) and not args.maintain_csv write_csv(args.csv_path, summary, overwrite=overwrite_csv) emit_summary(summary) diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py index 8a1c87f3..342bef73 100644 --- a/DHIS2/file_garbage_remover/main.py +++ b/DHIS2/file_garbage_remover/main.py @@ -24,6 +24,8 @@ def build_parser(): parser.add_argument("--notify-http-proxy", help="HTTP proxy (optional).") parser.add_argument("--notify-https-proxy", help="HTTPS proxy (optional).") parser.add_argument("--notify-test", action="store_true", help="Dry-run notification: print payload instead of sending.") + parser.add_argument("--save-all-as-notified", action="store_true", help="Mark CSV entries as notified even if no notification is sent.") + parser.add_argument("--notify-max-lines", type=int, default=100, help="Maximum number of lines to include in notification content (default 100).") return parser @@ -50,11 +52,15 @@ def resolve_proxy(config_path, cli_value, keys): return cli_value or _from_config(config_path, keys) -def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, http_proxy=None, https_proxy=None, notify_test=False): +def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, http_proxy=None, https_proxy=None, notify_test=False, notify_max_lines=50): names, ids, fieldnames, rows = pending_from_csv(csv_path) - if not names: + if not names and not save_all_as_notified: return - content = "\n".join(names) + max_lines = notify_max_lines if notify_max_lines and notify_max_lines > 0 else None + truncated = names[:max_lines] if max_lines else names + content = "\n".join(truncated) + if max_lines and len(names) > max_lines: + content += f"\n... 
(truncated {len(names) - max_lines} more)" webhook = resolve_webhook(config_path, webhook_url) http_proxy = resolve_proxy(config_path, http_proxy, ["notify-http-proxy", "notify_http_proxy", "http-proxy", "http_proxy"]) https_proxy = resolve_proxy(config_path, https_proxy, ["notify-https-proxy", "notify_https_proxy", "https-proxy", "https_proxy"]) @@ -78,10 +84,10 @@ def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, ht except Exception as e: print(f"❌ Failed to send notification: {e}", file=sys.stderr) sys.exit(1) - else: - for name in names: - print(name) - mark_notified(csv_path, rows, fieldnames, ids) + # mark entries as notified + ids_to_mark = ids if not save_all_as_notified else set(str(row.get("id")) for row in rows) + if ids_to_mark: + mark_notified(csv_path, rows, fieldnames, ids_to_mark) def main(): @@ -101,6 +107,7 @@ def main(): http_proxy=args.notify_http_proxy, https_proxy=args.notify_https_proxy, notify_test=args.notify_test, + notify_max_lines=args.notify_max_lines, ) sys.exit(0) @@ -117,7 +124,9 @@ def main(): args.notify_https_proxy, args.notify_test, ]) - if wants_notify: + if args.save_all_as_notified: + return #nothing to report + elif wants_notify: if not args.csv_path: print("❌ --csv-path is required to send notifications after cleanup", file=sys.stderr) sys.exit(1) @@ -129,6 +138,7 @@ def main(): http_proxy=args.notify_http_proxy, https_proxy=args.notify_https_proxy, notify_test=args.notify_test, + notify_max_lines=args.notify_max_lines, ) From 926c0f09af3a00b41a90e80749165d426120759c Mon Sep 17 00:00:00 2001 From: idelcano Date: Thu, 27 Nov 2025 13:53:28 +0100 Subject: [PATCH 18/23] refactor duplicate code --- DHIS2/file_garbage_remover/cleaner.py | 20 +-- .../file_garbage_remover_docker.py | 4 +- .../file_garbage_remover_tomcat.py | 125 +++--------------- 3 files changed, 19 insertions(+), 130 deletions(-) diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index 7975f86a..153d4056 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -9,6 +9,7 @@ import subprocess import psycopg2 +from common import load_config, log from csv_utils import append_items, deduplicate_items from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, @@ -22,22 +23,6 @@ ) -def log(message): - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - if isinstance(message, str): - message = re.sub( - r"(postgresql://[^:]+:)([^@]+)(@)", - r"\1*****\3", - message - ) - print(f"[{timestamp}] {message}") - - -def load_config(path): - with open(path, "r") as f: - return json.load(f) - - def get_events(cursor): cursor.execute(SQL_EVENT_FILE_UIDS) return [row[0] for row in cursor.fetchall() if row[0]] @@ -242,8 +227,7 @@ def run_cleanup(args): if args.csv_path: if getattr(args, "save_all_as_notified", False): - for item in summary.get("items", []): - item["notified"] = True + mark_items_notified(summary.get("items", [])) overwrite_csv = bool(args.force) and not args.maintain_csv items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index b7f73dfb..3217241f 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -8,6 +8,7 @@ from datetime 
import datetime from csv_utils import append_items, deduplicate_items +from common import mark_items_notified from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, SQL_DATA_VALUE_UIDS, @@ -164,8 +165,7 @@ def main(): print(f"Found {len(rows)} orphan entries") if args.save_all_as_notified: - for row in rows: - row["notified"] = True + mark_items_notified(rows) for row in rows: delete_files(container_id, row, dry_run) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index a1475370..b13605ff 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -1,78 +1,24 @@ #!/usr/bin/env python3 import argparse -import csv -import json import os -import re import shutil import sys from datetime import datetime import psycopg2 -SQL_FIND_ORPHANS_DOCUMENTS = """ - SELECT fileresourceid, storagekey, name, created - FROM fileresource fr - WHERE NOT EXISTS ( - SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid - ) - AND fr.uid NOT IN ( - SELECT url FROM document - ) - AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; -""" - -SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ - SELECT fileresourceid, uid, storagekey, name, created - FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; -""" - -SQL_INSERT_AUDIT = """ - INSERT INTO fileresourcesaudit SELECT * FROM fileresource WHERE fileresourceid = %(fid)s; -""" - -SQL_DELETE_ORIGINAL = """ - DELETE FROM fileresource WHERE fileresourceid = %(fid)s; -""" - -SQL_CREATE_TABLE_IF_NOT_EXIST = """ - CREATE TABLE IF NOT EXISTS fileresourcesaudit AS TABLE fileresource WITH NO DATA; -""" - -SQL_EVENT_FILE_UIDS = """ - SELECT eventdatavalues - FROM event - WHERE programstageid - IN (SELECT programstageid FROM programstagedataelement WHERE dataelementid - IN (SELECT dataelementid FROM dataelement WHERE valuetype='FILE_RESOURCE' or valuetype='IMAGE')) and deleted='f'; -""" - -SQL_DATA_VALUE_UIDS = """ - SELECT dv.value - FROM datavalue dv - WHERE dv.value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE') -""" - -SQL_TRACKER_ATTRIBUTE_UIDS = """ -select value from trackedentityattributevalue where value IN (SELECT uid FROM fileresource WHERE domain='DATA_VALUE'); -""" - - -def log(message): - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - if isinstance(message, str): - # Detect pattern like: postgresql://user:password@host - message = re.sub( - r"(postgresql://[^:]+:)([^@]+)(@)", - r"\1*****\3", - message - ) - print(f"[{timestamp}] {message}") - - -def load_config(path): - with open(path, "r") as f: - return json.load(f) +from common import load_config, log, mark_items_notified +from csv_utils import deduplicate_items, append_items +from sql_queries import ( + SQL_CREATE_TABLE_IF_NOT_EXIST, + SQL_DATA_VALUE_UIDS, + SQL_DELETE_ORIGINAL, + SQL_EVENT_FILE_UIDS, + SQL_FIND_DATA_VALUES_FILE_RESOURCES, + SQL_FIND_ORPHANS_DOCUMENTS, + SQL_INSERT_AUDIT, + SQL_TRACKER_ATTRIBUTE_UIDS, +) def get_events(cursor): @@ -109,7 +55,6 @@ def get_matching_files(storage_key, base_dir, folder): storage_key = storage_key.replace(folder + "/", "") base_dir = os.path.join(base_dir, folder) matching = [] - for filename in os.listdir(base_dir): if filename.startswith(storage_key): fullpath = os.path.join(base_dir, filename) @@ -225,10 +170,10 @@ def main(): if args.csv_path: if args.save_all_as_notified: - for item in summary.get("items", []): - 
item["notified"] = True + mark_items_notified(summary.get("items", [])) overwrite_csv = bool(args.force) and not args.maintain_csv - write_csv(args.csv_path, summary, overwrite=overwrite_csv) + items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) + append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) emit_summary(summary) @@ -322,45 +267,5 @@ def emit_summary(summary): log(f" file: {f}") -def write_csv(csv_path, summary, overwrite=False): - if not csv_path: - return - file_exists = os.path.isfile(csv_path) - mode = "w" if overwrite else "a" - want_header = overwrite or not file_exists - fieldnames = ["id", "name", "created", "storagekey", "folder", "action", "files", "notified"] - existing_ids = set() - if not overwrite and file_exists: - try: - with open(csv_path, "r", newline="") as f: - reader = csv.DictReader(f) - for row in reader: - if "id" in row and row["id"]: - existing_ids.add(str(row["id"])) - except Exception as e: - log(f"⚠️ Could not read existing CSV for deduplication: {e}") - try: - with open(csv_path, mode, newline="") as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - if want_header: - writer.writeheader() - for item in summary.get("items", []): - if str(item.get("id")) in existing_ids: - continue - writer.writerow({ - "id": item.get("id"), - "name": item.get("name"), - "created": item.get("created"), - "storagekey": item.get("storagekey"), - "folder": item.get("folder"), - "action": item.get("action"), - "files": "|".join(item.get("files") or []), - "notified": str(item.get("notified", False)).lower(), - }) - log(f"CSV summary {'overwritten' if overwrite else 'appended'} at {csv_path}") - except Exception as e: - log(f"❌ Failed to write CSV {csv_path}: {e}") - - if __name__ == "__main__": main() From 4f5eaccbf515e48c9425d1379e8580e038706bcc Mon Sep 17 00:00:00 2001 From: idelcano Date: Thu, 27 Nov 2025 19:12:15 +0100 Subject: [PATCH 19/23] refactor --- DHIS2/file_garbage_remover/common.py | 79 +++++++++++++++++ .../file_garbage_remover_docker.py | 47 +++------- .../file_garbage_remover_tomcat.py | 87 +++++++------------ DHIS2/file_garbage_remover/main.py | 9 +- 4 files changed, 122 insertions(+), 100 deletions(-) create mode 100644 DHIS2/file_garbage_remover/common.py diff --git a/DHIS2/file_garbage_remover/common.py b/DHIS2/file_garbage_remover/common.py new file mode 100644 index 00000000..8e2d9ae4 --- /dev/null +++ b/DHIS2/file_garbage_remover/common.py @@ -0,0 +1,79 @@ +import json +import os +import re +from datetime import datetime + + +def log(message): + """Simple logger with timestamp and DB URL password masking.""" + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if isinstance(message, str): + message = re.sub( + r"(postgresql://[^:]+:)([^@]+)(@)", + r"\1*****\3", + message + ) + print(f"[{timestamp}] {message}") + + +def load_config(path): + with open(path, "r") as f: + return json.load(f) + + +def mark_items_notified(items): + for item in items: + item["notified"] = True + + +def emit_summary(items, mode, log_fn=print): + log_fn(f"Summary ({mode}): {len(items)} fileresource entries processed.") + for item in items: + files = item.get("files") or [""] + log_fn(f" - id={item.get('id')} name=\"{item.get('name')}\" storagekey={item.get('storagekey')} action={item.get('action')}") + for f in files: + log_fn(f" file: {f}") + + +def _iso(value): + return value.isoformat() if hasattr(value, "isoformat") else value + + +def build_document_items(raw_rows): + 
items = [] + for fid, storagekey, name, created in raw_rows: + items.append({ + "id": fid, + "name": name, + "storagekey": storagekey, + "folder": "document", + "files": [], + "created": _iso(created), + "detection_date": datetime.utcnow().isoformat(), + "action": "would_move_and_delete", + "notified": False, + }) + return items + + +def build_datavalue_items(raw_rows, datavalue_uids, tracker_uids, event_blob): + items = [] + for fid, uid, storagekey, name, created in raw_rows: + if uid in datavalue_uids: + continue + if uid in event_blob: + continue + if uid in tracker_uids: + continue + items.append({ + "id": fid, + "name": name, + "storagekey": storagekey, + "folder": "dataValue", + "files": [], + "created": _iso(created), + "detection_date": datetime.utcnow().isoformat(), + "action": "would_move_and_delete", + "notified": False, + }) + return items diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index 3217241f..b4b541ae 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -8,7 +8,12 @@ from datetime import datetime from csv_utils import append_items, deduplicate_items -from common import mark_items_notified +from common import ( + build_datavalue_items, + build_document_items, + emit_summary, + mark_items_notified, +) from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, SQL_DATA_VALUE_UIDS, @@ -70,20 +75,8 @@ def parse_tab_rows(output, expected_cols): def get_orphan_documents(instance): out = run_sql(instance, SQL_FIND_ORPHANS_DOCUMENTS) - rows = [] - for fid, storagekey, name, created in parse_tab_rows(out, 4): - rows.append({ - "id": fid, - "name": name, - "storagekey": storagekey, - "folder": "document", - "files": [], - "created": created, - "detection_date": datetime.utcnow().isoformat(), - "action": "would_move_and_delete", - "notified": False, - }) - return rows + raw_rows = parse_tab_rows(out, 4) + return build_document_items(raw_rows) def get_orphan_datavalues(instance): @@ -91,27 +84,7 @@ def get_orphan_datavalues(instance): datavalue_uids = set(r[0] for r in parse_tab_rows(run_sql(instance, SQL_DATA_VALUE_UIDS), 1)) tracker_uids = set(r[0] for r in parse_tab_rows(run_sql(instance, SQL_TRACKER_ATTRIBUTE_UIDS), 1)) event_blob = "\n".join(r[0] for r in parse_tab_rows(run_sql(instance, SQL_EVENT_FILE_UIDS), 1)) - - orphans = [] - for fid, uid, storagekey, name, created in data_rows: - if uid in datavalue_uids: - continue - if uid in event_blob: - continue - if uid in tracker_uids: - continue - orphans.append({ - "id": fid, - "name": name, - "storagekey": storagekey, - "folder": "dataValue", - "files": [], - "created": created, - "detection_date": datetime.utcnow().isoformat(), - "action": "would_move_and_delete", - "notified": False, - }) - return orphans + return build_datavalue_items(data_rows, datavalue_uids, tracker_uids, event_blob) def delete_files(container_id, row, dry_run): @@ -185,7 +158,7 @@ def main(): items = deduplicate_items(args.csv_path, rows, unique_keys=("id", "name"), overwrite=overwrite_csv) append_items(args.csv_path, items, overwrite=overwrite_csv) - print(f"Summary: processed {len(rows)} orphan entries") + emit_summary(rows, "DRY-RUN" if dry_run else "FORCE", log_fn=print) if __name__ == "__main__": diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index b13605ff..9a0f6906 100644 --- 
a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -7,7 +7,14 @@ import psycopg2 -from common import load_config, log, mark_items_notified +from common import ( + build_datavalue_items, + build_document_items, + emit_summary, + load_config, + log, + mark_items_notified, +) from csv_utils import deduplicate_items, append_items from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, @@ -31,15 +38,6 @@ def get_event_blob(cursor): return "\n".join(json.dumps(e) for e in event_texts) -def get_data_value_file_resources(cursor): - cursor.execute(""" - SELECT fileresourceid, uid, storagekey - FROM fileresource - WHERE domain = 'DATA_VALUE' - """) - return cursor.fetchall() - - def get_all_datavalue_uids(cursor): cursor.execute(SQL_DATA_VALUE_UIDS) return set(row[0] for row in cursor.fetchall() if row[0]) @@ -174,59 +172,50 @@ def main(): overwrite_csv = bool(args.force) and not args.maintain_csv items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) - emit_summary(summary) + emit_summary(summary.get("items", []), summary.get("mode", "TEST"), log_fn=log) def remove_documents(file_base_path, temp_file_path, dry_run, cur, conn, summary): log("Querying orphaned fileresource entries...") cur.execute(SQL_FIND_ORPHANS_DOCUMENTS) - rows = cur.fetchall() + raw_rows = cur.fetchall() + items = build_document_items(raw_rows) - if not rows: + if not items: log("✅ No orphaned fileresource entries found.") return - log(f"Found {len(rows)} \"document\" orphaned entries.") - process_orphan_files(rows, file_base_path, temp_file_path, dry_run, cur, conn, "document", summary) + log(f"Found {len(items)} \"document\" orphaned entries.") + process_orphan_items(items, file_base_path, temp_file_path, dry_run, cur, conn, summary) def remove_datavalues(file_base_path, temp_file_path, dry_run, cur, conn, summary): log("Querying orphaned fileresource entries...") cur.execute(SQL_FIND_DATA_VALUES_FILE_RESOURCES) - rows = cur.fetchall() - + raw_rows = cur.fetchall() # get all file resource datavalues datavalue_uids = get_all_datavalue_uids(cur) # get all events with file dataelement values event_blob = get_event_blob(cur) tracker_uids = get_all_tracker_uids(cur) - # 3. 
Filter out referenced file resources - orphan_data_values = [] - for fileresourceid, uid, storagekey, name, created in rows: - if uid in datavalue_uids: - continue # Referenced in datavalue - if uid in event_blob: - continue # Referenced in event - if uid in tracker_uids: - continue # referenced in a trackerentityattribute - orphan_data_values.append((fileresourceid, storagekey, name, created)) - - if not orphan_data_values: + items = build_datavalue_items(raw_rows, datavalue_uids, tracker_uids, event_blob) + + if not items: log("✅ No fileResources entries found.") return - log(f"Found {len(orphan_data_values)} \"data_value\" orphaned entries.") - process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, "dataValue", summary) + log(f"Found {len(items)} \"data_value\" orphaned entries.") + process_orphan_items(items, file_base_path, temp_file_path, dry_run, cur, conn, summary) -def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry_run, cur, conn, folder, summary): +def process_orphan_items(orphan_items, file_base_path, temp_file_path, dry_run, cur, conn, summary): count = 0 - for entry in orphan_data_values: - # entries may come as (id, storagekey, name) - fileresourceid, storagekey = entry[0], entry[1] - name = entry[2] if len(entry) > 2 else "" - created = entry[3] if len(entry) > 3 else None + for item in orphan_items: + fileresourceid = item.get("id") + storagekey = item.get("storagekey", "") + name = item.get("name", "") + folder = item.get("folder", "") if not fileresourceid: log(f"⚠️ Skipping row with empty/null fileresourceid: {fileresourceid}") continue @@ -239,16 +228,9 @@ def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry # Always update the db also if the files not existed on disk update_db(fileresourceid, cur, conn, dry_run) rel_matches = [os.path.join(folder, os.path.relpath(m, os.path.join(file_base_path, folder))) for m in matches] - summary["items"].append({ - "id": fileresourceid, - "name": name, - "storagekey": storagekey, - "folder": folder, - "files": rel_matches, - "created": created.isoformat() if hasattr(created, "isoformat") else created, - "action": "would_move_and_delete" if dry_run else "moved_and_deleted", - "notified": False - }) + item["files"] = rel_matches + item["action"] = "would_move_and_delete" if dry_run else "moved_and_deleted" + summary["items"].append(item) except Exception as e: log(f"❌ Aborting due to error with fileresourceid {fileresourceid}: {e}") sys.exit(1) @@ -256,16 +238,5 @@ def process_orphan_files(orphan_data_values, file_base_path, temp_file_path, dry log(f"{count} file(s) {'would be moved' if dry_run else 'were successfully moved and deleted'}") -def emit_summary(summary): - mode = summary.get("mode", "TEST") - items = summary.get("items", []) - log(f"Summary ({mode}): {len(items)} fileresource entries processed.") - for item in items: - files = item.get("files") or [""] - log(f" - id={item.get('id')} name=\"{item.get('name')}\" storagekey={item.get('storagekey')} action={item.get('action')}") - for f in files: - log(f" file: {f}") - - if __name__ == "__main__": main() diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py index 342bef73..e02a92ef 100644 --- a/DHIS2/file_garbage_remover/main.py +++ b/DHIS2/file_garbage_remover/main.py @@ -52,9 +52,9 @@ def resolve_proxy(config_path, cli_value, keys): return cli_value or _from_config(config_path, keys) -def run_notify_flow(csv_path, config_path=None, webhook_url=None, 
title=None, http_proxy=None, https_proxy=None, notify_test=False, notify_max_lines=50): +def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, http_proxy=None, https_proxy=None, notify_test=False, notify_max_lines=100): names, ids, fieldnames, rows = pending_from_csv(csv_path) - if not names and not save_all_as_notified: + if not names: return max_lines = notify_max_lines if notify_max_lines and notify_max_lines > 0 else None truncated = names[:max_lines] if max_lines else names @@ -85,9 +85,8 @@ def run_notify_flow(csv_path, config_path=None, webhook_url=None, title=None, ht print(f"❌ Failed to send notification: {e}", file=sys.stderr) sys.exit(1) # mark entries as notified - ids_to_mark = ids if not save_all_as_notified else set(str(row.get("id")) for row in rows) - if ids_to_mark: - mark_notified(csv_path, rows, fieldnames, ids_to_mark) + if ids: + mark_notified(csv_path, rows, fieldnames, ids) def main(): From a7721fdf4636b41c2a6a0f39b745d1ac491462a5 Mon Sep 17 00:00:00 2001 From: idelcano Date: Fri, 28 Nov 2025 10:40:11 +0100 Subject: [PATCH 20/23] simplify parameters and workflod, improve unique row detection(id+uid), and add save-all-as-notified --- DHIS2/file_garbage_remover/cleaner.py | 13 ++++---- DHIS2/file_garbage_remover/common.py | 4 ++- DHIS2/file_garbage_remover/csv_utils.py | 31 +++++++++++++++++-- .../file_garbage_remover_docker.py | 13 ++++---- .../file_garbage_remover_tomcat.py | 10 +++--- DHIS2/file_garbage_remover/main.py | 1 - DHIS2/file_garbage_remover/sql_queries.py | 2 +- 7 files changed, 52 insertions(+), 22 deletions(-) diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index 153d4056..c5a3074c 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -10,7 +10,7 @@ import psycopg2 from common import load_config, log -from csv_utils import append_items, deduplicate_items +from csv_utils import append_items, deduplicate_items, merge_notified_flags from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, SQL_DATA_VALUE_UIDS, @@ -228,9 +228,10 @@ def run_cleanup(args): if args.csv_path: if getattr(args, "save_all_as_notified", False): mark_items_notified(summary.get("items", [])) - overwrite_csv = bool(args.force) and not args.maintain_csv - items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) - append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) + else: + merge_notified_flags(args.csv_path, summary.get("items", []), unique_keys=("id", "uid"), log=log) + items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "uid"), overwrite=False, log=log) + append_items(args.csv_path, items, overwrite=False, log=log) emit_summary(summary) @@ -245,8 +246,8 @@ def run_docker_cleanup(args): cmd = [sys.executable, script_path, "--instance", args.docker_instance] if args.csv_path: cmd.extend(["--csv-path", args.csv_path]) - if args.maintain_csv: - cmd.append("--maintain-csv") + if getattr(args, "save_all_as_notified", False): + cmd.append("--save-all-as-notified") if args.force: cmd.append("--force") log(f"Running docker cleanup via: {' '.join(cmd)}") diff --git a/DHIS2/file_garbage_remover/common.py b/DHIS2/file_garbage_remover/common.py index 8e2d9ae4..8483d941 100644 --- a/DHIS2/file_garbage_remover/common.py +++ b/DHIS2/file_garbage_remover/common.py @@ -41,9 +41,10 @@ def _iso(value): def build_document_items(raw_rows): items = [] - for fid, storagekey, 
name, created in raw_rows: + for fid, uid, storagekey, name, created in raw_rows: items.append({ "id": fid, + "uid": uid, "name": name, "storagekey": storagekey, "folder": "document", @@ -67,6 +68,7 @@ def build_datavalue_items(raw_rows, datavalue_uids, tracker_uids, event_blob): continue items.append({ "id": fid, + "uid": uid, "name": name, "storagekey": storagekey, "folder": "dataValue", diff --git a/DHIS2/file_garbage_remover/csv_utils.py b/DHIS2/file_garbage_remover/csv_utils.py index 12b1cd8b..a365c16e 100644 --- a/DHIS2/file_garbage_remover/csv_utils.py +++ b/DHIS2/file_garbage_remover/csv_utils.py @@ -1,12 +1,13 @@ import csv import os -DEFAULT_FIELDNAMES = ["id", "name", "created", "detection_date", "storagekey", "folder", "action", "files", "notified"] +DEFAULT_FIELDNAMES = ["id", "uid", "name", "created", "detection_date", "storagekey", "folder", "action", "files", "notified"] def serialize_item(item): return { "id": item.get("id"), + "uid": item.get("uid"), "name": item.get("name"), "created": item.get("created"), "detection_date": item.get("detection_date"), @@ -57,7 +58,7 @@ def write_rows(csv_path, rows, fieldnames=None): writer.writerows(rows) -def deduplicate_items(csv_path, new_items, unique_keys=("id",), overwrite=False, log=None): +def deduplicate_items(csv_path, new_items, unique_keys=("id", "uid"), overwrite=False, log=None): """ Returns new_items filtered to avoid duplicates with existing CSV based on unique_keys. If overwrite=True, no filtering is applied (CSV will be rewritten). @@ -83,3 +84,29 @@ def deduplicate_items(csv_path, new_items, unique_keys=("id",), overwrite=False, existing_keys.add(key) filtered.append(item) return filtered + + +def merge_notified_flags(csv_path, items, unique_keys=("id", "uid"), log=None): + """ + For each item, if an existing CSV has the same unique key with notified=true, + copy that notified flag into the item (unless already true). + """ + if not os.path.isfile(csv_path): + return items + try: + rows, _ = read_rows(csv_path) + except Exception as e: + if log: + log(f"⚠️ Could not read existing CSV for notified merge: {e}") + return items + + notified_map = {} + for row in rows: + key = tuple((row.get(k) or "").strip() for k in unique_keys) + notified_map[key] = str(row.get("notified", "")).lower() == "true" + + for item in items: + key = tuple((str(item.get(k)) or "").strip() for k in unique_keys) + if notified_map.get(key): + item["notified"] = True + return items diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py index b4b541ae..2d506e54 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_docker.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_docker.py @@ -7,7 +7,7 @@ import tempfile from datetime import datetime -from csv_utils import append_items, deduplicate_items +from csv_utils import append_items, deduplicate_items, merge_notified_flags from common import ( build_datavalue_items, build_document_items, @@ -75,7 +75,7 @@ def parse_tab_rows(output, expected_cols): def get_orphan_documents(instance): out = run_sql(instance, SQL_FIND_ORPHANS_DOCUMENTS) - raw_rows = parse_tab_rows(out, 4) + raw_rows = parse_tab_rows(out, 5) return build_document_items(raw_rows) @@ -118,7 +118,6 @@ def main(): parser.add_argument("--instance", required=True, help="d2-docker instance name (e.g. 
docker.eyeseetea.com/project/dhis2-data:2.41-test)") parser.add_argument("--csv-path", help="CSV file to log orphan entries.") parser.add_argument("--force", action="store_true", help="Actually delete files in container.") - parser.add_argument("--maintain-csv", action="store_true", help="Append to CSV in force mode (do not overwrite).") parser.add_argument("--save-all-as-notified", action="store_true", help="Store CSV entries with notified=true even if not sent.") args = parser.parse_args() @@ -139,6 +138,9 @@ def main(): if args.save_all_as_notified: mark_items_notified(rows) + else: + if args.csv_path: + merge_notified_flags(args.csv_path, rows, unique_keys=("id", "uid")) for row in rows: delete_files(container_id, row, dry_run) @@ -154,9 +156,8 @@ def main(): sys.exit(1) if args.csv_path: - overwrite_csv = bool(args.force) and not args.maintain_csv - items = deduplicate_items(args.csv_path, rows, unique_keys=("id", "name"), overwrite=overwrite_csv) - append_items(args.csv_path, items, overwrite=overwrite_csv) + items = deduplicate_items(args.csv_path, rows, unique_keys=("id", "uid"), overwrite=False) + append_items(args.csv_path, items, overwrite=False) emit_summary(rows, "DRY-RUN" if dry_run else "FORCE", log_fn=print) diff --git a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py index 9a0f6906..43e1d1d7 100644 --- a/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py +++ b/DHIS2/file_garbage_remover/file_garbage_remover_tomcat.py @@ -120,8 +120,7 @@ def main(): parser.add_argument("--force", action="store_true", help="Apply changes: move files and modify DB.") parser.add_argument("--test", action="store_true", help="Run in dry-run mode (no changes). Default if --force not provided.") parser.add_argument("--config", required=True, help="Path to config.json file.") - parser.add_argument("--csv-path", help="Optional CSV file to record processed entries. 
In --force mode the file is rewritten unless --maintain-csv.") - parser.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") + parser.add_argument("--csv-path", help="Optional CSV file to record processed entries.") parser.add_argument("--save-all-as-notified", action="store_true", help="Store CSV entries with notified=true even if not sent.") args = parser.parse_args() @@ -169,9 +168,10 @@ def main(): if args.csv_path: if args.save_all_as_notified: mark_items_notified(summary.get("items", [])) - overwrite_csv = bool(args.force) and not args.maintain_csv - items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "name"), overwrite=overwrite_csv, log=log) - append_items(args.csv_path, items, overwrite=overwrite_csv, log=log) + else: + merge_notified_flags(args.csv_path, summary.get("items", []), unique_keys=("id", "uid"), log=log) + items = deduplicate_items(args.csv_path, summary.get("items", []), unique_keys=("id", "uid"), overwrite=False, log=log) + append_items(args.csv_path, items, overwrite=False, log=log) emit_summary(summary.get("items", []), summary.get("mode", "TEST"), log_fn=log) diff --git a/DHIS2/file_garbage_remover/main.py b/DHIS2/file_garbage_remover/main.py index e02a92ef..64d6a35e 100644 --- a/DHIS2/file_garbage_remover/main.py +++ b/DHIS2/file_garbage_remover/main.py @@ -13,7 +13,6 @@ def build_parser(): parser.add_argument("--test", action="store_true", help="Run in dry-run mode (default unless --force).") parser.add_argument("--config", help="Path to config.json file. Required unless --notify-only.") parser.add_argument("--csv-path", help="CSV file to record processed entries or to read notifications from.") - parser.add_argument("--maintain-csv", action="store_true", help="Keep CSV contents even in --force mode (append only).") parser.add_argument("--mode", choices=["tomcat", "docker"], default="tomcat", help="Cleanup mode: tomcat (default) or docker.") parser.add_argument("--docker-instance", help="d2-docker instance name (required for --mode=docker).") diff --git a/DHIS2/file_garbage_remover/sql_queries.py b/DHIS2/file_garbage_remover/sql_queries.py index bb6e82e5..c64523e8 100644 --- a/DHIS2/file_garbage_remover/sql_queries.py +++ b/DHIS2/file_garbage_remover/sql_queries.py @@ -1,5 +1,5 @@ SQL_FIND_ORPHANS_DOCUMENTS = """ - SELECT fileresourceid, storagekey, name, created + SELECT fileresourceid, uid, storagekey, name, created FROM fileresource fr WHERE NOT EXISTS ( SELECT 1 FROM document d WHERE d.fileresource = fr.fileresourceid From f241b0f183a1f65764970eebdaf644bbe1a19787 Mon Sep 17 00:00:00 2001 From: idelcano Date: Fri, 28 Nov 2025 13:10:56 +0100 Subject: [PATCH 21/23] import missing dependency --- DHIS2/file_garbage_remover/cleaner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DHIS2/file_garbage_remover/cleaner.py b/DHIS2/file_garbage_remover/cleaner.py index c5a3074c..47b03d42 100644 --- a/DHIS2/file_garbage_remover/cleaner.py +++ b/DHIS2/file_garbage_remover/cleaner.py @@ -9,7 +9,7 @@ import subprocess import psycopg2 -from common import load_config, log +from common import load_config, log, mark_items_notified from csv_utils import append_items, deduplicate_items, merge_notified_flags from sql_queries import ( SQL_CREATE_TABLE_IF_NOT_EXIST, From d2b8cdc9106124d60b63911f793fec538eddb126 Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 22 Dec 2025 18:27:57 +0100 Subject: [PATCH 22/23] avoid identify as orphan files created/updated in the last 24 
--- DHIS2/file_garbage_remover/sql_queries.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/DHIS2/file_garbage_remover/sql_queries.py b/DHIS2/file_garbage_remover/sql_queries.py index c64523e8..5eadcba1 100644 --- a/DHIS2/file_garbage_remover/sql_queries.py +++ b/DHIS2/file_garbage_remover/sql_queries.py @@ -7,12 +7,15 @@ AND fr.uid NOT IN ( SELECT url FROM document ) - AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%'; + AND fr.domain = 'DOCUMENT' and fr.storagekey like '%document%' + AND COALESCE(fr.lastupdated, fr.created, NOW()) < (NOW() - INTERVAL '24 hours'); """ SQL_FIND_DATA_VALUES_FILE_RESOURCES = """ SELECT fileresourceid, uid, storagekey, name, created - FROM fileresource fr where fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%'; + FROM fileresource fr + WHERE fr.domain = 'DATA_VALUE' and fr.storagekey like '%dataValue%' + AND COALESCE(fr.lastupdated, fr.created, NOW()) < (NOW() - INTERVAL '24 hours'); """ SQL_INSERT_AUDIT = """ From 83f0bf0eac9e6df03173546a9930ae770494e10d Mon Sep 17 00:00:00 2001 From: idelcano Date: Mon, 22 Dec 2025 18:44:33 +0100 Subject: [PATCH 23/23] recovery instructions --- DHIS2/file_garbage_remover/Readme.md | 47 ++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/DHIS2/file_garbage_remover/Readme.md b/DHIS2/file_garbage_remover/Readme.md index 2b344ba1..e1394aa2 100644 --- a/DHIS2/file_garbage_remover/Readme.md +++ b/DHIS2/file_garbage_remover/Readme.md @@ -128,3 +128,50 @@ Deletes identified files directly inside the container. Production Environments: Always use --test mode before using --force to verify intended changes. Docker/Test Environments: Files deleted by file_garbage_remover_docker.py cannot be recovered. + +## Restoring fileresource entries (tomcat) + +Manual steps if a resource was moved by mistake: +1) Restore the row into `fileresource` from `fileresourcesaudit` using a **new** `fileresourceid` to avoid collisions. +Column list (stable): `uid, code, created, lastupdated, name, contenttype, contentlength, contentmd5, storagekey, isassigned, domain, userid, lastupdatedby, hasmultiplestoragefiles, fileresourceowner` +``` +INSERT INTO fileresource ( + fileresourceid, uid, code, created, lastupdated, name, contenttype, + contentlength, contentmd5, storagekey, isassigned, domain, userid, + lastupdatedby, hasmultiplestoragefiles, fileresourceowner +) +SELECT + nextval('fileresource_fileresourceid_seq'), uid, code, created, lastupdated, + name, contenttype, contentlength, contentmd5, storagekey, isassigned, + domain, userid, lastupdatedby, hasmultiplestoragefiles, fileresourceowner +FROM fileresourcesaudit +WHERE fileresourceid = ; +``` +Run inside a transaction and verify before commit. + +2) Move the file back from `temp_file_path` to `file_base_path`, keeping the folder (`document` or `dataValue`). 
Use a wildcard to catch image variants (e.g., `..._max`, `..._min`):
+```
+mv "<temp_file_path>/<folder>/<storagekey>"* "<file_base_path>/<folder>/"
+```
+
+Example to restore entries moved in the last 24 hours:
+```
+-- 1) Reinsert rows with new IDs
+INSERT INTO fileresource (
+    fileresourceid, uid, code, created, lastupdated, name, contenttype,
+    contentlength, contentmd5, storagekey, isassigned, domain, userid,
+    lastupdatedby, hasmultiplestoragefiles, fileresourceowner
+)
+SELECT
+    nextval('fileresource_fileresourceid_seq'), uid, code, created, lastupdated,
+    name, contenttype, contentlength, contentmd5, storagekey, isassigned,
+    domain, userid, lastupdatedby, hasmultiplestoragefiles, fileresourceowner
+FROM fileresourcesaudit
+WHERE COALESCE(lastupdated, created, NOW()) >= (NOW() - INTERVAL '24 hours');
+
+-- 2) Move files back (adjust <temp_file_path> and <file_base_path> to the configured paths)
+find "<temp_file_path>" -type f -mtime -1 -print0 | while IFS= read -r -d '' f; do
+  rel="${f#<temp_file_path>/}"
+  mv "$f" "<file_base_path>/$rel"
+done
+```
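The Readme above says to run the reinsert inside a transaction and verify before commit, but does not show that wrapper. A minimal sketch of the pattern follows, assuming a hypothetical `<fileresourceid_to_restore>` placeholder for the audited row (substitute the real id before running):

```
-- Sketch only: reinsert one audited row inside a transaction and verify it
-- before committing. <fileresourceid_to_restore> is a placeholder, not literal SQL.
BEGIN;

INSERT INTO fileresource (
    fileresourceid, uid, code, created, lastupdated, name, contenttype,
    contentlength, contentmd5, storagekey, isassigned, domain, userid,
    lastupdatedby, hasmultiplestoragefiles, fileresourceowner
)
SELECT
    nextval('fileresource_fileresourceid_seq'), uid, code, created, lastupdated,
    name, contenttype, contentlength, contentmd5, storagekey, isassigned,
    domain, userid, lastupdatedby, hasmultiplestoragefiles, fileresourceowner
FROM fileresourcesaudit
WHERE fileresourceid = <fileresourceid_to_restore>;

-- Verify: the restored row gets a new fileresourceid but keeps its uid and storagekey
SELECT fileresourceid, uid, name, storagekey, domain
FROM fileresource
WHERE uid = (SELECT uid FROM fileresourcesaudit WHERE fileresourceid = <fileresourceid_to_restore>);

COMMIT;  -- or ROLLBACK; if the verification does not return the expected row
```

Drawing the new id from `nextval('fileresource_fileresourceid_seq')` rather than reusing the audited id avoids a primary-key collision if that id has since been reassigned.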