Redback-Operations · livnugaraa · Sep 5, 2025
diff --git a/.DS_Store b/.DS_Store
diff --git a/image:PDF_scanner/file_handler.py b/image:PDF_scanner/file_handler.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+import os
+from pathlib import Path
+from typing import Iterable, List
+
+# Lower-case file suffixes (including the leading dot) grouped by media kind.
+# find_files() lower-cases a path's suffix before testing it against the union
+# of these sets when no explicit patterns are given.
+TEXT_EXTS = {".txt", ".md"}
+IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp"}
+PDF_EXTS = {".pdf"}
+
+def find_files(root: str | os.PathLike, patterns: Iterable[str] | None = None) -> List[Path]:
+    """Recursively collect the regular files found under *root*.
+
+    Args:
+        root: Directory to walk; every level below it is searched.
+        patterns: Optional glob patterns tested with ``Path.match``; a file is
+            kept when it matches at least one of them. Any iterable is
+            accepted, including one-shot generators, and a bare string is
+            treated as a single pattern. When omitted or empty, files are
+            selected by suffix instead (TEXT_EXTS, IMAGE_EXTS and PDF_EXTS,
+            compared case-insensitively).
+
+    Returns:
+        The matching paths, in the order ``Path.rglob`` yields them.
+    """
+    root = Path(root)
+    if isinstance(patterns, str):
+        # A lone string would otherwise be iterated character by character.
+        patterns = (patterns,)
+    # Materialise once: the patterns are reused for every file, so a generator
+    # would be exhausted after the first file, and its truthiness says nothing
+    # about whether it actually holds a pattern.
+    pats = tuple(patterns) if patterns is not None else ()
+    # Build the suffix set a single time; it is only needed without patterns.
+    allowed_exts = None if pats else TEXT_EXTS | IMAGE_EXTS | PDF_EXTS
+    results: List[Path] = []
+    for p in root.rglob("*"):
+        if not p.is_file():
+            continue
+        if pats:
+            if any(p.match(pat) for pat in pats):
+                results.append(p)
+        elif p.suffix.lower() in allowed_exts:
+            results.append(p)
+    return results
+
+def read_file(path: str | os.PathLike, binary: bool = False) -> bytes | str:
+    """Return the whole content of *path*.
+
+    Args:
+        path: File to read.
+        binary: When true the raw bytes are returned; otherwise the file is
+            decoded as UTF-8 and undecodable bytes are replaced rather than
+            raising.
+
+    Returns:
+        ``bytes`` in binary mode, ``str`` otherwise.
+
+    Raises:
+        IOError: If the file cannot be opened or read. The original exception
+            is attached as ``__cause__``.
+    """
+    try:
+        if binary:
+            with open(path, "rb") as f:
+                return f.read()
+        with open(path, "r", encoding="utf-8", errors="replace") as f:
+            return f.read()
+    except Exception as e:
+        # Chain explicitly so the underlying error stays visible to callers.
+        raise IOError(f"Failed to read {path}: {e}") from e
+
+# Keep prompting until the user supplies a usable directory (or nothing at all)
+def get_valid_path():
+    """Ask on stdin for a directory and return it once it is valid.
+
+    Surrounding whitespace and single or double quotes are stripped from the
+    answer. An empty answer selects the current working directory. Any other
+    answer must name an existing directory, otherwise the prompt is repeated.
+
+    Returns:
+        The chosen directory path as a string.
+    """
+    separator = "-" * 63
+    while True:
+        answer = input("Enter the directory path to scan and save the files (press Enter to save in the project folder): ")
+        candidate = answer.strip().strip('"').strip("'")
+        if candidate == "":
+            # Nothing typed: fall back to the directory the program runs in.
+            print("No path provided. Files will be saved in the project folder.")
+            print(separator)
+            return os.getcwd()
+        if not os.path.isdir(candidate):
+            print("We cannot find that path. Please enter a valid directory or press Enter to use the project folder.")
+            continue
+        print(separator)
+        return candidate
diff --git a/image:PDF_scanner/ocr_engine.py b/image:PDF_scanner/ocr_engine.py
@@ -0,0 +1,92 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+from pathlib import Path
+import re
+
+import numpy as np
+from PIL import Image
+import pytesseract
+import cv2
+
+# pdf2image is treated as optional: remember whether the import worked so that
+# pdf_to_images() can raise a RuntimeError instead of this module failing to
+# import. Any exception raised by the import counts as "not available".
+try:
+    from pdf2image import convert_from_path
+    PDF2IMAGE_AVAILABLE = True
+except Exception:
+    PDF2IMAGE_AVAILABLE = False
+
+@dataclass
+class OCRConfig:
+    """Settings shared by the preprocessing and OCR helpers in this module."""
+
+    # Target resolution: images reporting a lower DPI are upscaled to it, and
+    # ocr_pdf() rasterises PDF pages at this DPI.
+    dpi: int = 300
+    # Whether preprocess_image() runs the deskew step.
+    deskew: bool = True
+    # Whether preprocess_image() runs the adaptive-threshold step.
+    binarize: bool = True
+    # Value passed to Tesseract as ``--oem``.
+    oem: int = 3
+    # Value passed to Tesseract as ``--psm``.
+    psm: int = 3
+    # Language code handed to pytesseract as ``lang``.
+    lang: str = "eng"
+
+def _to_cv(img: Image.Image) -> np.ndarray:
+    """Convert a PIL image into a 3-channel BGR array for OpenCV.
+
+    Args:
+        img: Source image in any PIL mode.
+
+    Returns:
+        The pixel data with channels in BGR order.
+    """
+    # Grayscale and palette images turn into 2-D arrays, which the RGB->BGR
+    # conversion rejects; normalise everything to RGB first so the rest of
+    # the pipeline can rely on three channels.
+    if img.mode != "RGB":
+        img = img.convert("RGB")
+    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+
+def _to_pil(arr: np.ndarray) -> Image.Image:
+    """Turn a BGR array back into a PIL image with RGB channel order."""
+    rgb = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
+    return Image.fromarray(rgb)
+
+def _normalize_dpi(img: Image.Image, target_dpi: int) -> Image.Image:
+    """Upscale *img* when its recorded resolution is below *target_dpi*.
+
+    The horizontal value of ``img.info["dpi"]`` is used. An image without
+    that entry, or with an unusable one (zero, negative, NaN, malformed), is
+    treated as having unknown resolution and is returned untouched.
+
+    Args:
+        img: Image to inspect.
+        target_dpi: Minimum resolution wanted.
+
+    Returns:
+        The original image, or a LANCZOS-resampled copy whose ``info["dpi"]``
+        is set to ``(target_dpi, target_dpi)``.
+    """
+    try:
+        dpi = float(img.info.get("dpi", (target_dpi, target_dpi))[0])
+    except (TypeError, ValueError, IndexError, ZeroDivisionError):
+        return img
+    # ``not dpi > 0`` also rejects NaN; a zero DPI would divide by zero below.
+    if not dpi > 0 or dpi >= target_dpi:
+        return img
+    scale = target_dpi / dpi
+    new_size = (int(img.width * scale), int(img.height * scale))
+    img = img.resize(new_size, Image.LANCZOS)
+    img.info["dpi"] = (target_dpi, target_dpi)
+    return img
+
+def _deskew(cv_img: np.ndarray) -> np.ndarray:
+    """Rotate *cv_img* about its centre to undo the estimated skew.
+
+    The skew is taken from the angle of the minimum-area rectangle around
+    all foreground pixels of an inverted, Otsu-thresholded copy.
+
+    Args:
+        cv_img: BGR image.
+
+    Returns:
+        The rotated image (same size, borders replicated), or the input
+        itself when there is no foreground or no rotation is needed.
+    """
+    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
+    # Invert so dark pixels (presumably the text) become non-zero foreground.
+    gray = cv2.bitwise_not(gray)
+    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
+    # np.where yields (row, col) pairs; they are passed on in that order.
+    coords = np.column_stack(np.where(thresh > 0))
+    if coords.size == 0:
+        return cv_img
+    angle = cv2.minAreaRect(coords)[-1]
+    # minAreaRect reports the angle in [-90, 0) on older OpenCV releases and
+    # in (0, 90] from 4.5.1 on (the same rectangle, shifted by 90 degrees).
+    # Fold both ranges onto the same small correction; without the middle
+    # branch an upright page (angle 90) would be turned on its side.
+    if angle < -45:
+        angle = -(90 + angle)
+    elif angle >= 45:
+        angle = 90 - angle
+    else:
+        angle = -angle
+    if angle == 0:
+        # Nothing to correct; skip the resampling pass.
+        return cv_img
+    (h, w) = cv_img.shape[:2]
+    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
+    rotated = cv2.warpAffine(cv_img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
+    return rotated
+
+def _binarize(cv_img: np.ndarray) -> np.ndarray:
+    """Return a black-and-white version of *cv_img*, still as a BGR array.
+
+    The image is reduced to grayscale, thresholded with a Gaussian adaptive
+    threshold (35-pixel neighbourhood, constant 11) and expanded back to
+    three channels.
+    """
+    block_size, offset = 35, 11
+    grayscale = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
+    mask = cv2.adaptiveThreshold(
+        grayscale,
+        255,
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+        cv2.THRESH_BINARY,
+        block_size,
+        offset,
+    )
+    return cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
+
+def preprocess_image(img: Image.Image, cfg: OCRConfig) -> Image.Image:
+    """Prepare *img* for OCR according to *cfg*.
+
+    The image is first brought up to ``cfg.dpi``, then optionally deskewed
+    and optionally binarized (in that order), and returned as a PIL image.
+    """
+    frame = _to_cv(_normalize_dpi(img, cfg.dpi))
+    # Each optional step is paired with the flag that switches it on.
+    pipeline = ((cfg.deskew, _deskew), (cfg.binarize, _binarize))
+    for enabled, step in pipeline:
+        if enabled:
+            frame = step(frame)
+    return _to_pil(frame)
+
+def _tesseract_args(cfg: OCRConfig) -> str:
+    """Build the Tesseract option string carrying the ``--oem`` and ``--psm`` values."""
+    template = "--oem {} --psm {}"
+    return template.format(cfg.oem, cfg.psm)
+
+def ocr_image(img: Image.Image, cfg: Optional[OCRConfig] = None) -> str:
+    """Run OCR on a single image and return the recognised text, stripped.
+
+    Args:
+        img: Image to read.
+        cfg: OCR settings; a default ``OCRConfig`` is used when omitted.
+    """
+    if not cfg:
+        cfg = OCRConfig()
+    prepared = preprocess_image(img, cfg)
+    raw_text = pytesseract.image_to_string(
+        prepared,
+        lang=cfg.lang,
+        config=_tesseract_args(cfg),
+    )
+    return raw_text.strip()
+
+def pdf_to_images(pdf_path: str | Path, dpi: int = 300) -> List[Image.Image]:
+    """Rasterise every page of *pdf_path* at *dpi* via pdf2image.
+
+    Raises:
+        RuntimeError: If pdf2image could not be imported.
+    """
+    if PDF2IMAGE_AVAILABLE:
+        return convert_from_path(str(pdf_path), dpi=dpi)
+    raise RuntimeError("pdf2image not available or poppler missing.")
+
+def ocr_pdf(pdf_path: str | Path, cfg: Optional[OCRConfig] = None) -> Tuple[str, List[str]]:
+    """OCR every page of a PDF.
+
+    Args:
+        pdf_path: PDF file to read.
+        cfg: OCR settings; a default ``OCRConfig`` is used when omitted.
+
+    Returns:
+        A pair ``(all_text, page_texts)``: the per-page texts joined with
+        newlines, and the list of per-page texts itself.
+    """
+    if not cfg:
+        cfg = OCRConfig()
+    page_texts = []
+    for page in pdf_to_images(pdf_path, dpi=cfg.dpi):
+        page_texts.append(ocr_image(page, cfg))
+    combined = "\n".join(page_texts)
+    return combined, page_texts
diff --git a/image:PDF_scanner/scan_media.py b/image:PDF_scanner/scan_media.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+import argparse
+from pathlib import Path
+import json
+
+from file_handler import *
+from ocr_engine import ocr_image, ocr_pdf, OCRConfig
+from PIL import Image
+
+def _output_path(out_dir: Path, source: Path, taken: set) -> Path:
+    """Pick a ``.txt`` path in *out_dir* for *source* not yet used in this run."""
+    candidate = out_dir / (source.stem + ".txt")
+    if candidate in taken:
+        # Another source with the same stem (a.pdf vs a.png, or the same name
+        # in a sub-folder) already claimed it: keep the extension in the name.
+        candidate = out_dir / (source.name + ".txt")
+    counter = 1
+    while candidate in taken:
+        candidate = out_dir / f"{source.name}.{counter}.txt"
+        counter += 1
+    taken.add(candidate)
+    return candidate
+
+def main():
+    """Interactively OCR every PDF and image below a user-chosen directory.
+
+    The directory is asked for on stdin and searched recursively. The text of
+    each PDF or image is written to a ``.txt`` file placed directly in that
+    directory. Files whose OCR fails are reported with a warning and skipped.
+    """
+    cfg = OCRConfig()
+    dir_path = Path(get_valid_path())
+    dir_path.mkdir(parents=True, exist_ok=True)
+
+    records = []
+    taken = set()  # output paths already written during this run
+    for path in find_files(dir_path):
+        p = Path(path)
+        ext = p.suffix.lower()
+        try:
+            if ext in PDF_EXTS:
+                text, _ = ocr_pdf(p, cfg)
+            elif ext in IMAGE_EXTS:
+                # Close the file handle as soon as the image has been read.
+                with Image.open(p) as img:
+                    text = ocr_image(img, cfg)
+            else:
+                continue
+        except Exception as e:
+            print(f"[WARN] OCR failed for {p}: {e}")
+            continue
+
+        out_txt = _output_path(dir_path, p, taken)
+        out_txt.write_text(text, encoding="utf-8")
+        records.append({"source": str(p), "text_path": str(out_txt), "chars": len(text)})
+
+    print(f"Done. Wrote {len(records)} files to {dir_path}.")
+
+if __name__ == "__main__":
+    main()
diff --git a/image:PDF_scanner/tests/test_ocr_engine.py b/image:PDF_scanner/tests/test_ocr_engine.py
@@ -0,0 +1,13 @@
+from PIL import Image, ImageDraw
+from ocr_engine import ocr_image, OCRConfig
+
+def _make_test_img(text: str = "Hello OCR"):
+    """Render *text* in black on a white 600x200 RGB canvas and return it."""
+    canvas = Image.new("RGB", (600, 200), "white")
+    ImageDraw.Draw(canvas).text((50, 80), text, fill="black")
+    return canvas
+
+def test_basic_ocr():
+    """OCR of a synthetic image should recover the word "Secret"."""
+    recognised = ocr_image(_make_test_img("Secret Key: ABCD"), OCRConfig())
+    assert "Secret" in recognised