def read_file(path: str | os.PathLike, binary: bool = False) -> bytes | str:
    """Return the entire contents of *path*.

    Args:
        path: Location of the file to read.
        binary: When true, open in ``"rb"`` mode and return the raw
            ``bytes``. Otherwise decode as UTF-8, substituting U+FFFD for
            undecodable bytes, and return ``str``.

    Returns:
        The file contents as ``bytes`` (binary mode) or ``str`` (text mode).

    Raises:
        IOError: If opening or reading fails for any reason. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        if binary:
            with open(path, "rb") as f:
                return f.read()
        # errors="replace" keeps a single bad byte from aborting the read.
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception as e:
        # Every failure is still surfaced as IOError, exactly as before;
        # "from e" records the original error as the explicit cause so the
        # traceback shows what actually went wrong.
        raise IOError(f"Failed to read {path}: {e}") from e
def _deskew(cv_img: np.ndarray) -> np.ndarray:
    """Rotate *cv_img* so that its detected foreground is level.

    The image is converted with ``COLOR_BGR2GRAY`` (so it is treated as BGR),
    inverted, and Otsu-thresholded. The minimum-area rectangle around all
    non-zero pixels gives the skew angle, and the image is rotated about its
    centre by the opposite amount.

    Args:
        cv_img: Image array; its first two dimensions are read as
            (height, width).

    Returns:
        The rotated image at the same width and height, or *cv_img* itself
        when thresholding leaves no foreground pixels.
    """
    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
    # Invert before thresholding so the darker pixels (presumably the text)
    # end up as the non-zero foreground.
    gray = cv2.bitwise_not(gray)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    coords = np.column_stack(np.where(thresh > 0))
    if coords.size == 0:
        return cv_img

    rect_angle = cv2.minAreaRect(coords)[-1]
    # minAreaRect's angle is only meaningful modulo 90 degrees, and its
    # reported range differs between OpenCV releases: [-90, 0) in older
    # versions, (0, 90] in newer ones (reportedly 4.5.1 onwards). Folding it
    # into [-45, 45) yields the same correction as the previous
    # "if angle < -45" logic for the old range and stops an upright page
    # (angle == 90) from being rotated a quarter turn under the new one.
    skew = (rect_angle + 45.0) % 90.0 - 45.0
    angle = -skew

    (h, w) = cv_img.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
    # BORDER_REPLICATE fills the corners exposed by the rotation with edge
    # pixels rather than black.
    return cv2.warpAffine(
        cv_img,
        M,
        (w, h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
def _unique_output_path(directory: Path, source: Path, taken: set) -> Path:
    """Choose a ``.txt`` path in *directory* for *source* that is not in *taken*.

    Tries ``<stem>.txt`` first (the historical name), then
    ``<full name>.txt``, then ``<full name>.<n>.txt`` with n counting up
    from 2.
    """
    for name in (source.stem + ".txt", source.name + ".txt"):
        candidate = directory / name
        if candidate not in taken:
            return candidate
    n = 2
    while True:
        candidate = directory / f"{source.name}.{n}.txt"
        if candidate not in taken:
            return candidate
        n += 1


def main():
    """Run OCR on every PDF and image under a user-chosen directory.

    Prompts for the directory via ``get_valid_path``, walks it with
    ``find_files``, runs ``ocr_pdf`` or ``ocr_image`` on each supported file,
    and writes the recognised text as UTF-8 ``.txt`` files directly into the
    chosen directory. Files whose OCR raises are reported with a ``[WARN]``
    line and skipped. A summary line is printed at the end.
    """
    image_suffixes = {".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp"}

    cfg = OCRConfig()
    dir_path = Path(get_valid_path())
    dir_path.mkdir(parents=True, exist_ok=True)

    records = []
    # Output paths already produced in this run. Sources that share a stem
    # (a.pdf / a.png, or same-named files in different sub-directories)
    # used to overwrite each other's text while still being counted.
    written = set()
    for path in find_files(dir_path):
        p = Path(path)
        suffix = p.suffix.lower()
        try:
            if suffix == ".pdf":
                text, _ = ocr_pdf(p, cfg)
            elif suffix in image_suffixes:
                # The context manager releases the file handle as soon as
                # OCR is finished instead of leaving it to the GC.
                with Image.open(p) as img:
                    text = ocr_image(img, cfg)
            else:
                continue
        except Exception as e:
            # Best-effort batch job: one unreadable file must not abort the run.
            print(f"[WARN] OCR failed for {p}: {e}")
            continue

        out_txt = _unique_output_path(dir_path, p, written)
        out_txt.write_text(text, encoding="utf-8")
        written.add(out_txt)
        records.append({"source": str(p), "text_path": str(out_txt), "chars": len(text)})

    print(f"Done. Wrote {len(records)} files to {dir_path}.")


if __name__ == "__main__":
    main()