Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
49 changes: 49 additions & 0 deletions image:PDF_scanner/file_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from __future__ import annotations
import os
from pathlib import Path
from typing import Iterable, List

# Lower-case file suffixes (including the leading dot) recognised by
# find_files() when no explicit glob patterns are supplied.
TEXT_EXTS = {".txt", ".md"}
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp"}
PDF_EXTS = {".pdf"}

def find_files(root: str | os.PathLike, patterns: Iterable[str] | None = None) -> List[Path]:
    """Recursively collect files below *root*.

    Args:
        root: Directory to search (all sub-directories are included).
        patterns: Optional glob patterns, each tested with ``Path.match``.
            A single string is treated as one pattern. When omitted or
            empty, files are selected by suffix using ``TEXT_EXTS``,
            ``IMAGE_EXTS`` and ``PDF_EXTS`` instead.

    Returns:
        The matching file paths, sorted so the order is reproducible.
    """
    root = Path(root)

    if isinstance(patterns, str):
        # A bare string would otherwise be iterated character by character
        # (and its "*" alone would match every file).
        pattern_list = (patterns,)
    elif patterns is None:
        pattern_list = ()
    else:
        # Materialise once: a generator is always truthy and would be
        # exhausted after the first file, dropping every later match.
        pattern_list = tuple(patterns)

    if pattern_list:
        def wanted(p: Path) -> bool:
            return any(p.match(pat) for pat in pattern_list)
    else:
        # Build the union once instead of once per visited file.
        known_exts = TEXT_EXTS | IMAGE_EXTS | PDF_EXTS

        def wanted(p: Path) -> bool:
            return p.suffix.lower() in known_exts

    return sorted(p for p in root.rglob("*") if p.is_file() and wanted(p))

def read_file(path: str | os.PathLike, binary: bool = False) -> bytes | str:
    """Return the full contents of *path*.

    Args:
        path: File to read.
        binary: When true, return the raw ``bytes``. Otherwise decode the
            file as UTF-8, replacing undecodable bytes with U+FFFD.

    Returns:
        ``bytes`` when *binary* is true, otherwise ``str``.

    Raises:
        IOError: If the file cannot be opened or read. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        if binary:
            with open(path, "rb") as f:
                return f.read()
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception as e:
        # Deliberately broad: callers rely on every failure surfacing as
        # IOError. Chain explicitly so the root cause stays attached.
        raise IOError(f"Failed to read {path}: {e}") from e

# Keep prompting until the user names an existing directory or accepts the default
def get_valid_path():
    """Ask on stdin for a directory and return it.

    Surrounding whitespace and quote characters are removed from the answer.
    An empty answer selects the current working directory; an answer that is
    not an existing directory triggers another prompt.
    """
    prompt = "Enter the directory path to scan and save the files (press Enter to save in the project folder): "
    divider = "-" * 63

    candidate = input(prompt).strip().strip('"').strip("'")
    while candidate and not os.path.isdir(candidate):
        print("We cannot find that path. Please enter a valid directory or press Enter to use the project folder.")
        candidate = input(prompt).strip().strip('"').strip("'")

    if candidate:
        print(divider)
        return candidate

    # Empty answer: fall back to the current working directory.
    print("No path provided. Files will be saved in the project folder.")
    print(divider)
    return os.getcwd()
92 changes: 92 additions & 0 deletions image:PDF_scanner/ocr_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Optional, Tuple
from pathlib import Path
import re

import numpy as np
from PIL import Image
import pytesseract
import cv2

try:
from pdf2image import convert_from_path
PDF2IMAGE_AVAILABLE = True
except Exception:
PDF2IMAGE_AVAILABLE = False

@dataclass
class OCRConfig:
    """Settings shared by the preprocessing and Tesseract calls in this module."""

    # Target resolution: passed to the PDF rasteriser by ocr_pdf() and used
    # by _normalize_dpi() as the DPI below which images are upscaled.
    dpi: int = 300
    # When true, preprocess_image() runs _deskew() on the image.
    deskew: bool = True
    # When true, preprocess_image() runs _binarize() on the image.
    binarize: bool = True
    # Forwarded to Tesseract as "--oem <value>".
    oem: int = 3
    # Forwarded to Tesseract as "--psm <value>".
    psm: int = 3
    # Forwarded to pytesseract.image_to_string() as its `lang` argument.
    lang: str = "eng"

def _to_cv(img: Image.Image) -> np.ndarray:
    """Convert a PIL image into a BGR array for OpenCV.

    Args:
        img: Source image in any PIL mode.

    Returns:
        The pixel data as an array in BGR channel order.
    """
    if img.mode != "RGB":
        # Grayscale, palette and 1-bit images produce a 2-D array, which
        # COLOR_RGB2BGR rejects; normalise to three RGB channels first.
        img = img.convert("RGB")
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

def _to_pil(arr: np.ndarray) -> Image.Image:
    """Wrap a BGR OpenCV array as a PIL image (channels reordered to RGB)."""
    rgb = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb)

def _normalize_dpi(img: Image.Image, target_dpi: int) -> Image.Image:
    """Upscale *img* so its recorded resolution is at least *target_dpi*.

    The horizontal DPI is read from ``img.info["dpi"]``. Missing, malformed
    or non-positive values are treated as "already at target", so the image
    is left at its current size instead of raising.

    Args:
        img: Image to normalise.
        target_dpi: Minimum resolution wanted for OCR.

    Returns:
        The (possibly resized) image with ``info["dpi"]`` set to
        ``(target_dpi, target_dpi)``.
    """
    recorded = img.info.get("dpi", (target_dpi, target_dpi))
    try:
        dpi = float(recorded[0])
    except (TypeError, ValueError, IndexError):
        dpi = float(target_dpi)
    if dpi <= 0:
        # Some files carry a zero DPI tag; dividing by it would crash.
        dpi = float(target_dpi)

    if dpi < target_dpi:
        scale = target_dpi / dpi
        new_size = (int(img.width * scale), int(img.height * scale))
        img = img.resize(new_size, Image.LANCZOS)
    img.info["dpi"] = (target_dpi, target_dpi)
    return img

def _deskew(cv_img: np.ndarray) -> np.ndarray:
    """Rotate *cv_img* so that its text content is axis-aligned.

    The skew is estimated from the minimum-area rectangle around all
    foreground pixels (after inversion and Otsu thresholding).

    Args:
        cv_img: BGR image.

    Returns:
        The rotated image (same size), or *cv_img* unchanged when no
        foreground pixels are found.
    """
    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
    inverted = cv2.bitwise_not(gray)
    thresh = cv2.threshold(inverted, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    coords = np.column_stack(np.where(thresh > 0))
    if coords.size == 0:
        return cv_img

    # minAreaRect works on 32-bit point sets; np.where yields platform ints.
    angle = cv2.minAreaRect(coords.astype(np.float32))[-1]

    # Fold the reported angle into a correction within [-45, 45].
    # Older OpenCV releases report angles in [-90, 0); newer ones (4.5.1+,
    # as far as the changelog indicates) report (0, 90]. Without the
    # `> 45` branch an unskewed page reported as 90 would be turned -90.
    if angle < -45:
        angle = -(90 + angle)
    elif angle > 45:
        angle = 90 - angle
    else:
        angle = -angle

    (h, w) = cv_img.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
    # Replicate border pixels so the corners exposed by rotation stay
    # background-coloured instead of black.
    return cv2.warpAffine(cv_img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

def _binarize(cv_img: np.ndarray) -> np.ndarray:
    """Return a black-and-white copy of *cv_img*, still as a 3-channel BGR array.

    Uses Gaussian adaptive thresholding with a 35-pixel neighbourhood and a
    constant of 11 subtracted from the local mean.
    """
    block_size, offset = 35, 11
    grayscale = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
    mask = cv2.adaptiveThreshold(
        grayscale,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )
    return cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)

def preprocess_image(img: Image.Image, cfg: OCRConfig) -> Image.Image:
    """Prepare *img* for OCR according to *cfg*.

    The image is first brought to ``cfg.dpi``; deskewing and binarisation
    are then applied, in that order, when enabled in *cfg*.
    """
    frame = _to_cv(_normalize_dpi(img, cfg.dpi))
    pipeline = (
        (cfg.deskew, _deskew),
        (cfg.binarize, _binarize),
    )
    for enabled, step in pipeline:
        if enabled:
            frame = step(frame)
    return _to_pil(frame)

def _tesseract_args(cfg: OCRConfig) -> str:
    """Build the Tesseract command-line option string from *cfg*."""
    return "--oem {} --psm {}".format(cfg.oem, cfg.psm)

def ocr_image(img: Image.Image, cfg: Optional[OCRConfig] = None) -> str:
    """Run OCR on *img* and return the recognised text, stripped.

    A default ``OCRConfig`` is used when *cfg* is not supplied.
    """
    settings = cfg if cfg else OCRConfig()
    prepared = preprocess_image(img, settings)
    recognised = pytesseract.image_to_string(
        prepared,
        lang=settings.lang,
        config=_tesseract_args(settings),
    )
    return recognised.strip()

def pdf_to_images(pdf_path: str | Path, dpi: int = 300) -> List[Image.Image]:
    """Rasterise every page of *pdf_path* at *dpi* via pdf2image.

    Raises:
        RuntimeError: If pdf2image could not be imported.
    """
    if PDF2IMAGE_AVAILABLE:
        return convert_from_path(str(pdf_path), dpi=dpi)
    raise RuntimeError("pdf2image not available or poppler missing.")

def ocr_pdf(pdf_path: str | Path, cfg: Optional[OCRConfig] = None) -> Tuple[str, List[str]]:
    """OCR every page of a PDF.

    Returns:
        A pair ``(full_text, page_texts)`` where ``full_text`` is the page
        texts joined with newlines.
    """
    settings = cfg if cfg else OCRConfig()
    page_texts: List[str] = []
    for page in pdf_to_images(pdf_path, dpi=settings.dpi):
        page_texts.append(ocr_image(page, settings))
    return "\n".join(page_texts), page_texts
37 changes: 37 additions & 0 deletions image:PDF_scanner/scan_media.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from __future__ import annotations
import argparse
from pathlib import Path
import json

from file_handler import *
from ocr_engine import ocr_image, ocr_pdf, OCRConfig
from PIL import Image

def _output_name(src: Path, root: Path) -> str:
    """Return a flat ``.txt`` file name for *src* that is unique within *root*.

    The relative directory and the original suffix are kept in the name
    (``sub/scan.pdf`` -> ``sub__scan.pdf.txt``) so that same-named files in
    different folders, ``scan.pdf`` next to ``scan.png``, or an existing
    ``scan.txt`` source no longer overwrite one another.
    """
    return "__".join(src.relative_to(root).parts) + ".txt"


def main():
    """Interactively OCR every PDF and image below a user-chosen directory.

    Each recognised text is written as UTF-8 into the chosen directory;
    files for which OCR fails are reported and skipped.
    """
    cfg = OCRConfig()
    dir_path = Path(get_valid_path())
    dir_path.mkdir(parents=True, exist_ok=True)

    records = []
    for p in find_files(dir_path):
        suffix = p.suffix.lower()
        try:
            # PDF_EXTS / IMAGE_EXTS come from file_handler, keeping the
            # supported formats defined in one place.
            if suffix in PDF_EXTS:
                text, _ = ocr_pdf(p, cfg)
            elif suffix in IMAGE_EXTS:
                # Close the file handle as soon as OCR is done.
                with Image.open(p) as img:
                    text = ocr_image(img, cfg)
            else:
                continue
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"[WARN] OCR failed for {p}: {e}")
            continue

        out_txt = dir_path / _output_name(p, dir_path)
        out_txt.write_text(text, encoding="utf-8")
        records.append({"source": str(p), "text_path": str(out_txt), "chars": len(text)})

    print(f"Done. Wrote {len(records)} files to {dir_path}.")


if __name__ == "__main__":
    main()
13 changes: 13 additions & 0 deletions image:PDF_scanner/tests/test_ocr_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from PIL import Image, ImageDraw
from ocr_engine import ocr_image, OCRConfig

def _make_test_img(text: str = "Hello OCR"):
    """Render *text* in black on a white 600x200 RGB canvas."""
    canvas = Image.new("RGB", (600, 200), "white")
    ImageDraw.Draw(canvas).text((50, 80), text, fill="black")
    return canvas

def test_basic_ocr():
    """OCR of a rendered string should recover at least its first word."""
    recognised = ocr_image(_make_test_img("Secret Key: ABCD"), OCRConfig())
    assert "Secret" in recognised
Loading