From 5da575d8fcf3d7fc2c8aedc9acaf090fd555f0cd Mon Sep 17 00:00:00 2001 From: probird5 <52969604+probird5@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:40:16 -0500 Subject: [PATCH 1/3] Replace Extractous with Kreuzberg for text extraction Fixes #56 --- man_spider/lib/parser/parser.py | 14 ++++++-------- pyproject.toml | 4 ++-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py index cea5911..0b8299b 100644 --- a/man_spider/lib/parser/parser.py +++ b/man_spider/lib/parser/parser.py @@ -3,7 +3,7 @@ import logging from time import sleep import subprocess as sp -from extractous import Extractor +from kreuzberg import extract_file_sync from man_spider.lib.util import * from man_spider.lib.logger import * @@ -26,7 +26,6 @@ class FileParser: def __init__(self, filters, quiet=False): self.init_content_filters(filters) - self.extractor = Extractor() self.quiet = quiet @@ -117,7 +116,7 @@ def parse_file(self, file, pretty_filename=None): try: - matches = self.extractous(file, pretty_filename=pretty_filename) + matches = self.extract_text(file, pretty_filename=pretty_filename) except Exception as e: if log.level <= logging.DEBUG: @@ -128,20 +127,19 @@ def parse_file(self, file, pretty_filename=None): return matches - def extractous(self, file, pretty_filename): + def extract_text(self, file, pretty_filename): ''' - Extracts text from a file using the extractous library + Extracts text from a file using the kreuzberg library ''' matches = dict() - suffix = Path(str(file)).suffix.lower() - # blacklist certain mime types if not self.match_magic(file): return matches - text_content, metadata = self.extractor.extract_file_to_string(str(file)) + result = extract_file_sync(str(file)) + text_content = result.content # try to convert to UTF-8 for grep-friendliness try: diff --git a/pyproject.toml b/pyproject.toml index cd8ed50..73ba494 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,9 +7,9 @@ authors = [ ] license = {text = "GPL-3.0"} readme = "README.md" -requires-python = ">=3.8,<3.14" +requires-python = ">=3.10,<3.14" dependencies = [ - "extractous (>=0.3.0,<0.4.0)", + "kreuzberg (>=0.9.0)", "impacket (>=0.12.0,<0.13.0)", "python-magic (>=0.4.27,<0.5.0)" ] From 0b6f6db41e3739819ffebf040c70d2118f94ca37 Mon Sep 17 00:00:00 2001 From: probird5 <52969604+probird5@users.noreply.github.com> Date: Wed, 28 Jan 2026 14:07:14 -0500 Subject: [PATCH 2/3] fix python version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 73ba494..0c2679a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ authors = [ ] license = {text = "GPL-3.0"} readme = "README.md" -requires-python = ">=3.10,<3.14" +requires-python = ">=3.10,<3.15" dependencies = [ "kreuzberg (>=0.9.0)", "impacket (>=0.12.0,<0.13.0)", From a34ad2a3dce661a5f4325b1084740b4380f61c6e Mon Sep 17 00:00:00 2001 From: probird5 <52969604+probird5@users.noreply.github.com> Date: Wed, 28 Jan 2026 14:54:14 -0500 Subject: [PATCH 3/3] reverted python version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0c2679a..73ba494 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ authors = [ ] license = {text = "GPL-3.0"} readme = "README.md" -requires-python = ">=3.10,<3.15" +requires-python = ">=3.10,<3.14" dependencies = [ "kreuzberg (>=0.9.0)", "impacket (>=0.12.0,<0.13.0)",