Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions man_spider/lib/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
from time import sleep
import subprocess as sp
- from extractous import Extractor
+ from kreuzberg import extract_file_sync

from man_spider.lib.util import *
from man_spider.lib.logger import *
Expand All @@ -26,7 +26,6 @@ class FileParser:

def __init__(self, filters, quiet=False):
self.init_content_filters(filters)
- self.extractor = Extractor()
self.quiet = quiet


Expand Down Expand Up @@ -117,7 +116,7 @@ def parse_file(self, file, pretty_filename=None):

try:

- matches = self.extractous(file, pretty_filename=pretty_filename)
+ matches = self.extract_text(file, pretty_filename=pretty_filename)

except Exception as e:
if log.level <= logging.DEBUG:
Expand All @@ -128,20 +127,19 @@ def parse_file(self, file, pretty_filename=None):
return matches


- def extractous(self, file, pretty_filename):
+ def extract_text(self, file, pretty_filename):
'''
- Extracts text from a file using the extractous library
+ Extracts text from a file using the kreuzberg library
'''

matches = dict()

suffix = Path(str(file)).suffix.lower()

# blacklist certain mime types
if not self.match_magic(file):
return matches

- text_content, metadata = self.extractor.extract_file_to_string(str(file))
+ result = extract_file_sync(str(file))
+ text_content = result.content

# try to convert to UTF-8 for grep-friendliness
try:
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ authors = [
]
license = {text = "GPL-3.0"}
readme = "README.md"
- requires-python = ">=3.8,<3.14"
+ requires-python = ">=3.10,<3.14"
dependencies = [
- "extractous (>=0.3.0,<0.4.0)",
+ "kreuzberg (>=0.9.0)",
"impacket (>=0.12.0,<0.13.0)",
"python-magic (>=0.4.27,<0.5.0)"
]
Expand Down