Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ dist/*
*.pyc
*.pyo
slate.egg-info
.cache
.*.swp
40 changes: 20 additions & 20 deletions README → README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,24 @@ text from PDF files. It depends on the PDFMiner package.

Slate provides one class, PDF. PDF takes a file-like object and
will extract all text from the document, presenting each page
as a string of text:
as a string of text::

>>> with open('example.pdf') as f:
... doc = slate.PDF(f)
...
>>> doc
[..., ..., ...]
>>> doc[1]
'Text from page 2...'
>>> with open('example.pdf', 'rb') as f:
... doc = slate.PDF(f)
...
>>> doc
[..., ..., ...]
>>> doc[1]
'Text from page 2...'

If your PDF is password-protected, pass the password as the
second argument:
second argument::

>>> with open('secrets.pdf') as f:
... doc = slate.PDF(f, 'password')
...
>>> doc[0]
"My mother doesn't know this, but..."
>>> with open('secrets.pdf', 'rb') as f:
... doc = slate.PDF(f, 'password')
...
>>> doc[0]
"My mother doesn't know this, but..."

More complex operations
-----------------------
Expand All @@ -37,10 +37,10 @@ information, then take some time to learn the PDFMiner API.
What is wrong with PDFMiner?
----------------------------

1. Getting simple things done, like extracting the text
is quite complex. The program is not designed to return
Python objects, which makes interfacing things irritating.
2. It's an extremely complete set of tools, with multiple
and moderately steep learning curves.
3. It's not written with hackability in mind.
1. Getting simple things done, like extracting the text
is quite complex. The program is not designed to return
Python objects, which makes interfacing things irritating.
2. It's an extremely complete set of tools, with multiple
and moderately steep learning curves.
3. It's not written with hackability in mind.

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
else:
pdfminer = 'pdfminer'

with open('README') as f:
with open('README.rst') as f:
long_description = f.read()

setup(name='slate',
Expand Down
2 changes: 1 addition & 1 deletion src/slate/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from pdfminer.pdfparser import PDFPage
except ImportError:
from pdfminer.pdfpage import PDFPage
import utils
from . import utils

__all__ = ['PDF']

Expand Down
22 changes: 17 additions & 5 deletions src/slate/test_slate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,19 @@
http://codespeak.net/py/dist/test/index.html
"""

from classes import PDF
import os
import pytest

def pytest_funcarg__doc(request):
with open('example.pdf', 'rb') as f:
from .classes import PDF

@pytest.fixture
def doc():
with open(get_pdf_path('example.pdf'), 'rb') as f:
return PDF(f)

def pytest_funcarg__passwd(request):
with open('protected.pdf') as f:
@pytest.fixture
def passwd():
with open(get_pdf_path('protected.pdf'), 'rb') as f:
return PDF(f, 'a')

def test_basic(doc):
Expand All @@ -30,3 +35,10 @@ def test_text_method_unclean(doc):

def test_password(passwd):
assert passwd[0] == "Chamber of secrets.\n\n\x0c"

def get_pdf_path(pdf_file):
return os.path.join(
os.path.dirname(__file__),
pdf_file)


11 changes: 9 additions & 2 deletions src/slate/unittests.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import unittest

import os
from slate import PDF

class TestSlate(unittest.TestCase):
def setUp(self):
with open('example.pdf', 'rb') as f:
with open(get_pdf_path('example.pdf'), 'rb') as f:
self.doc = PDF(f)
with open('protected.pdf', 'rb') as f:
with open(get_pdf_path('protected.pdf'), 'rb') as f:
self.passwd = PDF(f, 'a')

def test_basic(self):
Expand All @@ -27,5 +28,11 @@ def test_text_method_unclean(self):
def test_password(self):
assert self.passwd[0] == "Chamber of secrets.\n\n\x0c"


def get_pdf_path(pdf_file):
return os.path.join(
os.path.dirname(__file__),
pdf_file)

if __name__ == '__main__':
unittest.main()