Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 3 additions & 14 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ jobs:
build:
working_directory: ~/tika
docker:
- image: circleci/ruby:2.3.7-browsers
- image: circleci/ruby:2.7.5-browsers
environment:
RAILS_ENV: test
steps:
Expand All @@ -27,7 +27,7 @@ jobs:
test:
working_directory: ~/tika
docker:
- image: circleci/ruby:2.3.7-browsers
- image: circleci/ruby:2.7.5-browsers
environment:
RAILS_ENV: test
steps:
Expand All @@ -39,7 +39,7 @@ jobs:
rubocop:
working_directory: ~/tika
docker:
- image: circleci/ruby:2.3.7-browsers
- image: circleci/ruby:2.7.5-browsers
environment:
RAILS_ENV: test
steps:
Expand All @@ -56,14 +56,3 @@ workflows:
- test:
requires:
- build
- rubocop:
requires:
- build
jobs:
- build
- test:
requires:
- build
- rubocop:
requires:
- build
10 changes: 7 additions & 3 deletions .rubocop.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
AllCops:
DisplayCopNames: true
TargetRubyVersion: 2.3
TargetRubyVersion: 2.7
Exclude:
- 'ext/*'
- 'vendor/**/*'

Lint/MissingSuper:
Enabled: false

Metrics/AbcSize:
Enabled: true
Max: 28
Expand All @@ -23,5 +26,6 @@ Metrics/MethodLength:

Style/Documentation:
Enabled: false
Exclude:
- 'spec/**/*'

Style/NumericPredicate:
Enabled: false
5 changes: 5 additions & 0 deletions .tool-versions
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
ruby 2.7.5
nodejs 16.15.0
java zulu-8.56.0.21
python 3.9.0
yarn 1.22.10
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ Ruby Tika Parser
======

[![CircleCI](https://circleci.com/gh/pulibrary/ruby_tika_app.svg?style=svg)](https://circleci.com/gh/pulibrary/ruby_tika_app)
[![Coverage Status](https://coveralls.io/repos/github/pulibrary/ruby_tika_app/badge.svg)](https://coveralls.io/github/pulibrary/ruby_tika_app)

### Introduction

Expand Down
Binary file renamed ext/tika-app-1.18.jar → ext/tika-app-1.24.1.jar
Binary file not shown.
9 changes: 9 additions & 0 deletions ext/tika-config.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<properties>
<service-loader initializableProblemHandler="ignore"/>
<parsers>
<!-- Default Parser for most things, except for 2 mime types, and never
use the SQLite3 Parser -->
<parser class="org.apache.tika.parser.DefaultParser">
<mime-exclude>image/jpeg</mime-exclude>
<mime-exclude>application/x-sqlite3</mime-exclude>
<parser-exclude class="org.apache.tika.parser.jdbc.SQLite3Parser"/>
</parser>
</parsers>
</properties>
52 changes: 17 additions & 35 deletions lib/ruby_tika_app.rb
Original file line number Diff line number Diff line change
@@ -1,27 +1,39 @@
# frozen_string_literal: true

# Based on the rake remote task code

require 'rubygems'
require 'stringio'
require 'open4'

class RubyTikaApp
TIKA_APP_VERSION = '1.24.1'

# Base class for the errors defined by RubyTikaApp, so callers can rescue
# the whole family with a single clause.
class Error < RuntimeError; end

# Raised by run_tika when the Tika command yields nothing on stdout but
# wrote something to stderr. The value given to the constructor is kept
# and exposed through #status.
class CommandFailedError < Error
  # @return [Object] the value passed to #initialize
  attr_reader :status

  # @param status [Object] failure detail (run_tika passes the captured
  #   stderr text)
  def initialize(status)
    # Hand a default message to Exception#initialize so that an instance
    # raised without an explicit message still reports the status rather
    # than only the class name. A message supplied via
    # `raise(instance, message)` still takes precedence.
    super("execution failed with status #{status}")
    @status = status
  end
end

# Prepares the Tika command line for a single document.
#
# @param document [String] either an http(s) URL, which is used as given,
#   or anything else, which is treated as a local path and prefixed with
#   "file://"
# @param config [String, nil] path to a Tika configuration file; when nil
#   the ext/tika-config.xml located relative to this file is used
def initialize(document, config = nil)
  @config = config

  # \A pins the match to the very start of the string, so a URL that merely
  # appears somewhere inside a local path does not count. to_s keeps nil
  # and non-String arguments on the local-path branch.
  @document = if %r{\Ahttps?://\S+}.match?(document.to_s)
                document
              else
                "file://#{document}"
              end

  lib_dir = File.dirname(__FILE__)
  tika_path = "#{lib_dir}/../ext/tika-app-#{TIKA_APP_VERSION}.jar"
  tika_config_path = @config || File.join(lib_dir, '..', 'ext', 'tika-config.xml')

  java_cmd = 'java'
  java_args = '-server -Djava.awt.headless=true -Dfile.encoding=UTF-8'

  # NOTE(review): both paths are wrapped in single quotes for the shell; a
  # path that itself contains a single quote would break the command.
  @tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}' --config='#{tika_config_path}'"
end

def to_xml
Expand All @@ -32,7 +44,7 @@ def to_html
run_tika('--html')
end

# Runs Tika with the --json option and returns whatever run_tika returns.
#
# Any arguments are accepted and ignored, which keeps the method
# call-compatible with the to_json(*args) convention of Ruby's json
# library.
def to_json(*_args)
  run_tika('--json')
end

Expand All @@ -50,23 +62,15 @@ def to_metadata

private

def java_args
'-server -Djava.awt.headless=true'
end

def java_cmd
'java'
end

def run_tika(option)
final_cmd = "#{tika_cmd} #{option} '#{@document}'"
final_cmd = "#{@tika_cmd} #{option} '#{@document}'"

_pid, stdin, stdout, stderr = Open4.popen4(final_cmd)
_, stdin, stdout, stderr = Open4.popen4(final_cmd)

stdout_result = stdout.read.strip
stderr_result = stderr.read.strip

unless strip_stderr(stderr_result).empty?
if stdout_result.empty? && !stderr_result.empty?
raise(CommandFailedError.new(stderr_result),
"execution failed with status #{stderr_result}: #{final_cmd}")
end
Expand All @@ -77,26 +81,4 @@ def run_tika(option)
stdout.close
stderr.close
end

def strip_stderr(error_message)
errors = error_message.split("\n")
real_errors = errors.reject { |error| error =~ /(INFO|WARN)/ }
real_errors.empty? ? real_errors : real_errors.join("\n")
end

def tika_cmd
"#{java_cmd} #{java_args} -jar '#{tika_path}' #{tika_config}"
end

def tika_config
"--config=#{tika_config_path}"
end

def tika_config_path
@config || File.join(File.dirname(__FILE__), '..', 'ext', 'tika-config.xml')
end

def tika_path
File.join(File.dirname(__FILE__), '..', 'ext', 'tika-app-1.18.jar')
end
end
6 changes: 3 additions & 3 deletions ruby_tika_app.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ Gem::Specification.new do |s|
s.require_paths = %w[lib]

s.add_runtime_dependency 'open4'
s.required_ruby_version = '>= 2.7'

s.add_development_dependency 'bundler', '>= 1.0.15'
s.add_development_dependency 'coveralls'
s.add_development_dependency 'json'
s.add_development_dependency 'pry'
s.add_development_dependency 'rake'
s.add_development_dependency 'rspec', '~> 3.3.0'
s.add_development_dependency 'rspec', '~> 3.9.0'
s.add_development_dependency 'rubocop'
s.add_development_dependency 'simplecov'
s.add_development_dependency 'stub_server'
s.add_development_dependency 'thin'
end
86 changes: 41 additions & 45 deletions spec/ruby_tika_app_spec.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
# frozen_string_literal: true

require 'spec_helper'
require 'stub_server'

describe RubyTikaApp do
let(:doc_path) { File.join(File.dirname(__FILE__), 'docs') }
let(:test_file) { "#{doc_path}/graph sampling simplex - 11.pdf" }
before(:each) do
doc_path = "#{File.join(File.dirname(__FILE__))}/docs"

@test_file = "#{doc_path}/graph sampling simplex - 11.pdf"

@cnn_com_file = "#{doc_path}/cnn.com"
@news_ycombinator_com_file = "#{doc_path}/news.ycombinator.com"
end

describe 'Error' do
it 'has an error' do
Expand All @@ -16,111 +21,102 @@
end
end

describe 'CommandFailedError' do
it 'is raised correctly' do
expect do
rta = RubyTikaApp.new('/file_not_found.pdf')
rta.to_text
end.to raise_error(RubyTikaApp::CommandFailedError)
end
end

describe '#to_xml' do
it 'header' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
expect(rta.to_xml[0..37]).to eq('<?xml version="1.0" encoding="UTF-8"?>')
end

it 'middle' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
xml = rta.to_xml

xml_size = xml.size / 2

expect(xml[xml_size..(xml_size + 100)]).to eq("d in Frontier Sampling (FS).\nSince this is the only difference between MHRW and USDSG,\nto be simple, ")
expect(xml[xml_size..(xml_size + 100)]).to eq("pply USDSG, we\nneed to change a directed graph Gd to a symmetric graph\nG. This methodology is also us")
end
end

describe '#to_html' do
it 'header' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
expect(rta.to_html[0..42]).to eq('<html xmlns="http://www.w3.org/1999/xhtml">')
end

it 'middle' do
rta = RubyTikaApp.new(test_file)
expect(rta.to_html[1000...1100]).to eq("Z\"/>\n<meta name=\"meta:save-date\" content=\"2011-03-29T13:00:16Z\"/>\n<meta name=\"pdf:encrypted\" content")
rta = RubyTikaApp.new(@test_file)
expect(rta.to_html[1000...1100]).to eq("nfo:modified\" content=\"2011-03-29T13:00:16Z\"/>\n<meta name=\"meta:save-date\" content=\"2011-03-29T13:00")
end
end

describe '#to_json' do
it 'header' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
expect(rta.to_json[0..42]).to eq('{"Application":"\\u0027Certified by IEEE PDF')
end

it 'middle' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
expect(rta.to_json[100...150]).to eq('"171510","Content-Type":"application/pdf","Creatio')
end
end

describe '#to_text' do
it 'header' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
expect(rta.to_text[0..42]).to eq("Understanding Graph Sampling Algorithms\nfor")
end

it 'middle' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
expect(rta.to_text[100...150]).to eq("in Zhang3, Tianyin Xu2\n\nLong Jin1, Pan Hui4, Beixi")
end
end

describe '#to_text_main' do
it 'header' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
expect(rta.to_text_main[0..42]).to eq('Understanding Graph Sampling Algorithms for')
end

it 'middle' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
expect(rta.to_text_main[100...150]).to eq("n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing")
end
end

describe '#to_metadata' do
it 'header' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
expect(rta.to_metadata[0..42]).to eq("Application: 'Certified by IEEE PDFeXpress ")
end

it 'middle' do
rta = RubyTikaApp.new(test_file)
rta = RubyTikaApp.new(@test_file)
expect(rta.to_metadata[100...150]).to eq("Type: application/pdf\nCreation-Date: 2011-03-29T12")
end
end

describe 'external URLs' do
let(:port) { 9123 }

context 'with a link to cnn.com' do
let(:document) { File.read("#{doc_path}/cnn.com") }
let(:replies) { { '/cnn.com' => [200, {}, [document]] } }

it 'parses the url' do
StubServer.open(port, replies) do |server|
server.wait
rta = RubyTikaApp.new("http://localhost:#{port}/cnn.com")
expect(rta.to_text).to_not be_nil
expect(rta.to_text).to eq(RubyTikaApp.new("#{doc_path}/cnn.com").to_text)
end
end
end

context 'with a link to ycombinator.com' do
let(:document) { File.read("#{doc_path}/news.ycombinator.com") }
let(:replies) { { '/news.ycombinator.com' => [200, {}, [document]] } }

it 'parses the url' do
StubServer.open(port, replies) do |server|
server.wait
rta = RubyTikaApp.new("http://localhost:#{port}/news.ycombinator.com")
expect(rta.to_text).to_not be_nil
expect(rta.to_text).to eq(RubyTikaApp.new("#{doc_path}/news.ycombinator.com").to_text)
end
end
it 'should be able to parse an http url' do
rta = RubyTikaApp.new('http://localhost:9299/cnn.com')
expect(rta.to_text).to_not be_nil
expect(rta.to_text).to eq(RubyTikaApp.new(@cnn_com_file).to_text)
end

it 'should be able to parse another http url' do
rta = RubyTikaApp.new('http://localhost:9299/news.ycombinator.com')
expect(rta.to_text).to_not be_nil
expect(rta.to_text).to eq(RubyTikaApp.new(@news_ycombinator_com_file).to_text)
end
end
end
Loading