diff --git a/.circleci/config.yml b/.circleci/config.yml
index b7c1f81..b8fd177 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -4,7 +4,7 @@ jobs:
build:
working_directory: ~/tika
docker:
- - image: circleci/ruby:2.3.7-browsers
+ - image: circleci/ruby:2.7.5-browsers
environment:
RAILS_ENV: test
steps:
@@ -27,7 +27,7 @@ jobs:
test:
working_directory: ~/tika
docker:
- - image: circleci/ruby:2.3.7-browsers
+ - image: circleci/ruby:2.7.5-browsers
environment:
RAILS_ENV: test
steps:
@@ -39,7 +39,7 @@ jobs:
rubocop:
working_directory: ~/tika
docker:
- - image: circleci/ruby:2.3.7-browsers
+ - image: circleci/ruby:2.7.5-browsers
environment:
RAILS_ENV: test
steps:
@@ -56,14 +56,3 @@ workflows:
- test:
requires:
- build
- - rubocop:
- requires:
- - build
- jobs:
- - build
- - test:
- requires:
- - build
- - rubocop:
- requires:
- - build
diff --git a/.rubocop.yml b/.rubocop.yml
index 42f8c84..5f29edf 100644
--- a/.rubocop.yml
+++ b/.rubocop.yml
@@ -1,10 +1,13 @@
AllCops:
DisplayCopNames: true
- TargetRubyVersion: 2.3
+ TargetRubyVersion: 2.7
Exclude:
- 'ext/*'
- 'vendor/**/*'
+Lint/MissingSuper:
+ Enabled: false
+
Metrics/AbcSize:
Enabled: true
Max: 28
@@ -23,5 +26,6 @@ Metrics/MethodLength:
Style/Documentation:
Enabled: false
- Exclude:
- - 'spec/**/*'
+
+Style/NumericPredicate:
+ Enabled: false
diff --git a/.tool-versions b/.tool-versions
new file mode 100644
index 0000000..dbec821
--- /dev/null
+++ b/.tool-versions
@@ -0,0 +1,5 @@
+ruby 2.7.5
+nodejs 16.15.0
+java zulu-8.56.0.21
+python 3.9.0
+yarn 1.22.10
diff --git a/README.md b/README.md
index f9d6bb8..953200e 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,6 @@ Ruby Tika Parser
======
[](https://circleci.com/gh/pulibrary/ruby_tika_app)
-[](https://coveralls.io/github/pulibrary/ruby_tika_app)
### Introduction
diff --git a/ext/tika-app-1.18.jar b/ext/tika-app-1.24.1.jar
similarity index 77%
rename from ext/tika-app-1.18.jar
rename to ext/tika-app-1.24.1.jar
index 430d2c1..bb0edab 100644
Binary files a/ext/tika-app-1.18.jar and b/ext/tika-app-1.24.1.jar differ
diff --git a/ext/tika-config.xml b/ext/tika-config.xml
index 0616c33..6b2990d 100644
--- a/ext/tika-config.xml
+++ b/ext/tika-config.xml
@@ -1,4 +1,13 @@
+
+
+
+ image/jpeg
+ application/x-sqlite3
+
+
+
diff --git a/lib/ruby_tika_app.rb b/lib/ruby_tika_app.rb
index 810033d..60230ce 100644
--- a/lib/ruby_tika_app.rb
+++ b/lib/ruby_tika_app.rb
@@ -1,15 +1,19 @@
# frozen_string_literal: true
# Based on the rake remote task code
+
require 'rubygems'
require 'stringio'
require 'open4'
class RubyTikaApp
+ TIKA_APP_VERSION = '1.24.1'
+
class Error < RuntimeError; end
class CommandFailedError < Error
attr_reader :status
+
def initialize(status)
@status = status
end
@@ -17,11 +21,19 @@ def initialize(status)
def initialize(document, config = nil)
@config = config
- @document = if document =~ %r{https?:\/\/[\S]+}
+    @document = if document.to_s.match?(%r{\Ahttps?://\S+})
document
else
"file://#{document}"
end
+
+    # Assemble the java command prefix once; run_tika appends the option and document.
+    java_cmd = 'java'
+    java_args = '-server -Djava.awt.headless=true -Dfile.encoding=UTF-8'
+    ext_dir = File.join(File.dirname(__FILE__), '..', 'ext')
+    tika_path = File.join(ext_dir, "tika-app-#{TIKA_APP_VERSION}.jar")
+    tika_config_path = @config || File.join(ext_dir, 'tika-config.xml')
+    @tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}' --config='#{tika_config_path}'"
end
def to_xml
@@ -32,7 +44,7 @@ def to_html
run_tika('--html')
end
- def to_json
+ def to_json(*_args)
run_tika('--json')
end
@@ -50,23 +62,15 @@ def to_metadata
private
- def java_args
- '-server -Djava.awt.headless=true'
- end
-
- def java_cmd
- 'java'
- end
-
def run_tika(option)
- final_cmd = "#{tika_cmd} #{option} '#{@document}'"
+ final_cmd = "#{@tika_cmd} #{option} '#{@document}'"
- _pid, stdin, stdout, stderr = Open4.popen4(final_cmd)
+ _, stdin, stdout, stderr = Open4.popen4(final_cmd)
stdout_result = stdout.read.strip
stderr_result = stderr.read.strip
- unless strip_stderr(stderr_result).empty?
+ if stdout_result.empty? && !stderr_result.empty?
raise(CommandFailedError.new(stderr_result),
"execution failed with status #{stderr_result}: #{final_cmd}")
end
@@ -77,26 +81,4 @@ def run_tika(option)
stdout.close
stderr.close
end
-
- def strip_stderr(error_message)
- errors = error_message.split("\n")
- real_errors = errors.reject { |error| error =~ /(INFO|WARN)/ }
- real_errors.empty? ? real_errors : real_errors.join("\n")
- end
-
- def tika_cmd
- "#{java_cmd} #{java_args} -jar '#{tika_path}' #{tika_config}"
- end
-
- def tika_config
- "--config=#{tika_config_path}"
- end
-
- def tika_config_path
- @config || File.join(File.dirname(__FILE__), '..', 'ext', 'tika-config.xml')
- end
-
- def tika_path
- File.join(File.dirname(__FILE__), '..', 'ext', 'tika-app-1.18.jar')
- end
end
diff --git a/ruby_tika_app.gemspec b/ruby_tika_app.gemspec
index 8f1ac4a..6033f25 100644
--- a/ruby_tika_app.gemspec
+++ b/ruby_tika_app.gemspec
@@ -21,14 +21,14 @@ Gem::Specification.new do |s|
s.require_paths = %w[lib]
s.add_runtime_dependency 'open4'
+ s.required_ruby_version = '>= 2.7'
s.add_development_dependency 'bundler', '>= 1.0.15'
- s.add_development_dependency 'coveralls'
s.add_development_dependency 'json'
s.add_development_dependency 'pry'
s.add_development_dependency 'rake'
- s.add_development_dependency 'rspec', '~> 3.3.0'
+ s.add_development_dependency 'rspec', '~> 3.9.0'
s.add_development_dependency 'rubocop'
s.add_development_dependency 'simplecov'
- s.add_development_dependency 'stub_server'
+ s.add_development_dependency 'thin'
end
diff --git a/spec/ruby_tika_app_spec.rb b/spec/ruby_tika_app_spec.rb
index de6ec8f..7455dae 100644
--- a/spec/ruby_tika_app_spec.rb
+++ b/spec/ruby_tika_app_spec.rb
@@ -1,11 +1,16 @@
# frozen_string_literal: true
require 'spec_helper'
-require 'stub_server'
describe RubyTikaApp do
- let(:doc_path) { File.join(File.dirname(__FILE__), 'docs') }
- let(:test_file) { "#{doc_path}/graph sampling simplex - 11.pdf" }
+  before(:each) do
+    # Fixture paths shared by the examples in this file.
+    doc_path = File.join(File.dirname(__FILE__), 'docs')
+    @test_file = "#{doc_path}/graph sampling simplex - 11.pdf"
+
+    @cnn_com_file = "#{doc_path}/cnn.com"
+    @news_ycombinator_com_file = "#{doc_path}/news.ycombinator.com"
+  end
describe 'Error' do
it 'has an error' do
@@ -16,111 +21,102 @@
end
end
+ describe 'CommandFailedError' do
+ it 'is raised correctly' do
+ expect do
+ rta = RubyTikaApp.new('/file_not_found.pdf')
+ rta.to_text
+ end.to raise_error(RubyTikaApp::CommandFailedError)
+ end
+ end
+
describe '#to_xml' do
it 'header' do
- rta = RubyTikaApp.new(test_file)
+ rta = RubyTikaApp.new(@test_file)
expect(rta.to_xml[0..37]).to eq('')
end
it 'middle' do
- rta = RubyTikaApp.new(test_file)
+ rta = RubyTikaApp.new(@test_file)
xml = rta.to_xml
xml_size = xml.size / 2
- expect(xml[xml_size..(xml_size + 100)]).to eq("d in Frontier Sampling (FS).\nSince this is the only difference between MHRW and USDSG,\nto be simple, ")
+ expect(xml[xml_size..(xml_size + 100)]).to eq("pply USDSG, we\nneed to change a directed graph Gd to a symmetric graph\nG. This methodology is also us")
end
end
describe '#to_html' do
it 'header' do
- rta = RubyTikaApp.new(test_file)
+ rta = RubyTikaApp.new(@test_file)
expect(rta.to_html[0..42]).to eq('')
end
it 'middle' do
- rta = RubyTikaApp.new(test_file)
- expect(rta.to_html[1000...1100]).to eq("Z\"/>\n\n\n [200, {}, [document]] } }
-
- it 'parses the url' do
- StubServer.open(port, replies) do |server|
- server.wait
- rta = RubyTikaApp.new("http://localhost:#{port}/cnn.com")
- expect(rta.to_text).to_not be_nil
- expect(rta.to_text).to eq(RubyTikaApp.new("#{doc_path}/cnn.com").to_text)
- end
- end
- end
-
- context 'with a link to ycombinator.com' do
- let(:document) { File.read("#{doc_path}/news.ycombinator.com") }
- let(:replies) { { '/news.ycombinator.com' => [200, {}, [document]] } }
-
- it 'parses the url' do
- StubServer.open(port, replies) do |server|
- server.wait
- rta = RubyTikaApp.new("http://localhost:#{port}/news.ycombinator.com")
- expect(rta.to_text).to_not be_nil
- expect(rta.to_text).to eq(RubyTikaApp.new("#{doc_path}/news.ycombinator.com").to_text)
- end
- end
+ it 'should be able to parse an http url' do
+ rta = RubyTikaApp.new('http://localhost:9299/cnn.com')
+ expect(rta.to_text).to_not be_nil
+ expect(rta.to_text).to eq(RubyTikaApp.new(@cnn_com_file).to_text)
+ end
+
+ it 'should be able to parse another http url' do
+ rta = RubyTikaApp.new('http://localhost:9299/news.ycombinator.com')
+ expect(rta.to_text).to_not be_nil
+ expect(rta.to_text).to eq(RubyTikaApp.new(@news_ycombinator_com_file).to_text)
end
end
end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
index 015b313..8014673 100644
--- a/spec/spec_helper.rb
+++ b/spec/spec_helper.rb
@@ -1,16 +1,11 @@
# frozen_string_literal: true
require 'simplecov'
-require 'coveralls'
-SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new(
- [
- SimpleCov::Formatter::HTMLFormatter,
- Coveralls::SimpleCov::Formatter
- ]
-)
+SimpleCov.formatter = SimpleCov::Formatter::HTMLFormatter
SimpleCov.start do
add_filter '/spec'
+ minimum_coverage 100
end
require 'rubygems'
@@ -18,3 +13,13 @@
require 'ruby_tika_app'
require 'rspec'
+
+# Include all files under spec/support
+Dir['./spec/support/**/*.rb'].sort.each { |f| require f }
+
+# Start a local rack server to serve up test pages.
+@server_thread = Thread.new do
+ Rack::Handler::Thin.run(MyApp::Test::Server.new, Port: 9299, Host: '127.0.0.1')
+end
+
+sleep(1) # wait a sec for the server to be booted
diff --git a/spec/support/test_server.rb b/spec/support/test_server.rb
new file mode 100644
index 0000000..6b588a3
--- /dev/null
+++ b/spec/support/test_server.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+require 'rubygems'
+require 'rack'
+
+module MyApp
+  module Test
+    # Minimal Rack app that serves the fixture files under spec/docs.
+    class Server
+      # Rack entry point: serves the requested fixture, or a plain-text 404.
+      def call(env)
+        path = Rack::Utils.unescape(env['PATH_INFO'])
+        path += 'index.html' if path == '/'
+        file = "#{__dir__}/../docs/#{path}"
+        # File.file? (not exist?) so a directory path is a 404, not an EISDIR.
+        return [404, { 'Content-Type' => 'text/plain' }, ['file not found']] unless File.file?(file)
+
+        # Bodies are Arrays: Rack requires the body to respond to #each.
+        [200, { 'Content-Type' => 'text/html' }, [File.read(file)]]
+      end
+    end
+  end
+end