diff --git a/.circleci/config.yml b/.circleci/config.yml index b7c1f81..b8fd177 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,7 +4,7 @@ jobs: build: working_directory: ~/tika docker: - - image: circleci/ruby:2.3.7-browsers + - image: circleci/ruby:2.7.5-browsers environment: RAILS_ENV: test steps: @@ -27,7 +27,7 @@ jobs: test: working_directory: ~/tika docker: - - image: circleci/ruby:2.3.7-browsers + - image: circleci/ruby:2.7.5-browsers environment: RAILS_ENV: test steps: @@ -39,7 +39,7 @@ jobs: rubocop: working_directory: ~/tika docker: - - image: circleci/ruby:2.3.7-browsers + - image: circleci/ruby:2.7.5-browsers environment: RAILS_ENV: test steps: @@ -56,14 +56,3 @@ workflows: - test: requires: - build - - rubocop: - requires: - - build - jobs: - - build - - test: - requires: - - build - - rubocop: - requires: - - build diff --git a/.rubocop.yml b/.rubocop.yml index 42f8c84..5f29edf 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -1,10 +1,13 @@ AllCops: DisplayCopNames: true - TargetRubyVersion: 2.3 + TargetRubyVersion: 2.7 Exclude: - 'ext/*' - 'vendor/**/*' +Lint/MissingSuper: + Enabled: false + Metrics/AbcSize: Enabled: true Max: 28 @@ -23,5 +26,6 @@ Metrics/MethodLength: Style/Documentation: Enabled: false - Exclude: - - 'spec/**/*' + +Style/NumericPredicate: + Enabled: false diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 0000000..dbec821 --- /dev/null +++ b/.tool-versions @@ -0,0 +1,5 @@ +ruby 2.7.5 +nodejs 16.15.0 +java zulu-8.56.0.21 +python 3.9.0 +yarn 1.22.10 diff --git a/README.md b/README.md index f9d6bb8..953200e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ Ruby Tika Parser ====== [![CircleCI](https://circleci.com/gh/pulibrary/ruby_tika_app.svg?style=svg)](https://circleci.com/gh/pulibrary/ruby_tika_app) -[![Coverage Status](https://coveralls.io/repos/github/pulibrary/ruby_tika_app/badge.svg)](https://coveralls.io/github/pulibrary/ruby_tika_app) ### Introduction diff --git a/ext/tika-app-1.18.jar b/ext/tika-app-1.24.1.jar similarity index 77% rename from ext/tika-app-1.18.jar rename to ext/tika-app-1.24.1.jar index 430d2c1..bb0edab 100644 Binary files a/ext/tika-app-1.18.jar and b/ext/tika-app-1.24.1.jar differ diff --git a/ext/tika-config.xml b/ext/tika-config.xml index 0616c33..6b2990d 100644 --- a/ext/tika-config.xml +++ b/ext/tika-config.xml @@ -1,4 +1,13 @@ + + + + image/jpeg + application/x-sqlite3 + + + diff --git a/lib/ruby_tika_app.rb b/lib/ruby_tika_app.rb index 810033d..60230ce 100644 --- a/lib/ruby_tika_app.rb +++ b/lib/ruby_tika_app.rb @@ -1,15 +1,19 @@ # frozen_string_literal: true # Based on the rake remote task code + require 'rubygems' require 'stringio' require 'open4' class RubyTikaApp + TIKA_APP_VERSION = '1.24.1' + class Error < RuntimeError; end class CommandFailedError < Error attr_reader :status + def initialize(status) @status = status end @@ -17,11 +21,19 @@ def initialize(status) def initialize(document, config = nil) @config = config - @document = if document =~ %r{https?:\/\/[\S]+} + @document = if (document =~ %r{https?://\S+}) == 0 document else "file://#{document}" end + + java_cmd = 'java' + java_args = '-server -Djava.awt.headless=true -Dfile.encoding=UTF-8' + ext_dir = File.join(File.dirname(__FILE__)) + tika_path = "#{ext_dir}/../ext/tika-app-#{TIKA_APP_VERSION}.jar" + tika_config_path = @config || File.join(File.dirname(__FILE__), '..', 'ext', 'tika-config.xml') + + @tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}' --config='#{tika_config_path}'" end def to_xml @@ -32,7 +44,7 @@ def to_html run_tika('--html') end - def to_json + def to_json(*_args) run_tika('--json') end @@ -50,23 +62,15 @@ def to_metadata private - def java_args - '-server -Djava.awt.headless=true' - end - - def java_cmd - 'java' - end - def run_tika(option) - final_cmd = "#{tika_cmd} #{option} '#{@document}'" + final_cmd = "#{@tika_cmd} #{option} '#{@document}'" - _pid, stdin, stdout, stderr = Open4.popen4(final_cmd) + _, stdin, stdout, stderr = Open4.popen4(final_cmd) stdout_result = stdout.read.strip stderr_result = stderr.read.strip - unless strip_stderr(stderr_result).empty? + if stdout_result.empty? && !stderr_result.empty? raise(CommandFailedError.new(stderr_result), "execution failed with status #{stderr_result}: #{final_cmd}") end @@ -77,26 +81,4 @@ def run_tika(option) stdout.close stderr.close end - - def strip_stderr(error_message) - errors = error_message.split("\n") - real_errors = errors.reject { |error| error =~ /(INFO|WARN)/ } - real_errors.empty? ? real_errors : real_errors.join("\n") - end - - def tika_cmd - "#{java_cmd} #{java_args} -jar '#{tika_path}' #{tika_config}" - end - - def tika_config - "--config=#{tika_config_path}" - end - - def tika_config_path - @config || File.join(File.dirname(__FILE__), '..', 'ext', 'tika-config.xml') - end - - def tika_path - File.join(File.dirname(__FILE__), '..', 'ext', 'tika-app-1.18.jar') - end end diff --git a/ruby_tika_app.gemspec b/ruby_tika_app.gemspec index 8f1ac4a..6033f25 100644 --- a/ruby_tika_app.gemspec +++ b/ruby_tika_app.gemspec @@ -21,14 +21,14 @@ Gem::Specification.new do |s| s.require_paths = %w[lib] s.add_runtime_dependency 'open4' + s.required_ruby_version = '>= 2.7' s.add_development_dependency 'bundler', '>= 1.0.15' - s.add_development_dependency 'coveralls' s.add_development_dependency 'json' s.add_development_dependency 'pry' s.add_development_dependency 'rake' - s.add_development_dependency 'rspec', '~> 3.3.0' + s.add_development_dependency 'rspec', '~> 3.9.0' s.add_development_dependency 'rubocop' s.add_development_dependency 'simplecov' - s.add_development_dependency 'stub_server' + s.add_development_dependency 'thin' end diff --git a/spec/ruby_tika_app_spec.rb b/spec/ruby_tika_app_spec.rb index de6ec8f..7455dae 100644 --- a/spec/ruby_tika_app_spec.rb +++ b/spec/ruby_tika_app_spec.rb @@ -1,11 +1,16 @@ # frozen_string_literal: true require 'spec_helper' -require 'stub_server' describe RubyTikaApp do - let(:doc_path) { File.join(File.dirname(__FILE__), 'docs') } - let(:test_file) { "#{doc_path}/graph sampling simplex - 11.pdf" } + before(:each) do + doc_path = "#{File.join(File.dirname(__FILE__))}/docs" + + @test_file = "#{doc_path}/graph sampling simplex - 11.pdf" + + @cnn_com_file = "#{doc_path}/cnn.com" + @news_ycombinator_com_file = "#{doc_path}/news.ycombinator.com" + end describe 'Error' do it 'has an error' do @@ -16,111 +21,102 @@ end end + describe 'CommandFailedError' do + it 'is raised correctly' do + expect do + rta = RubyTikaApp.new('/file_not_found.pdf') + rta.to_text + end.to raise_error(RubyTikaApp::CommandFailedError) + end + end + describe '#to_xml' do it 'header' do - rta = RubyTikaApp.new(test_file) + rta = RubyTikaApp.new(@test_file) expect(rta.to_xml[0..37]).to eq('') end it 'middle' do - rta = RubyTikaApp.new(test_file) + rta = RubyTikaApp.new(@test_file) xml = rta.to_xml xml_size = xml.size / 2 - expect(xml[xml_size..(xml_size + 100)]).to eq("d in Frontier Sampling (FS).\nSince this is the only difference between MHRW and USDSG,\nto be simple, ") + expect(xml[xml_size..(xml_size + 100)]).to eq("pply USDSG, we\nneed to change a directed graph Gd to a symmetric graph\nG. This methodology is also us") end end describe '#to_html' do it 'header' do - rta = RubyTikaApp.new(test_file) + rta = RubyTikaApp.new(@test_file) expect(rta.to_html[0..42]).to eq('') end it 'middle' do - rta = RubyTikaApp.new(test_file) - expect(rta.to_html[1000...1100]).to eq("Z\"/>\n\n\n [200, {}, [document]] } } - - it 'parses the url' do - StubServer.open(port, replies) do |server| - server.wait - rta = RubyTikaApp.new("http://localhost:#{port}/cnn.com") - expect(rta.to_text).to_not be_nil - expect(rta.to_text).to eq(RubyTikaApp.new("#{doc_path}/cnn.com").to_text) - end - end - end - - context 'with a link to ycombinator.com' do - let(:document) { File.read("#{doc_path}/news.ycombinator.com") } - let(:replies) { { '/news.ycombinator.com' => [200, {}, [document]] } } - - it 'parses the url' do - StubServer.open(port, replies) do |server| - server.wait - rta = RubyTikaApp.new("http://localhost:#{port}/news.ycombinator.com") - expect(rta.to_text).to_not be_nil - expect(rta.to_text).to eq(RubyTikaApp.new("#{doc_path}/news.ycombinator.com").to_text) - end - end + it 'should be able to parse an http url' do + rta = RubyTikaApp.new('http://localhost:9299/cnn.com') + expect(rta.to_text).to_not be_nil + expect(rta.to_text).to eq(RubyTikaApp.new(@cnn_com_file).to_text) + end + + it 'should be able to parse another http url' do + rta = RubyTikaApp.new('http://localhost:9299/news.ycombinator.com') + expect(rta.to_text).to_not be_nil + expect(rta.to_text).to eq(RubyTikaApp.new(@news_ycombinator_com_file).to_text) end end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 015b313..8014673 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,16 +1,11 @@ # frozen_string_literal: true require 'simplecov' -require 'coveralls' -SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new( - [ - SimpleCov::Formatter::HTMLFormatter, - Coveralls::SimpleCov::Formatter - ] -) +SimpleCov.formatter = SimpleCov::Formatter::HTMLFormatter SimpleCov.start do add_filter '/spec' + minimum_coverage 100 end require 'rubygems' @@ -18,3 +13,13 @@ require 'ruby_tika_app' require 'rspec' + +# Include all files under spec/support +Dir['./spec/support/**/*.rb'].sort.each { |f| require f } + +# Start a local rack server to serve up test pages. +@server_thread = Thread.new do + Rack::Handler::Thin.run(MyApp::Test::Server.new, Port: 9299, Host: '127.0.0.1') +end + +sleep(1) # wait a sec for the server to be booted diff --git a/spec/support/test_server.rb b/spec/support/test_server.rb new file mode 100644 index 0000000..6b588a3 --- /dev/null +++ b/spec/support/test_server.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +require 'rubygems' +require 'rack' + +module MyApp + module Test + class Server + def call(env) + @root = "#{__dir__}/../docs/" + path = Rack::Utils.unescape(env['PATH_INFO']) + path += 'index.html' if path == '/' + file = @root + path.to_s + + if File.exist?(file) + [200, { 'Content-Type' => 'text/html' }, File.read(file)] + else + [404, { 'Content-Type' => 'text/plain' }, 'file not found'] + end + end + end + end +end