diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c7301c1..4b02914 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,19 +6,19 @@ name: Test - pull_request jobs: - # rubocop: - # runs-on: ubuntu-latest - # env: - # CI: true - # steps: - # - uses: actions/checkout@v3 - # - name: Set up Ruby 3.4 - # uses: ruby/setup-ruby@v1 - # with: - # ruby-version: 3.4 - # bundler-cache: true - # - name: Run RuboCop - # run: bundle exec rubocop --parallel + rubocop: + runs-on: ubuntu-latest + env: + CI: true + steps: + - uses: actions/checkout@v3 + - name: Set up Ruby 3.3 + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.3 + bundler-cache: true + - name: Run RuboCop + run: bundle exec rubocop --parallel test: name: "${{matrix.ruby}} ${{matrix.os || 'ubuntu-latest'}}" diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 0000000..1c96486 --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,42 @@ +# TODO: Enable plugins when upgrading to Rubocop +# plugins: +# - rubocop-performance +# - rubocop-rake +# - rubocop-rspec + +AllCops: + TargetRubyVersion: 2.4 + NewCops: enable + SuggestExtensions: false + +Layout/LineLength: + Exclude: + - 'spec/**/*' + +Metrics/AbcSize: + Enabled: false + +Metrics/BlockLength: + Enabled: false + +Metrics/BlockNesting: + Enabled: false + +Metrics/MethodLength: + Enabled: false + +Naming/FileName: + Exclude: + - 'lib/utf8-cleaner.rb' + +Style/AsciiComments: + Enabled: false + +# RSpec/IndexedLet: +# Enabled: false +# +# RSpec/MessageSpies: +# EnforcedStyle: receive +# +# RSpec/MultipleMemoizedHelpers: +# Enabled: false diff --git a/Gemfile b/Gemfile index 7f4f5e9..9761b56 100644 --- a/Gemfile +++ b/Gemfile @@ -2,4 +2,12 @@ source 'https://rubygems.org' +gem 'rack-test' +gem 'rake' +gem 'rspec' +gem 'rubocop', '< 1.13.0' +gem 'rubocop-performance' +gem 'rubocop-rake' +gem 'rubocop-rspec' + gemspec diff --git a/lib/utf8-cleaner/middleware.rb b/lib/utf8-cleaner/middleware.rb index 74143d9..882f73e 100644 --- a/lib/utf8-cleaner/middleware.rb +++ b/lib/utf8-cleaner/middleware.rb @@ -3,6 +3,8 @@ require 'active_support/multibyte/unicode' module UTF8Cleaner + # Rack middleware to sanitize non-UTF8 chars in + # environment variables and request input. class Middleware SANITIZE_ENV_KEYS = %w[ HTTP_REFERER @@ -35,6 +37,7 @@ def sanitize_env(env) def sanitize_env_keys(env) SANITIZE_ENV_KEYS.each do |key| next unless (value = env[key]) + env[key] = cleaned_string(value) end end @@ -57,10 +60,10 @@ def sanitize_env_rack_input(env) return unless input_data && !input_data.ascii_only? env['rack.input'] = StringIO.new(tidy_bytes(input_data)) - else - # Do not process multipart/form-data since it may contain binary content. - # Leave all other unknown content types alone. end + # Else: + # - Do not process multipart/form-data since it may contain binary content. + # - Leave all other unknown content types alone. end def read_input(input) diff --git a/lib/utf8-cleaner/railtie.rb b/lib/utf8-cleaner/railtie.rb index 7b3b953..1cd2145 100644 --- a/lib/utf8-cleaner/railtie.rb +++ b/lib/utf8-cleaner/railtie.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true module UTF8Cleaner + # Railtie to bootstrap UTF8Cleaner::Middleware in a Rails application. class Railtie < Rails::Railtie initializer('utf8-cleaner.insert_middleware') do |app| app.config.middleware.insert_before(0, UTF8Cleaner::Middleware) diff --git a/lib/utf8-cleaner/uri_string.rb b/lib/utf8-cleaner/uri_string.rb index abd53da..4161c99 100644 --- a/lib/utf8-cleaner/uri_string.rb +++ b/lib/utf8-cleaner/uri_string.rb @@ -6,8 +6,8 @@ class URIString attr_accessor :data HEX_CHARS = '0-9a-fA-F' - HEX_CHARS_REGEX = /[#{HEX_CHARS}]/ - INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/ + HEX_CHARS_REGEX = /[#{HEX_CHARS}]/.freeze + INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/.freeze def initialize(data) self.data = data @@ -34,7 +34,7 @@ def encoded_char_array char_array = [] index = 0 - while index < data.length do + while index < data.length char = data[index] if char == '%' @@ -43,20 +43,20 @@ def encoded_char_array skip_next = 2 # If the next character is not a hex char, drop the percent and it - unless data[index + 1] =~ HEX_CHARS_REGEX + unless HEX_CHARS_REGEX.match?(data[index + 1]) index += 2 next end # If the character after that is not a hex char, drop the percent and # both of the following chars. - unless data[index + 2] =~ HEX_CHARS_REGEX + unless HEX_CHARS_REGEX.match?(data[index + 2]) index += 3 next end # How long is this character? - first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase + first_byte = "0x#{(data[index + 1] + data[index + 2]).upcase}" bytes = utf8_char_length_in_bytes(first_byte) # Grab the specified number of encoded bytes @@ -74,7 +74,7 @@ def encoded_char_array # If we're dealing with a multibyte character, skip more than two # of the next characters, which have already been processed. - skip_next = bytes * 3 - 1 + skip_next = (bytes * 3) - 1 end end index += skip_next @@ -92,9 +92,8 @@ def valid_uri_encoded_utf8(string) URI::DEFAULT_PARSER.unescape(string).force_encoding('UTF-8').valid_encoding? && string !~ INVALID_PERCENT_ENCODING_REGEX rescue ArgumentError => e - if e.message =~ /invalid byte sequence/ - return false - end + return false if e.message.include?('invalid byte sequence') + raise e end @@ -103,16 +102,13 @@ def valid_uri_encoded_utf8(string) def next_n_bytes_from(index, num_bytes) return [] if data.length < index + (3 * num_bytes) - num_bytes.times.map do |n| + Array.new(num_bytes) do |n| # Look for percent signs in the right places pct_index = index + (3 * n) - if data[pct_index] == '%' - byte = data[pct_index + 1..pct_index + 2] - else - # An expected percent sign was missing. The whole character is invalid. - return [] - end - '%' + byte + return [] unless data[pct_index] == '%' + + # An expected percent sign was missing. The whole character is invalid. + "%#{data[(pct_index + 1)..(pct_index + 2)]}" end end diff --git a/spec/integration_spec.rb b/spec/integration_spec.rb new file mode 100644 index 0000000..012895e --- /dev/null +++ b/spec/integration_spec.rb @@ -0,0 +1,584 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'rack' +require 'rack/test' +require 'json' + +describe 'UTF8Cleaner::Middleware Integration' do + include Rack::Test::Methods + + let(:app) do + Rack::Builder.new do + use UTF8Cleaner::Middleware + run(lambda do |env| + # Create a Rack::Request to properly parse parameters + request = Rack::Request.new(env) + + # Echo back the sanitized environment for verification + response_body = { + path_info: env['PATH_INFO'], + query_string: env['QUERY_STRING'], + request_uri: env['REQUEST_URI'], + request_path: env['REQUEST_PATH'], + http_referer: env['HTTP_REFERER'], + http_user_agent: env['HTTP_USER_AGENT'], + http_cookie: env['HTTP_COOKIE'] + } + + # Include parsed params for form data testing + if env['REQUEST_METHOD'] == 'POST' && env['CONTENT_TYPE']&.match?(%r{application/x-www-form-urlencoded}i) + response_body[:params] = begin + request.POST + rescue StandardError + {} + end + elsif env['CONTENT_TYPE']&.match?(%r{application/json}i) + env['rack.input'].rewind + response_body[:payload] = env['rack.input'].read + env['rack.input'].rewind + elsif env['CONTENT_TYPE']&.match?(%r{multipart/form-data}i) + response_body[:multipart_size] = env['rack.input'].read.bytesize + env['rack.input'].rewind + end + + begin + [200, { 'Content-Type' => 'application/json' }, [response_body.to_json]] + rescue JSON::GeneratorError + [200, { 'Content-Type' => 'text/plain' }, response_body[:payload]] + end + end) + end + end + + describe 'GET requests' do + context 'with invalid UTF-8' do + it 'removes invalid percent-encoded sequences from query string' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'name=%FFbad%F8&valid=true' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('name=bad&valid=true') + end + + it 'cleans invalid sequences from path_info' do + env = Rack::MockRequest.env_for('/test/path') + env['PATH_INFO'] = '/test/%FFbad%F8/path' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['path_info']).to eq('/test/bad/path') + end + + it 'handles mixed valid and invalid encodings' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'good=%E2%9C%93&bad=%FF&mixed=%E2%9C%FF' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('good=%E2%9C%93&bad=&mixed=') + end + + it 'handles truncated multibyte sequences' do + # %E2%9C%93 is ✓, but truncate it at different points + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'truncated=%E2%9C&complete=%E2%9C%93' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('truncated=&complete=%E2%9C%93') + end + + it 'handles overlong UTF-8 sequences' do + # Overlong encoding of '/' (should be %2F but encoded as %C0%AF) + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'overlong=%C0%AF&normal=text' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('overlong=&normal=text') + end + + it 'handles invalid continuation bytes' do + # Valid start byte but invalid continuation + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'invalid=%E2%FF%93&valid=test' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('invalid=&valid=test') + end + end + + context 'percent encodings' do + it 'preserves valid percent-encoded characters' do + env = Rack::MockRequest.env_for('/test%2Fpath?foo=%26bar%3D1') + _status, _headers, body = app.call(env) + + response = JSON.parse(body.first) + expect(response['path_info']).to eq('/test%2Fpath') + expect(response['query_string']).to eq('foo=%26bar%3D1') + end + + it 'removes incomplete percent encodings' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'incomplete=%2' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('incomplete=') + end + + it 'removes percent signs without hex chars' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'invalid=%GG&partial=%F' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('invalid=G&partial=') + end + + it 'removes bare percent signs' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'percent=%&value=test' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('percent=value=test') + end + end + end + + describe 'POST requests' do + context 'with form-encoded data' do + it 'cleans invalid UTF-8 in form parameters' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => 'name=%FFbad%F8&email=test@example.com') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ + 'name' => 'bad', + 'email' => 'test@example.com' + }) + end + + it 'preserves valid UTF-8 in form data' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => 'name=José&city=São+Paulo') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ + 'name' => 'José', + 'city' => 'São Paulo' + }) + end + + it 'handles complex mixed encodings in forms' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => 'valid=%E2%9C%93&invalid=%FF&text=hello%20world') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ + 'valid' => '✓', + 'invalid' => '', + 'text' => 'hello world' + }) + end + + it 'handles form data with no content type' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + :input => 'test=%FFdata') + + status, _headers, _body = app.call(env) + expect(status).to eq(200) + end + + it 'handles empty POST body' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + 'CONTENT_LENGTH' => '0', + :input => '') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({}) + end + end + + context 'with JSON data' do + it 'tidies invalid UTF-8 bytes in JSON' do + pending 'This case is not currently handled correctly' unless defined?(TruffleRuby) + + # Create JSON with invalid UTF-8 bytes + json_with_invalid = "{\"name\": \"test\xFF\xF8\", \"valid\": \"✓\"}" + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/json', + :input => json_with_invalid) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + # Invalid bytes should be replaced with replacement characters + expect(JSON.parse(response['payload'])).to eq({ + 'name' => "test\u00FF\u00F8", + 'valid' => '✓' + }) + end + + it 'does not URI-decode JSON content' do + json = { 'encoded' => '%20%26%3D', 'normal' => 'text' }.to_json + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/json', + :input => json) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(JSON.parse(response['payload'])).to eq({ + 'encoded' => '%20%26%3D', + 'normal' => 'text' + }) + end + + it 'handles valid UTF-8 JSON unchanged' do + skip if RUBY_VERSION < '3.1' || defined?(JRuby) || defined?(TruffleRuby) + + json = { 'name' => 'José', 'emoji' => '😀', 'check' => '✓' }.to_json + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/json', + :input => json) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(JSON.parse(response['payload'])).to eq({ + 'name' => 'José', + 'emoji' => '😀', + 'check' => '✓' + }) + end + end + + context 'with multipart/form-data' do + it 'handles multipart form data with invalid UTF-8 in field names' do + input = <<~INPUT + --AaB03x\r + content-disposition: form-data; name="field%FFname"\r + \r + value\r + --AaB03x--\r + INPUT + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'multipart/form-data; boundary=AaB03x', + 'CONTENT_LENGTH' => input.size.to_s, + :input => input) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['multipart_size']).to be > 0 + end + + it 'does not modify binary content in multipart' do + skip if RUBY_VERSION < '3.1' || defined?(TruffleRuby) + + # Create binary content with invalid UTF-8 + binary_content = (+"file\xFF\xF8content").force_encoding('BINARY') + + # Build multipart request manually + boundary = 'AaB03x' + input = "--#{boundary}\r\n" + input << "content-disposition: form-data; name=\"file\"; filename=\"test.bin\"\r\n" + input << "content-type: application/octet-stream\r\n" + input << "\r\n" + input << binary_content + input << "\r\n--#{boundary}--\r\n" + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => "multipart/form-data; boundary=#{boundary}", + 'CONTENT_LENGTH' => input.bytesize.to_s, + :input => StringIO.new(input)) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['multipart_size']).to eq(input.bytesize) + end + end + end + + describe 'HTTP headers' do + context 'with invalid UTF-8 in headers' do + it 'cleans User-Agent header' do + env = Rack::MockRequest.env_for('/', 'HTTP_USER_AGENT' => "Mozilla/5.0\xFF\xF8") + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_user_agent']).to eq("Mozilla/5.0\u00FF\u00F8") + end + + it 'cleans Referer header with invalid percent encoding' do + env = Rack::MockRequest.env_for('/', 'HTTP_REFERER' => 'http://example.com/search?q=%FFbad%F8') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_referer']).to eq('http://example.com/search?q=bad') + end + + it 'cleans Cookie header' do + env = Rack::MockRequest.env_for('/', 'HTTP_COOKIE' => 'session=%FFbad%F8; user=john') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_cookie']).to eq('session=bad; user=john') + end + + it 'handles multiple cookies with invalid encoding' do + env = Rack::MockRequest.env_for('/', + 'HTTP_COOKIE' => 'first=%FFbad; second=good%20value; third=%F8invalid') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_cookie']).to eq('first=bad; second=good%20value; third=invalid') + end + end + + context 'with valid UTF-8 in headers' do + it 'preserves valid UTF-8 in headers' do + env = Rack::MockRequest.env_for('/', + 'HTTP_REFERER' => 'http://example.com/José', + 'HTTP_USER_AGENT' => 'Custom/1.0 (São Paulo)') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_referer']).to eq('http://example.com/José') + expect(response['http_user_agent']).to eq('Custom/1.0 (São Paulo)') + end + + it 'preserves valid percent-encoded UTF-8 in referer' do + env = Rack::MockRequest.env_for('/', + 'HTTP_REFERER' => 'http://example.com/search?q=%E2%9C%93') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_referer']).to eq('http://example.com/search?q=%E2%9C%93') + end + end + end + + describe 'REQUEST_URI and REQUEST_PATH' do + it 'cleans both REQUEST_URI and REQUEST_PATH' do + env = Rack::MockRequest.env_for('/test') + env['REQUEST_URI'] = '/test%FFpath?query=%FFparam' + env['REQUEST_PATH'] = '/test%FFpath' + env['QUERY_STRING'] = 'query=%FFparam' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['request_uri']).to eq('/testpath?query=param') + expect(response['request_path']).to eq('/testpath') + expect(response['query_string']).to eq('query=param') + end + + it 'handles REQUEST_URI with fragment identifiers' do + env = Rack::MockRequest.env_for('/') + env['REQUEST_URI'] = '/page?param=%FFvalue#section' + env['QUERY_STRING'] = 'param=%FFvalue' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['request_uri']).to eq('/page?param=value#section') + end + end + + describe 'edge cases' do + it 'handles empty rack.input gracefully' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => nil) + + status, _headers, _body = app.call(env) + expect(status).to eq(200) + end + + it 'handles very long invalid sequences' do + long_invalid = '%FF' * 100 + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = "data=#{long_invalid}" + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('data=') + end + + it 'handles mixed case content-type headers' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'Application/X-WWW-Form-URLEncoded', + :input => 'test=%FFdata') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ 'test' => 'data' }) + end + + it 'preserves plus signs as spaces in form data' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => 'text=hello+world+test') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ 'text' => 'hello world test' }) + end + + it 'handles null bytes in input' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = "null=\x00&text=hello" + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + # Should preserve null bytes that are valid + expect(response['query_string']).to include('text=hello') + end + + it 'handles Safari ajax POST body with null terminator' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => "foo=bar&quux=bla\0") + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ 'foo' => 'bar', 'quux' => 'bla' }) + end + end + + describe 'complex real-world scenarios' do + it 'handles a request with multiple invalid encodings across different parts' do + env = Rack::MockRequest.env_for('/pathinfo', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + 'HTTP_REFERER' => 'http://example.com/page%FFtest', + 'HTTP_USER_AGENT' => "Bot\xFF\xF8/1.0", + 'HTTP_COOKIE' => 'id=%FFbad', + :input => 'field=%FFvalue') + + env['PATH_INFO'] = '/path%FFinfo' + env['QUERY_STRING'] = 'param=%FFquery' + env['REQUEST_URI'] = '/path%FFinfo?param=%FFquery' + env['REQUEST_PATH'] = '/path%FFinfo' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + + expect(response['path_info']).to eq('/pathinfo') + expect(response['query_string']).to eq('param=query') + expect(response['request_uri']).to eq('/pathinfo?param=query') + expect(response['request_path']).to eq('/pathinfo') + expect(response['http_referer']).to eq('http://example.com/pagetest') + expect(response['http_user_agent']).to eq("Bot\u00FF\u00F8/1.0") + expect(response['http_cookie']).to eq('id=bad') + expect(response['params']).to eq({ 'field' => 'value' }) + end + + it 'handles requests with byte sequences at buffer boundaries' do + # Test sequences that might be split at typical buffer sizes + # This tests the middleware's ability to handle partial sequences + env = Rack::MockRequest.env_for('/') + # Create a query string that puts invalid bytes at common buffer boundaries + env['QUERY_STRING'] = "#{'a' * 8190}%FF#{'b' * 10}" + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq(('a' * 8190) + ('b' * 10)) + end + end + + describe 'middleware composition' do + let(:app) do + Rack::Builder.new do + use UTF8Cleaner::Middleware + run ->(_env) { [200, { 'content_type' => 'text/plain' }, ['OK']] } + end + end + + it 'works correctly with other middleware' do + env = Rack::MockRequest.env_for('/?test=%FFbad') + status, _headers, body = app.call(env) + expect(status).to eq(200) + expect(body).to eq(['OK']) + end + end + + describe 'rack.input rewind behavior' do + it 'ensures rack.input can be read multiple times' do + test_app = Rack::Builder.new do + use UTF8Cleaner::Middleware + run lambda { |env| + # First read + env['rack.input'].rewind + first_read = env['rack.input'].read + + # Second read + env['rack.input'].rewind + second_read = env['rack.input'].read + + [ + 200, + { 'Content-Type' => 'application/json' }, + [{ first: first_read, second: second_read, same: first_read == second_read }.to_json] + ] + } + end + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => 'test=%FFdata') + + _status, _headers, body = test_app.call(env) + response = JSON.parse(body.first) + expect(response['first']).to eq('test=data') + expect(response['second']).to eq('test=data') + expect(response['same']).to be true + end + end + + describe 'performance considerations' do + it 'handles large payloads efficiently' do + # Create a large form payload with some invalid sequences + large_data = (1..1000).map { |i| "field#{i}=value#{i}" }.join('&') + large_data += '&invalid=%FFtest' + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => large_data) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to include('field1' => 'value1') + expect(response['params']).to include('field1000' => 'value1000') + expect(response['params']).to include('invalid' => 'test') + end + end +end diff --git a/spec/middleware_spec.rb b/spec/middleware_spec.rb index e174d30..418f984 100644 --- a/spec/middleware_spec.rb +++ b/spec/middleware_spec.rb @@ -3,105 +3,104 @@ require 'spec_helper' require 'rack/lint' -module UTF8Cleaner - describe Middleware do - let :new_env do - Middleware.new(nil).send(:sanitize_env, env) +describe UTF8Cleaner::Middleware do + let :new_env do + described_class.new(nil).send(:sanitize_env, env) + end + + # Ensure that all cleaned values parse cleanly. + # E.g. make sure Rack/Rails won't choke on them + after do + cleaned = new_env + env.keys.reject { |key| key == 'rack.input' }.each do |key| + URI.decode_www_form_component(cleaned[key]) if cleaned[key] end + end - describe "with a big nasty env" do - let :env do - { - 'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B', - 'QUERY_STRING' => 'foo=bar%FF', - 'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8', - 'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0\x93", - 'REQUEST_URI' => '%C3%89%E2%9C%93', - 'rack.input' => StringIO.new("foo=%FFbar%F8"), - 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', - 'HTTP_COOKIE' => nil - } - end + describe 'with a big nasty env' do + let :env do + { + 'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B', + 'QUERY_STRING' => 'foo=bar%FF', + 'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8', + 'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0\x93", + 'REQUEST_URI' => '%C3%89%E2%9C%93', + 'rack.input' => StringIO.new('foo=%FFbar%F8'), + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + 'HTTP_COOKIE' => nil + } + end - describe "removes invalid %-encoded UTF-8 sequences" do - it { expect(new_env['QUERY_STRING']).to eq('foo=bar') } - it { expect(new_env['HTTP_REFERER']).to eq('http://example.com/blog+Result:+++++') } - it { expect(new_env['rack.input'].read).to eq('foo=bar') } - end + describe 'removes invalid %-encoded UTF-8 sequences' do + it { expect(new_env['QUERY_STRING']).to eq('foo=bar') } + it { expect(new_env['HTTP_REFERER']).to eq('http://example.com/blog+Result:+++++') } + it { expect(new_env['rack.input'].read).to eq('foo=bar') } + end - describe 'replaces \x-encoded characters from the ISO-8859-1 and CP1252 code pages with their UTF-8 equivalents' do - it { expect(new_env['HTTP_USER_AGENT']).to eq("Android Versi\u00F3n/4.0\u201C") } - end + describe 'replaces \x-encoded characters from the ISO-8859-1 and CP1252 code pages with their UTF-8 equivalents' do + it { expect(new_env['HTTP_USER_AGENT']).to eq("Android Versi\u00F3n/4.0\u201C") } + end - describe "leaves all valid characters untouched" do - it { expect(new_env['PATH_INFO']).to eq('foo/bar%2e%2fbaz%26%3B') } - it { expect(new_env['REQUEST_URI']).to eq('%C3%89%E2%9C%93') } - end + describe 'leaves all valid characters untouched' do + it { expect(new_env['PATH_INFO']).to eq('foo/bar%2e%2fbaz%26%3B') } + it { expect(new_env['REQUEST_URI']).to eq('%C3%89%E2%9C%93') } + end - describe "when rack.input is wrapped" do - it "removes invalid UTF-8 sequences" do - wrapped_rack_input = Rack::Lint::Wrapper::InputWrapper.new(StringIO.new("foo=%FFbar%F8")) - env.merge!('rack.input' => wrapped_rack_input) - new_env = Middleware.new(nil).send(:sanitize_env, env) - expect(new_env['rack.input'].read).to eq('foo=bar') - end + describe "when rack.input is wrapped" do + it "removes invalid UTF-8 sequences" do + wrapped_rack_input = Rack::Lint::Wrapper::InputWrapper.new(StringIO.new("foo=%FFbar%F8")) + env.merge!('rack.input' => wrapped_rack_input) + new_env = Middleware.new(nil).send(:sanitize_env, env) + expect(new_env['rack.input'].read).to eq('foo=bar') end + end - describe "when binary data is POSTed" do - before do - env['CONTENT_TYPE'] = 'multipart/form-data' - end - it "leaves the body alone" do - env['rack.input'].rewind - expect(new_env['rack.input'].read).to eq "foo=%FFbar%F8" - end + describe 'when binary data is POSTed' do + before do + env['CONTENT_TYPE'] = 'multipart/form-data' end - describe "when json data is POSTed" do - before do - env['CONTENT_TYPE'] = 'application/json' - end - - it "tidies invalid UTF-8 sequences" do - env['rack.input'] = StringIO.new(%Q({"foo": "\xFFbar\xF8"})) - env['rack.input'].rewind - expect(new_env['rack.input'].read).to eq(%Q({"foo": "\u00FFbar\u00F8"})) - end - - it "does not attempt to URI-decode data" do - json = %Q({"foo": "%FF"}) - env['rack.input'] = StringIO.new(json) - env['rack.input'].rewind - expect(new_env['rack.input'].read).to eq(json) - end + it 'leaves the body alone' do + env['rack.input'].rewind + expect(new_env['rack.input'].read).to eq 'foo=%FFbar%F8' end end - describe "with a minimal env" do - let(:env) do - { - 'PATH_INFO' => '/this/is/safe', - 'QUERY_STRING' => 'foo=bar%FF' - } + describe 'when json data is POSTed' do + before do + env['CONTENT_TYPE'] = 'application/json' end - it "only runs URIString cleaning on potentially unclean strings" do - expect(URIString).to receive(:new).once.and_call_original - new_env + it 'tidies invalid UTF-8 sequences' do + env['rack.input'] = StringIO.new(%({"foo": "\xFFbar\xF8"})) + env['rack.input'].rewind + expect(new_env['rack.input'].read).to eq(%({"foo": "\u00FFbar\u00F8"})) end - it "leaves clean values alone" do - expect(new_env['PATH_INFO']).to eq('/this/is/safe') + it 'does not attempt to URI-decode data' do + json = %({"foo": "%FF"}) + env['rack.input'] = StringIO.new(json) + env['rack.input'].rewind + expect(new_env['rack.input'].read).to eq(json) end end + end - # Ensure that all cleaned values parse cleanly. - # E.g. make sure Rack/Rails won't choke on them - after do - cleaned = new_env - env.keys.reject { |key| key == 'rack.input' }.each do |key| - URI.decode_www_form_component(cleaned[key]) if cleaned[key] - end + describe 'with a minimal env' do + let(:env) do + { + 'PATH_INFO' => '/this/is/safe', + 'QUERY_STRING' => 'foo=bar%FF' + } + end + + it 'only runs URIString cleaning on potentially unclean strings' do + expect(UTF8Cleaner::URIString).to receive(:new).once.and_call_original + new_env + end + + it 'leaves clean values alone' do + expect(new_env['PATH_INFO']).to eq('/this/is/safe') end end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index e2999ac..d957122 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -9,8 +9,8 @@ # order dependency and want to debug it, you can fix the order by providing # the seed, which is printed after each run. # --seed 1234 - config.order = "random" + config.order = 'random' - config.filter_run :focus => true + config.filter_run focus: true config.run_all_when_everything_filtered = true end diff --git a/spec/uri_string_spec.rb b/spec/uri_string_spec.rb index 0918583..d747928 100644 --- a/spec/uri_string_spec.rb +++ b/spec/uri_string_spec.rb @@ -2,48 +2,45 @@ require 'spec_helper' -module UTF8Cleaner - - describe URIString do - let(:invalid_string) { URIString.new('%FF') } - let(:ascii_string) { URIString.new('foo') } - let(:encoded_string) { URIString.new('%26') } - let(:multibyte_string) { URIString.new('%E2%9C%93') } - let(:complex_invalid_string) do - # ------------ foo/ bar. / baz& ; √ baz - URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') - end - let(:no_byte_at_all) { URIString.new('%') } - let(:not_even_hex_chars1) { URIString.new('%x') } - let(:not_even_hex_chars2) { URIString.new('%0zhey') } - let(:mixed_encodings) { URIString.new('§%e2') } +describe UTF8Cleaner::URIString do + let(:invalid_string) { described_class.new('%FF') } + let(:ascii_string) { described_class.new('foo') } + let(:encoded_string) { described_class.new('%26') } + let(:multibyte_string) { described_class.new('%E2%9C%93') } + let(:complex_invalid_string) do + # ------------ foo/ bar. / baz& ; √ baz + described_class.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') + end + let(:no_byte_at_all) { described_class.new('%') } + let(:not_even_hex_chars1) { described_class.new('%x') } + let(:not_even_hex_chars2) { described_class.new('%0zhey') } + let(:mixed_encodings) { described_class.new('§%e2') } - describe '#new' do - it { expect(encoded_string).to be_a(URIString) } - end + describe '#new' do + it { expect(encoded_string).to be_a(described_class) } + end - describe '#cleaned' do - it { expect(invalid_string.cleaned).to eq('') } - it { expect(ascii_string.cleaned).to eq('foo') } - it { expect(encoded_string.cleaned).to eq('%26') } - it { expect(multibyte_string.cleaned).to eq('%E2%9C%93') } - it { expect(complex_invalid_string.cleaned).to eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') } - it { expect(no_byte_at_all.cleaned).to eq('') } - it { expect(not_even_hex_chars1.cleaned).to eq('') } - it { expect(not_even_hex_chars2.cleaned).to eq('hey') } - it { expect(mixed_encodings.cleaned).to eq('§') } - end + describe '#cleaned' do + it { expect(invalid_string.cleaned).to eq('') } + it { expect(ascii_string.cleaned).to eq('foo') } + it { expect(encoded_string.cleaned).to eq('%26') } + it { expect(multibyte_string.cleaned).to eq('%E2%9C%93') } + it { expect(complex_invalid_string.cleaned).to eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') } + it { expect(no_byte_at_all.cleaned).to eq('') } + it { expect(not_even_hex_chars1.cleaned).to eq('') } + it { expect(not_even_hex_chars2.cleaned).to eq('hey') } + it { expect(mixed_encodings.cleaned).to eq('§') } + end - describe '#valid?' do - it { expect(ascii_string).to be_valid } - it { expect(encoded_string).to be_valid } - it { expect(multibyte_string).to be_valid } - it { expect(invalid_string).to_not be_valid } - it { expect(complex_invalid_string).to_not be_valid } - it { expect(no_byte_at_all).to_not be_valid } - it { expect(not_even_hex_chars1).to_not be_valid } - it { expect(not_even_hex_chars2).to_not be_valid } - it { expect(mixed_encodings).to_not be_valid } - end + describe '#valid?' do + it { expect(ascii_string).to be_valid } + it { expect(encoded_string).to be_valid } + it { expect(multibyte_string).to be_valid } + it { expect(invalid_string).not_to be_valid } + it { expect(complex_invalid_string).not_to be_valid } + it { expect(no_byte_at_all).not_to be_valid } + it { expect(not_even_hex_chars1).not_to be_valid } + it { expect(not_even_hex_chars2).not_to be_valid } + it { expect(mixed_encodings).not_to be_valid } end end diff --git a/utf8-cleaner.gemspec b/utf8-cleaner.gemspec index 67639b7..f8b3116 100644 --- a/utf8-cleaner.gemspec +++ b/utf8-cleaner.gemspec @@ -12,9 +12,7 @@ Gem::Specification.new do |gem| gem.homepage = 'https://github.com/singlebrook/utf8-cleaner' gem.license = 'MIT' - gem.files = `git ls-files`.split($/) - gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } - gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) + gem.files = Dir['lib/**/*'] + %w[README.md LICENSE.txt] gem.require_paths = ['lib'] gem.required_ruby_version = '>= 2.4' @@ -22,6 +20,5 @@ Gem::Specification.new do |gem| gem.add_dependency 'activesupport' gem.add_dependency 'rack', '~> 3.0' - gem.add_development_dependency 'rake' - gem.add_development_dependency 'rspec' + gem.metadata['rubygems_mfa_required'] = 'true' end