From 607152a129edeb18aa5f90c0ffce5c4f73595997 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:36:07 +0900 Subject: [PATCH 1/9] Add and fix Rubocop --- .github/workflows/test.yml | 26 +++++++------- .rubocop.yml | 37 ++++++++++++++++++++ Gemfile | 4 +++ lib/utf8-cleaner/middleware.rb | 9 +++-- lib/utf8-cleaner/railtie.rb | 1 + lib/utf8-cleaner/uri_string.rb | 32 ++++++++--------- spec/middleware_spec.rb | 63 +++++++++++++++++----------------- spec/spec_helper.rb | 4 +-- spec/uri_string_spec.rb | 33 +++++++++--------- utf8-cleaner.gemspec | 11 +++--- 10 files changed, 129 insertions(+), 91 deletions(-) create mode 100644 .rubocop.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 07710d0..bcebd6e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,19 +6,19 @@ name: Test - pull_request jobs: - # rubocop: - # runs-on: ubuntu-latest - # env: - # CI: true - # steps: - # - uses: actions/checkout@v3 - # - name: Set up Ruby 3.4 - # uses: ruby/setup-ruby@v1 - # with: - # ruby-version: 3.4 - # bundler-cache: true - # - name: Run RuboCop - # run: bundle exec rubocop --parallel + rubocop: + runs-on: ubuntu-latest + env: + CI: true + steps: + - uses: actions/checkout@v3 + - name: Set up Ruby 3.4 + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.4 + bundler-cache: true + - name: Run RuboCop + run: bundle exec rubocop --parallel test: name: "${{matrix.ruby}} ${{matrix.os || 'ubuntu-latest'}}" diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 0000000..d5b6aa2 --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,37 @@ +#inherit_from: .rubocop_todo.yml + +plugins: + - rubocop-performance + - rubocop-rake + - rubocop-rspec + +AllCops: + TargetRubyVersion: 2.4 + NewCops: enable + SuggestExtensions: false + +Layout/LineLength: + Exclude: + - 'spec/**/*' + +Metrics/AbcSize: + Enabled: false + +Metrics/BlockNesting: + Enabled: false + +Metrics/MethodLength: + Enabled: false + +Naming/FileName: + Exclude: + - 'lib/utf8-cleaner.rb' + +RSpec/IndexedLet: + Enabled: false + +RSpec/MessageSpies: + EnforcedStyle: receive + +RSpec/MultipleMemoizedHelpers: + Enabled: false diff --git a/Gemfile b/Gemfile index 7f4f5e9..60601ec 100644 --- a/Gemfile +++ b/Gemfile @@ -2,4 +2,8 @@ source 'https://rubygems.org' +gem 'rake' +gem 'rspec' +gem 'rubocop', '< 1.13.0' + gemspec diff --git a/lib/utf8-cleaner/middleware.rb b/lib/utf8-cleaner/middleware.rb index ef9956c..c09804b 100644 --- a/lib/utf8-cleaner/middleware.rb +++ b/lib/utf8-cleaner/middleware.rb @@ -3,6 +3,8 @@ require 'active_support/multibyte/unicode' module UTF8Cleaner + # Rack middleware to sanitize non-UTF8 chars in + # environment variables and request input. class Middleware SANITIZE_ENV_KEYS = %w[ http_referer @@ -36,6 +38,7 @@ def sanitize_env(env) def sanitize_env_keys(env) SANITIZE_ENV_KEYS.each do |key| next unless (value = env[key]) + env[key] = cleaned_string(value) end end @@ -58,10 +61,10 @@ def sanitize_env_rack_input(env) return unless input_data && !input_data.ascii_only? env['rack.input'] = StringIO.new(tidy_bytes(input_data)) - else - # Do not process multipart/form-data since it may contain binary content. - # Leave all other unknown content types alone. end + # Else: + # - Do not process multipart/form-data since it may contain binary content. + # - Leave all other unknown content types alone. end def read_input(input) diff --git a/lib/utf8-cleaner/railtie.rb b/lib/utf8-cleaner/railtie.rb index 7b3b953..1cd2145 100644 --- a/lib/utf8-cleaner/railtie.rb +++ b/lib/utf8-cleaner/railtie.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true module UTF8Cleaner + # Railtie to bootstrap UTF8Cleaner::Middleware in a Rails application. class Railtie < Rails::Railtie initializer('utf8-cleaner.insert_middleware') do |app| app.config.middleware.insert_before(0, UTF8Cleaner::Middleware) diff --git a/lib/utf8-cleaner/uri_string.rb b/lib/utf8-cleaner/uri_string.rb index abd53da..4161c99 100644 --- a/lib/utf8-cleaner/uri_string.rb +++ b/lib/utf8-cleaner/uri_string.rb @@ -6,8 +6,8 @@ class URIString attr_accessor :data HEX_CHARS = '0-9a-fA-F' - HEX_CHARS_REGEX = /[#{HEX_CHARS}]/ - INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/ + HEX_CHARS_REGEX = /[#{HEX_CHARS}]/.freeze + INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/.freeze def initialize(data) self.data = data @@ -34,7 +34,7 @@ def encoded_char_array char_array = [] index = 0 - while index < data.length do + while index < data.length char = data[index] if char == '%' @@ -43,20 +43,20 @@ def encoded_char_array skip_next = 2 # If the next character is not a hex char, drop the percent and it - unless data[index + 1] =~ HEX_CHARS_REGEX + unless HEX_CHARS_REGEX.match?(data[index + 1]) index += 2 next end # If the character after that is not a hex char, drop the percent and # both of the following chars. - unless data[index + 2] =~ HEX_CHARS_REGEX + unless HEX_CHARS_REGEX.match?(data[index + 2]) index += 3 next end # How long is this character? - first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase + first_byte = "0x#{(data[index + 1] + data[index + 2]).upcase}" bytes = utf8_char_length_in_bytes(first_byte) # Grab the specified number of encoded bytes @@ -74,7 +74,7 @@ def encoded_char_array # If we're dealing with a multibyte character, skip more than two # of the next characters, which have already been processed. - skip_next = bytes * 3 - 1 + skip_next = (bytes * 3) - 1 end end index += skip_next @@ -92,9 +92,8 @@ def valid_uri_encoded_utf8(string) URI::DEFAULT_PARSER.unescape(string).force_encoding('UTF-8').valid_encoding? && string !~ INVALID_PERCENT_ENCODING_REGEX rescue ArgumentError => e - if e.message =~ /invalid byte sequence/ - return false - end + return false if e.message.include?('invalid byte sequence') + raise e end @@ -103,16 +102,13 @@ def valid_uri_encoded_utf8(string) def next_n_bytes_from(index, num_bytes) return [] if data.length < index + (3 * num_bytes) - num_bytes.times.map do |n| + Array.new(num_bytes) do |n| # Look for percent signs in the right places pct_index = index + (3 * n) - if data[pct_index] == '%' - byte = data[pct_index + 1..pct_index + 2] - else - # An expected percent sign was missing. The whole character is invalid. - return [] - end - '%' + byte + return [] unless data[pct_index] == '%' + + # An expected percent sign was missing. The whole character is invalid. + "%#{data[(pct_index + 1)..(pct_index + 2)]}" end end diff --git a/spec/middleware_spec.rb b/spec/middleware_spec.rb index 093b853..a94f294 100644 --- a/spec/middleware_spec.rb +++ b/spec/middleware_spec.rb @@ -6,10 +6,19 @@ module UTF8Cleaner describe Middleware do let :new_env do - Middleware.new(nil).send(:sanitize_env, env) + described_class.new(nil).send(:sanitize_env, env) end - describe "with a big nasty env" do + # Ensure that all cleaned values parse cleanly. + # E.g. make sure Rack/Rails won't choke on them + after do + cleaned = new_env + env.keys.reject { |key| key == 'rack.input' }.each do |key| + URI.decode_www_form_component(cleaned[key]) if cleaned[key] + end + end + + describe 'with a big nasty env' do let :env do { 'path_info' => 'foo/%FFbar%2e%2fbaz%26%3B', @@ -17,13 +26,13 @@ module UTF8Cleaner 'http_referer' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8', 'http_user_agent' => "Android Versi\xF3n/4.0\x93", 'request_uri' => '%C3%89%E2%9C%93', - 'rack.input' => StringIO.new("foo=%FFbar%F8"), + 'rack.input' => StringIO.new('foo=%FFbar%F8'), 'content_type' => 'application/x-www-form-urlencoded', 'http_cookie' => nil } end - describe "removes invalid %-encoded UTF-8 sequences" do + describe 'removes invalid %-encoded UTF-8 sequences' do it { expect(new_env['query_string']).to eq('foo=bar') } it { expect(new_env['http_referer']).to eq('http://example.com/blog+Result:+++++') } it { expect(new_env['rack.input'].read).to eq('foo=bar') } @@ -33,45 +42,46 @@ module UTF8Cleaner it { expect(new_env['http_user_agent']).to eq("Android Versi\u00F3n/4.0\u201C") } end - describe "leaves all valid characters untouched" do + describe 'leaves all valid characters untouched' do it { expect(new_env['path_info']).to eq('foo/bar%2e%2fbaz%26%3B') } it { expect(new_env['request_uri']).to eq('%C3%89%E2%9C%93') } end - describe "when rack.input is wrapped" do + describe 'when rack.input is wrapped' do # rack.input responds only to methods gets, each, rewind, read and close # Rack::Lint::InputWrapper is the class which servers wrappers are based on - it "removes invalid UTF-8 sequences" do - wrapped_rack_input = Rack::Lint::Wrapper::InputWrapper.new(StringIO.new("foo=%FFbar%F8")) - env.merge!('rack.input' => wrapped_rack_input) - new_env = Middleware.new(nil).send(:sanitize_env, env) + it 'removes invalid UTF-8 sequences' do + wrapped_rack_input = Rack::Lint::Wrapper::InputWrapper.new(StringIO.new('foo=%FFbar%F8')) + env['rack.input'] = wrapped_rack_input + new_env = described_class.new(nil).send(:sanitize_env, env) expect(new_env['rack.input'].read).to eq('foo=bar') end end - describe "when binary data is POSTed" do + describe 'when binary data is POSTed' do before do env['content_type'] = 'multipart/form-data' end - it "leaves the body alone" do + + it 'leaves the body alone' do env['rack.input'].rewind - expect(new_env['rack.input'].read).to eq "foo=%FFbar%F8" + expect(new_env['rack.input'].read).to eq 'foo=%FFbar%F8' end end - describe "when json data is POSTed" do + describe 'when json data is POSTed' do before do env['content_type'] = 'application/json' end - it "tidies invalid UTF-8 sequences" do - env['rack.input'] = StringIO.new(%Q({"foo": "\xFFbar\xF8"})) + it 'tidies invalid UTF-8 sequences' do + env['rack.input'] = StringIO.new(%({"foo": "\xFFbar\xF8"})) env['rack.input'].rewind - expect(new_env['rack.input'].read).to eq(%Q({"foo": "\u00FFbar\u00F8"})) + expect(new_env['rack.input'].read).to eq(%({"foo": "\u00FFbar\u00F8"})) end - it "does not attempt to URI-decode data" do - json = %Q({"foo": "%FF"}) + it 'does not attempt to URI-decode data' do + json = %({"foo": "%FF"}) env['rack.input'] = StringIO.new(json) env['rack.input'].rewind expect(new_env['rack.input'].read).to eq(json) @@ -79,7 +89,7 @@ module UTF8Cleaner end end - describe "with a minimal env" do + describe 'with a minimal env' do let(:env) do { 'path_info' => '/this/is/safe', @@ -87,23 +97,14 @@ module UTF8Cleaner } end - it "only runs URIString cleaning on potentially unclean strings" do + it 'only runs URIString cleaning on potentially unclean strings' do expect(URIString).to receive(:new).once.and_call_original new_env end - it "leaves clean values alone" do + it 'leaves clean values alone' do expect(new_env['path_info']).to eq('/this/is/safe') end end - - # Ensure that all cleaned values parse cleanly. - # E.g. make sure Rack/Rails won't choke on them - after do - cleaned = new_env - env.keys.reject { |key| key == 'rack.input' }.each do |key| - URI.decode_www_form_component(cleaned[key]) if cleaned[key] - end - end end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index e2999ac..d957122 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -9,8 +9,8 @@ # order dependency and want to debug it, you can fix the order by providing # the seed, which is printed after each run. # --seed 1234 - config.order = "random" + config.order = 'random' - config.filter_run :focus => true + config.filter_run focus: true config.run_all_when_everything_filtered = true end diff --git a/spec/uri_string_spec.rb b/spec/uri_string_spec.rb index 0918583..159a85e 100644 --- a/spec/uri_string_spec.rb +++ b/spec/uri_string_spec.rb @@ -3,23 +3,22 @@ require 'spec_helper' module UTF8Cleaner - describe URIString do - let(:invalid_string) { URIString.new('%FF') } - let(:ascii_string) { URIString.new('foo') } - let(:encoded_string) { URIString.new('%26') } - let(:multibyte_string) { URIString.new('%E2%9C%93') } + let(:invalid_string) { described_class.new('%FF') } + let(:ascii_string) { described_class.new('foo') } + let(:encoded_string) { described_class.new('%26') } + let(:multibyte_string) { described_class.new('%E2%9C%93') } let(:complex_invalid_string) do # ------------ foo/ bar. / baz& ; √ baz - URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') + described_class.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') end - let(:no_byte_at_all) { URIString.new('%') } - let(:not_even_hex_chars1) { URIString.new('%x') } - let(:not_even_hex_chars2) { URIString.new('%0zhey') } - let(:mixed_encodings) { URIString.new('§%e2') } + let(:no_byte_at_all) { described_class.new('%') } + let(:not_even_hex_chars1) { described_class.new('%x') } + let(:not_even_hex_chars2) { described_class.new('%0zhey') } + let(:mixed_encodings) { described_class.new('§%e2') } describe '#new' do - it { expect(encoded_string).to be_a(URIString) } + it { expect(encoded_string).to be_a(described_class) } end describe '#cleaned' do @@ -38,12 +37,12 @@ module UTF8Cleaner it { expect(ascii_string).to be_valid } it { expect(encoded_string).to be_valid } it { expect(multibyte_string).to be_valid } - it { expect(invalid_string).to_not be_valid } - it { expect(complex_invalid_string).to_not be_valid } - it { expect(no_byte_at_all).to_not be_valid } - it { expect(not_even_hex_chars1).to_not be_valid } - it { expect(not_even_hex_chars2).to_not be_valid } - it { expect(mixed_encodings).to_not be_valid } + it { expect(invalid_string).not_to be_valid } + it { expect(complex_invalid_string).not_to be_valid } + it { expect(no_byte_at_all).not_to be_valid } + it { expect(not_even_hex_chars1).not_to be_valid } + it { expect(not_even_hex_chars2).not_to be_valid } + it { expect(mixed_encodings).not_to be_valid } end end end diff --git a/utf8-cleaner.gemspec b/utf8-cleaner.gemspec index 0cf3a23..7d6c6d3 100644 --- a/utf8-cleaner.gemspec +++ b/utf8-cleaner.gemspec @@ -7,14 +7,12 @@ Gem::Specification.new do |gem| gem.version = UTF8Cleaner::VERSION gem.authors = ['Leon Miller-Out'] gem.email = ['leon@singlebrook.com'] - gem.description = %q{Rack middleware to remove invalid UTF8 characters from web requests.} - gem.summary = %q{Prevent annoying 'invalid byte sequence in UTF-8' errors} + gem.description = 'Rack middleware to remove invalid UTF8 characters from web requests.' + gem.summary = 'Prevent annoying "invalid byte sequence in UTF-8" errors.' gem.homepage = 'https://github.com/singlebrook/utf8-cleaner' gem.license = 'MIT' - gem.files = `git ls-files`.split($/) - gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } - gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) + gem.files = Dir['lib/**/*'] + %w[README.md LICENSE.txt] gem.require_paths = ['lib'] gem.required_ruby_version = '>= 2.4' @@ -22,6 +20,5 @@ Gem::Specification.new do |gem| gem.add_dependency 'activesupport' gem.add_dependency 'rack', '>= 3.0' - gem.add_development_dependency 'rake' - gem.add_development_dependency 'rspec' + gem.metadata['rubygems_mfa_required'] = 'true' end From 49be149193b6c15f00ad14457e68dee90300e6fa Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:40:22 +0900 Subject: [PATCH 2/9] Fix failing tests --- .github/workflows/test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bcebd6e..c1af27a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -46,6 +46,10 @@ jobs: - macos-latest - windows-latest exclude: + - ruby: ruby-2.4 + os: macos-latest + - ruby: ruby-2.5 + os: macos-latest - ruby: jruby-9.4 os: windows-latest - ruby: truffleruby From 9f05ad8c9da9f9a0d99732e7d44f0ffa70776e58 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:42:35 +0900 Subject: [PATCH 3/9] Downgrade rubocop runner ruby version --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c1af27a..4b02914 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,10 +12,10 @@ jobs: CI: true steps: - uses: actions/checkout@v3 - - name: Set up Ruby 3.4 + - name: Set up Ruby 3.3 uses: ruby/setup-ruby@v1 with: - ruby-version: 3.4 + ruby-version: 3.3 bundler-cache: true - name: Run RuboCop run: bundle exec rubocop --parallel From 6814f42fbbd6f4466249b2e20f528ab8a2d0250b Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:51:56 +0900 Subject: [PATCH 4/9] Fix rubocop --- .rubocop.yml | 29 +++++++++++++++++------------ Gemfile | 3 +++ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/.rubocop.yml b/.rubocop.yml index d5b6aa2..1c96486 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -1,9 +1,8 @@ -#inherit_from: .rubocop_todo.yml - -plugins: - - rubocop-performance - - rubocop-rake - - rubocop-rspec +# TODO: Enable plugins when upgrading to Rubocop +# plugins: +# - rubocop-performance +# - rubocop-rake +# - rubocop-rspec AllCops: TargetRubyVersion: 2.4 @@ -17,6 +16,9 @@ Layout/LineLength: Metrics/AbcSize: Enabled: false +Metrics/BlockLength: + Enabled: false + Metrics/BlockNesting: Enabled: false @@ -27,11 +29,14 @@ Naming/FileName: Exclude: - 'lib/utf8-cleaner.rb' -RSpec/IndexedLet: +Style/AsciiComments: Enabled: false -RSpec/MessageSpies: - EnforcedStyle: receive - -RSpec/MultipleMemoizedHelpers: - Enabled: false +# RSpec/IndexedLet: +# Enabled: false +# +# RSpec/MessageSpies: +# EnforcedStyle: receive +# +# RSpec/MultipleMemoizedHelpers: +# Enabled: false diff --git a/Gemfile b/Gemfile index 60601ec..3fc3d86 100644 --- a/Gemfile +++ b/Gemfile @@ -5,5 +5,8 @@ source 'https://rubygems.org' gem 'rake' gem 'rspec' gem 'rubocop', '< 1.13.0' +gem 'rubocop-performance' +gem 'rubocop-rake' +gem 'rubocop-rspec' gemspec From b2385d6db380539f9da14827763187554bcf4c68 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Wed, 6 Aug 2025 16:20:21 +0900 Subject: [PATCH 5/9] Cleanup modules in tests --- spec/middleware_spec.rb | 162 ++++++++++++++++++++-------------------- spec/uri_string_spec.rb | 76 +++++++++---------- 2 files changed, 117 insertions(+), 121 deletions(-) diff --git a/spec/middleware_spec.rb b/spec/middleware_spec.rb index a94f294..0ab9850 100644 --- a/spec/middleware_spec.rb +++ b/spec/middleware_spec.rb @@ -3,108 +3,106 @@ require 'spec_helper' require 'rack/lint' -module UTF8Cleaner - describe Middleware do - let :new_env do - described_class.new(nil).send(:sanitize_env, env) +describe UTF8Cleaner::Middleware do + let :new_env do + described_class.new(nil).send(:sanitize_env, env) + end + + # Ensure that all cleaned values parse cleanly. + # E.g. make sure Rack/Rails won't choke on them + after do + cleaned = new_env + env.keys.reject { |key| key == 'rack.input' }.each do |key| + URI.decode_www_form_component(cleaned[key]) if cleaned[key] end + end - # Ensure that all cleaned values parse cleanly. - # E.g. make sure Rack/Rails won't choke on them - after do - cleaned = new_env - env.keys.reject { |key| key == 'rack.input' }.each do |key| - URI.decode_www_form_component(cleaned[key]) if cleaned[key] - end + describe 'with a big nasty env' do + let :env do + { + 'path_info' => 'foo/%FFbar%2e%2fbaz%26%3B', + 'query_string' => 'foo=bar%FF', + 'http_referer' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8', + 'http_user_agent' => "Android Versi\xF3n/4.0\x93", + 'request_uri' => '%C3%89%E2%9C%93', + 'rack.input' => StringIO.new('foo=%FFbar%F8'), + 'content_type' => 'application/x-www-form-urlencoded', + 'http_cookie' => nil + } end - describe 'with a big nasty env' do - let :env do - { - 'path_info' => 'foo/%FFbar%2e%2fbaz%26%3B', - 'query_string' => 'foo=bar%FF', - 'http_referer' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8', - 'http_user_agent' => "Android Versi\xF3n/4.0\x93", - 'request_uri' => '%C3%89%E2%9C%93', - 'rack.input' => StringIO.new('foo=%FFbar%F8'), - 'content_type' => 'application/x-www-form-urlencoded', - 'http_cookie' => nil - } - end + describe 'removes invalid %-encoded UTF-8 sequences' do + it { expect(new_env['query_string']).to eq('foo=bar') } + it { expect(new_env['http_referer']).to eq('http://example.com/blog+Result:+++++') } + it { expect(new_env['rack.input'].read).to eq('foo=bar') } + end - describe 'removes invalid %-encoded UTF-8 sequences' do - it { expect(new_env['query_string']).to eq('foo=bar') } - it { expect(new_env['http_referer']).to eq('http://example.com/blog+Result:+++++') } - it { expect(new_env['rack.input'].read).to eq('foo=bar') } - end + describe 'replaces \x-encoded characters from the ISO-8859-1 and CP1252 code pages with their UTF-8 equivalents' do + it { expect(new_env['http_user_agent']).to eq("Android Versi\u00F3n/4.0\u201C") } + end + + describe 'leaves all valid characters untouched' do + it { expect(new_env['path_info']).to eq('foo/bar%2e%2fbaz%26%3B') } + it { expect(new_env['request_uri']).to eq('%C3%89%E2%9C%93') } + end - describe 'replaces \x-encoded characters from the ISO-8859-1 and CP1252 code pages with their UTF-8 equivalents' do - it { expect(new_env['http_user_agent']).to eq("Android Versi\u00F3n/4.0\u201C") } + describe 'when rack.input is wrapped' do + # rack.input responds only to methods gets, each, rewind, read and close + # Rack::Lint::InputWrapper is the class which servers wrappers are based on + it 'removes invalid UTF-8 sequences' do + wrapped_rack_input = Rack::Lint::Wrapper::InputWrapper.new(StringIO.new('foo=%FFbar%F8')) + env['rack.input'] = wrapped_rack_input + new_env = described_class.new(nil).send(:sanitize_env, env) + expect(new_env['rack.input'].read).to eq('foo=bar') end + end - describe 'leaves all valid characters untouched' do - it { expect(new_env['path_info']).to eq('foo/bar%2e%2fbaz%26%3B') } - it { expect(new_env['request_uri']).to eq('%C3%89%E2%9C%93') } + describe 'when binary data is POSTed' do + before do + env['content_type'] = 'multipart/form-data' end - describe 'when rack.input is wrapped' do - # rack.input responds only to methods gets, each, rewind, read and close - # Rack::Lint::InputWrapper is the class which servers wrappers are based on - it 'removes invalid UTF-8 sequences' do - wrapped_rack_input = Rack::Lint::Wrapper::InputWrapper.new(StringIO.new('foo=%FFbar%F8')) - env['rack.input'] = wrapped_rack_input - new_env = described_class.new(nil).send(:sanitize_env, env) - expect(new_env['rack.input'].read).to eq('foo=bar') - end + it 'leaves the body alone' do + env['rack.input'].rewind + expect(new_env['rack.input'].read).to eq 'foo=%FFbar%F8' end + end - describe 'when binary data is POSTed' do - before do - env['content_type'] = 'multipart/form-data' - end + describe 'when json data is POSTed' do + before do + env['content_type'] = 'application/json' + end - it 'leaves the body alone' do - env['rack.input'].rewind - expect(new_env['rack.input'].read).to eq 'foo=%FFbar%F8' - end + it 'tidies invalid UTF-8 sequences' do + env['rack.input'] = StringIO.new(%({"foo": "\xFFbar\xF8"})) + env['rack.input'].rewind + expect(new_env['rack.input'].read).to eq(%({"foo": "\u00FFbar\u00F8"})) end - describe 'when json data is POSTed' do - before do - env['content_type'] = 'application/json' - end - - it 'tidies invalid UTF-8 sequences' do - env['rack.input'] = StringIO.new(%({"foo": "\xFFbar\xF8"})) - env['rack.input'].rewind - expect(new_env['rack.input'].read).to eq(%({"foo": "\u00FFbar\u00F8"})) - end - - it 'does not attempt to URI-decode data' do - json = %({"foo": "%FF"}) - env['rack.input'] = StringIO.new(json) - env['rack.input'].rewind - expect(new_env['rack.input'].read).to eq(json) - end + it 'does not attempt to URI-decode data' do + json = %({"foo": "%FF"}) + env['rack.input'] = StringIO.new(json) + env['rack.input'].rewind + expect(new_env['rack.input'].read).to eq(json) end end + end - describe 'with a minimal env' do - let(:env) do - { - 'path_info' => '/this/is/safe', - 'query_string' => 'foo=bar%FF' - } - end + describe 'with a minimal env' do + let(:env) do + { + 'path_info' => '/this/is/safe', + 'query_string' => 'foo=bar%FF' + } + end - it 'only runs URIString cleaning on potentially unclean strings' do - expect(URIString).to receive(:new).once.and_call_original - new_env - end + it 'only runs URIString cleaning on potentially unclean strings' do + expect(UTF8Cleaner::URIString).to receive(:new).once.and_call_original + new_env + end - it 'leaves clean values alone' do - expect(new_env['path_info']).to eq('/this/is/safe') - end + it 'leaves clean values alone' do + expect(new_env['path_info']).to eq('/this/is/safe') end end end diff --git a/spec/uri_string_spec.rb b/spec/uri_string_spec.rb index 159a85e..d747928 100644 --- a/spec/uri_string_spec.rb +++ b/spec/uri_string_spec.rb @@ -2,47 +2,45 @@ require 'spec_helper' -module UTF8Cleaner - describe URIString do - let(:invalid_string) { described_class.new('%FF') } - let(:ascii_string) { described_class.new('foo') } - let(:encoded_string) { described_class.new('%26') } - let(:multibyte_string) { described_class.new('%E2%9C%93') } - let(:complex_invalid_string) do - # ------------ foo/ bar. / baz& ; √ baz - described_class.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') - end - let(:no_byte_at_all) { described_class.new('%') } - let(:not_even_hex_chars1) { described_class.new('%x') } - let(:not_even_hex_chars2) { described_class.new('%0zhey') } - let(:mixed_encodings) { described_class.new('§%e2') } +describe UTF8Cleaner::URIString do + let(:invalid_string) { described_class.new('%FF') } + let(:ascii_string) { described_class.new('foo') } + let(:encoded_string) { described_class.new('%26') } + let(:multibyte_string) { described_class.new('%E2%9C%93') } + let(:complex_invalid_string) do + # ------------ foo/ bar. / baz& ; √ baz + described_class.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') + end + let(:no_byte_at_all) { described_class.new('%') } + let(:not_even_hex_chars1) { described_class.new('%x') } + let(:not_even_hex_chars2) { described_class.new('%0zhey') } + let(:mixed_encodings) { described_class.new('§%e2') } - describe '#new' do - it { expect(encoded_string).to be_a(described_class) } - end + describe '#new' do + it { expect(encoded_string).to be_a(described_class) } + end - describe '#cleaned' do - it { expect(invalid_string.cleaned).to eq('') } - it { expect(ascii_string.cleaned).to eq('foo') } - it { expect(encoded_string.cleaned).to eq('%26') } - it { expect(multibyte_string.cleaned).to eq('%E2%9C%93') } - it { expect(complex_invalid_string.cleaned).to eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') } - it { expect(no_byte_at_all.cleaned).to eq('') } - it { expect(not_even_hex_chars1.cleaned).to eq('') } - it { expect(not_even_hex_chars2.cleaned).to eq('hey') } - it { expect(mixed_encodings.cleaned).to eq('§') } - end + describe '#cleaned' do + it { expect(invalid_string.cleaned).to eq('') } + it { expect(ascii_string.cleaned).to eq('foo') } + it { expect(encoded_string.cleaned).to eq('%26') } + it { expect(multibyte_string.cleaned).to eq('%E2%9C%93') } + it { expect(complex_invalid_string.cleaned).to eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') } + it { expect(no_byte_at_all.cleaned).to eq('') } + it { expect(not_even_hex_chars1.cleaned).to eq('') } + it { expect(not_even_hex_chars2.cleaned).to eq('hey') } + it { expect(mixed_encodings.cleaned).to eq('§') } + end - describe '#valid?' do - it { expect(ascii_string).to be_valid } - it { expect(encoded_string).to be_valid } - it { expect(multibyte_string).to be_valid } - it { expect(invalid_string).not_to be_valid } - it { expect(complex_invalid_string).not_to be_valid } - it { expect(no_byte_at_all).not_to be_valid } - it { expect(not_even_hex_chars1).not_to be_valid } - it { expect(not_even_hex_chars2).not_to be_valid } - it { expect(mixed_encodings).not_to be_valid } - end + describe '#valid?' do + it { expect(ascii_string).to be_valid } + it { expect(encoded_string).to be_valid } + it { expect(multibyte_string).to be_valid } + it { expect(invalid_string).not_to be_valid } + it { expect(complex_invalid_string).not_to be_valid } + it { expect(no_byte_at_all).not_to be_valid } + it { expect(not_even_hex_chars1).not_to be_valid } + it { expect(not_even_hex_chars2).not_to be_valid } + it { expect(mixed_encodings).not_to be_valid } end end From 27658e61a6a5e84116b6a03cd8d4e90bbf88ceb7 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Wed, 6 Aug 2025 17:53:52 +0900 Subject: [PATCH 6/9] Use uppercase headers --- lib/utf8-cleaner/middleware.rb | 16 ++++++++-------- spec/middleware_spec.rb | 34 +++++++++++++++++----------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/lib/utf8-cleaner/middleware.rb b/lib/utf8-cleaner/middleware.rb index c09804b..23f24ab 100644 --- a/lib/utf8-cleaner/middleware.rb +++ b/lib/utf8-cleaner/middleware.rb @@ -7,13 +7,13 @@ module UTF8Cleaner # environment variables and request input. class Middleware SANITIZE_ENV_KEYS = %w[ - http_referer - http_user_agent - path_info - query_string - request_path - request_uri - http_cookie + HTTP_REFERER + HTTP_USER_AGENT + PATH_INFO + QUERY_STRING + REQUEST_PATH + REQUEST_URI + HTTP_COOKIE ].freeze def initialize(app) @@ -46,7 +46,7 @@ def sanitize_env_keys(env) def sanitize_env_rack_input(env) return unless env['rack.input'] - case env['content_type'] + case env['CONTENT_TYPE'] when %r{\Aapplication/x-www-form-urlencoded}i # This data gets the full cleaning treatment input_data = read_input(env['rack.input']) diff --git a/spec/middleware_spec.rb b/spec/middleware_spec.rb index 0ab9850..1ab31b8 100644 --- a/spec/middleware_spec.rb +++ b/spec/middleware_spec.rb @@ -20,30 +20,30 @@ describe 'with a big nasty env' do let :env do { - 'path_info' => 'foo/%FFbar%2e%2fbaz%26%3B', - 'query_string' => 'foo=bar%FF', - 'http_referer' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8', - 'http_user_agent' => "Android Versi\xF3n/4.0\x93", - 'request_uri' => '%C3%89%E2%9C%93', + 'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B', + 'QUERY_STRING' => 'foo=bar%FF', + 'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8', + 'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0\x93", + 'REQUEST_URI' => '%C3%89%E2%9C%93', 'rack.input' => StringIO.new('foo=%FFbar%F8'), - 'content_type' => 'application/x-www-form-urlencoded', - 'http_cookie' => nil + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + 'HTTP_COOKIE' => nil } end describe 'removes invalid %-encoded UTF-8 sequences' do - it { expect(new_env['query_string']).to eq('foo=bar') } - it { expect(new_env['http_referer']).to eq('http://example.com/blog+Result:+++++') } + it { expect(new_env['QUERY_STRING']).to eq('foo=bar') } + it { expect(new_env['HTTP_REFERER']).to eq('http://example.com/blog+Result:+++++') } it { expect(new_env['rack.input'].read).to eq('foo=bar') } end describe 'replaces \x-encoded characters from the ISO-8859-1 and CP1252 code pages with their UTF-8 equivalents' do - it { expect(new_env['http_user_agent']).to eq("Android Versi\u00F3n/4.0\u201C") } + it { expect(new_env['HTTP_USER_AGENT']).to eq("Android Versi\u00F3n/4.0\u201C") } end describe 'leaves all valid characters untouched' do - it { expect(new_env['path_info']).to eq('foo/bar%2e%2fbaz%26%3B') } - it { expect(new_env['request_uri']).to eq('%C3%89%E2%9C%93') } + it { expect(new_env['PATH_INFO']).to eq('foo/bar%2e%2fbaz%26%3B') } + it { expect(new_env['REQUEST_URI']).to eq('%C3%89%E2%9C%93') } end describe 'when rack.input is wrapped' do @@ -59,7 +59,7 @@ describe 'when binary data is POSTed' do before do - env['content_type'] = 'multipart/form-data' + env['CONTENT_TYPE'] = 'multipart/form-data' end it 'leaves the body alone' do @@ -70,7 +70,7 @@ describe 'when json data is POSTed' do before do - env['content_type'] = 'application/json' + env['CONTENT_TYPE'] = 'application/json' end it 'tidies invalid UTF-8 sequences' do @@ -91,8 +91,8 @@ describe 'with a minimal env' do let(:env) do { - 'path_info' => '/this/is/safe', - 'query_string' => 'foo=bar%FF' + 'PATH_INFO' => '/this/is/safe', + 'QUERY_STRING' => 'foo=bar%FF' } end @@ -102,7 +102,7 @@ end it 'leaves clean values alone' do - expect(new_env['path_info']).to eq('/this/is/safe') + expect(new_env['PATH_INFO']).to eq('/this/is/safe') end end end From 1afceddc9f814085337bc40d2824ef1b9bd5de29 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Wed, 6 Aug 2025 20:33:58 +0900 Subject: [PATCH 7/9] Add integration test --- Gemfile | 1 + spec/integration_spec.rb | 580 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 581 insertions(+) create mode 100644 spec/integration_spec.rb diff --git a/Gemfile b/Gemfile index 3fc3d86..9761b56 100644 --- a/Gemfile +++ b/Gemfile @@ -2,6 +2,7 @@ source 'https://rubygems.org' +gem 'rack-test' gem 'rake' gem 'rspec' gem 'rubocop', '< 1.13.0' diff --git a/spec/integration_spec.rb b/spec/integration_spec.rb new file mode 100644 index 0000000..254b5b7 --- /dev/null +++ b/spec/integration_spec.rb @@ -0,0 +1,580 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'rack' +require 'rack/test' +require 'json' + +describe 'UTF8Cleaner::Middleware Integration' do + include Rack::Test::Methods + + let(:app) do + Rack::Builder.new do + use UTF8Cleaner::Middleware + run(lambda do |env| + # Create a Rack::Request to properly parse parameters + request = Rack::Request.new(env) + + # Echo back the sanitized environment for verification + response_body = { + path_info: env['PATH_INFO'], + query_string: env['QUERY_STRING'], + request_uri: env['REQUEST_URI'], + request_path: env['REQUEST_PATH'], + http_referer: env['HTTP_REFERER'], + http_user_agent: env['HTTP_USER_AGENT'], + http_cookie: env['HTTP_COOKIE'] + } + + # Include parsed params for form data testing + if env['REQUEST_METHOD'] == 'POST' && env['CONTENT_TYPE']&.match?(%r{application/x-www-form-urlencoded}i) + response_body[:params] = begin + request.POST + rescue StandardError + {} + end + elsif env['CONTENT_TYPE']&.match?(%r{application/json}i) + env['rack.input'].rewind + response_body[:payload] = env['rack.input'].read + env['rack.input'].rewind + elsif env['CONTENT_TYPE']&.match?(%r{multipart/form-data}i) + response_body[:multipart_size] = env['rack.input'].read.bytesize + env['rack.input'].rewind + end + + begin + [200, { 'Content-Type' => 'application/json' }, [response_body.to_json]] + rescue JSON::GeneratorError + [200, { 'Content-Type' => 'text/plain' }, response_body[:payload]] + end + end) + end + end + + describe 'GET requests' do + context 'with invalid UTF-8' do + it 'removes invalid percent-encoded sequences from query string' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'name=%FFbad%F8&valid=true' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('name=bad&valid=true') + end + + it 'cleans invalid sequences from path_info' do + env = Rack::MockRequest.env_for('/test/path') + env['PATH_INFO'] = '/test/%FFbad%F8/path' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['path_info']).to eq('/test/bad/path') + end + + it 'handles mixed valid and invalid encodings' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'good=%E2%9C%93&bad=%FF&mixed=%E2%9C%FF' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('good=%E2%9C%93&bad=&mixed=') + end + + it 'handles truncated multibyte sequences' do + # %E2%9C%93 is ✓, but truncate it at different points + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'truncated=%E2%9C&complete=%E2%9C%93' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('truncated=&complete=%E2%9C%93') + end + + it 'handles overlong UTF-8 sequences' do + # Overlong encoding of '/' (should be %2F but encoded as %C0%AF) + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'overlong=%C0%AF&normal=text' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('overlong=&normal=text') + end + + it 'handles invalid continuation bytes' do + # Valid start byte but invalid continuation + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'invalid=%E2%FF%93&valid=test' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('invalid=&valid=test') + end + end + + context 'percent encodings' do + it 'preserves valid percent-encoded characters' do + env = Rack::MockRequest.env_for('/test%2Fpath?foo=%26bar%3D1') + _status, _headers, body = app.call(env) + + response = JSON.parse(body.first) + expect(response['path_info']).to eq('/test%2Fpath') + expect(response['query_string']).to eq('foo=%26bar%3D1') + end + + it 'removes incomplete percent encodings' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'incomplete=%2' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('incomplete=') + end + + it 'removes percent signs without hex chars' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'invalid=%GG&partial=%F' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('invalid=G&partial=') + end + + it 'removes bare percent signs' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = 'percent=%&value=test' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('percent=value=test') + end + end + end + + describe 'POST requests' do + context 'with form-encoded data' do + it 'cleans invalid UTF-8 in form parameters' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => 'name=%FFbad%F8&email=test@example.com') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ + 'name' => 'bad', + 'email' => 'test@example.com' + }) + end + + it 'preserves valid UTF-8 in form data' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => 'name=José&city=São+Paulo') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ + 'name' => 'José', + 'city' => 'São Paulo' + }) + end + + it 'handles complex mixed encodings in forms' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => 'valid=%E2%9C%93&invalid=%FF&text=hello%20world') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ + 'valid' => '✓', + 'invalid' => '', + 'text' => 'hello world' + }) + end + + it 'handles form data with no content type' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + :input => 'test=%FFdata') + + status, _headers, _body = app.call(env) + expect(status).to eq(200) + end + + it 'handles empty POST body' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + 'CONTENT_LENGTH' => '0', + :input => '') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({}) + end + end + + context 'with JSON data' do + it 'tidies invalid UTF-8 bytes in JSON' do + pending 'This case is not currently handled correctly' + + # Create JSON with invalid UTF-8 bytes + json_with_invalid = "{\"name\": \"test\xFF\xF8\", \"valid\": \"✓\"}" + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/json', + :input => json_with_invalid) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + # Invalid bytes should be replaced with replacement characters + expect(JSON.parse(response['payload'])).to eq({ + 'name' => "test\u00FF\u00F8", + 'valid' => '✓' + }) + end + + it 'does not URI-decode JSON content' do + json = { 'encoded' => '%20%26%3D', 'normal' => 'text' }.to_json + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/json', + :input => json) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(JSON.parse(response['payload'])).to eq({ + 'encoded' => '%20%26%3D', + 'normal' => 'text' + }) + end + + it 'handles valid UTF-8 JSON unchanged' do + json = { 'name' => 'José', 'emoji' => '😀', 'check' => '✓' }.to_json + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/json', + :input => json) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(JSON.parse(response['payload'])).to eq({ + 'name' => 'José', + 'emoji' => '😀', + 'check' => '✓' + }) + end + end + + context 'with multipart/form-data' do + it 'handles multipart form data with invalid UTF-8 in field names' do + input = <<~INPUT + --AaB03x\r + content-disposition: form-data; name="field%FFname"\r + \r + value\r + --AaB03x--\r + INPUT + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'multipart/form-data; boundary=AaB03x', + 'CONTENT_LENGTH' => input.size.to_s, + :input => input) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['multipart_size']).to be > 0 + end + + it 'does not modify binary content in multipart' do + # Create binary content with invalid UTF-8 + binary_content = (+"file\xFF\xF8content").force_encoding('BINARY') + + # Build multipart request manually + boundary = 'AaB03x' + input = "--#{boundary}\r\n" + input << "content-disposition: form-data; name=\"file\"; filename=\"test.bin\"\r\n" + input << "content-type: application/octet-stream\r\n" + input << "\r\n" + input << binary_content + input << "\r\n--#{boundary}--\r\n" + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => "multipart/form-data; boundary=#{boundary}", + 'CONTENT_LENGTH' => input.bytesize.to_s, + :input => StringIO.new(input)) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['multipart_size']).to eq(input.bytesize) + end + end + end + + describe 'HTTP headers' do + context 'with invalid UTF-8 in headers' do + it 'cleans User-Agent header' do + env = Rack::MockRequest.env_for('/', 'HTTP_USER_AGENT' => "Mozilla/5.0\xFF\xF8") + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_user_agent']).to eq("Mozilla/5.0\u00FF\u00F8") + end + + it 'cleans Referer header with invalid percent encoding' do + env = Rack::MockRequest.env_for('/', 'HTTP_REFERER' => 'http://example.com/search?q=%FFbad%F8') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_referer']).to eq('http://example.com/search?q=bad') + end + + it 'cleans Cookie header' do + env = Rack::MockRequest.env_for('/', 'HTTP_COOKIE' => 'session=%FFbad%F8; user=john') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_cookie']).to eq('session=bad; user=john') + end + + it 'handles multiple cookies with invalid encoding' do + env = Rack::MockRequest.env_for('/', + 'HTTP_COOKIE' => 'first=%FFbad; second=good%20value; third=%F8invalid') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_cookie']).to eq('first=bad; second=good%20value; third=invalid') + end + end + + context 'with valid UTF-8 in headers' do + it 'preserves valid UTF-8 in headers' do + env = Rack::MockRequest.env_for('/', + 'HTTP_REFERER' => 'http://example.com/José', + 'HTTP_USER_AGENT' => 'Custom/1.0 (São Paulo)') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_referer']).to eq('http://example.com/José') + expect(response['http_user_agent']).to eq('Custom/1.0 (São Paulo)') + end + + it 'preserves valid percent-encoded UTF-8 in referer' do + env = Rack::MockRequest.env_for('/', + 'HTTP_REFERER' => 'http://example.com/search?q=%E2%9C%93') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['http_referer']).to eq('http://example.com/search?q=%E2%9C%93') + end + end + end + + describe 'REQUEST_URI and REQUEST_PATH' do + it 'cleans both REQUEST_URI and REQUEST_PATH' do + env = Rack::MockRequest.env_for('/test') + env['REQUEST_URI'] = '/test%FFpath?query=%FFparam' + env['REQUEST_PATH'] = '/test%FFpath' + env['QUERY_STRING'] = 'query=%FFparam' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['request_uri']).to eq('/testpath?query=param') + expect(response['request_path']).to eq('/testpath') + expect(response['query_string']).to eq('query=param') + end + + it 'handles REQUEST_URI with fragment identifiers' do + env = Rack::MockRequest.env_for('/') + env['REQUEST_URI'] = '/page?param=%FFvalue#section' + env['QUERY_STRING'] = 'param=%FFvalue' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['request_uri']).to eq('/page?param=value#section') + end + end + + describe 'edge cases' do + it 'handles empty rack.input gracefully' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => nil) + + status, _headers, _body = app.call(env) + expect(status).to eq(200) + end + + it 'handles very long invalid sequences' do + long_invalid = '%FF' * 100 + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = "data=#{long_invalid}" + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq('data=') + end + + it 'handles mixed case content-type headers' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'Application/X-WWW-Form-URLEncoded', + :input => 'test=%FFdata') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ 'test' => 'data' }) + end + + it 'preserves plus signs as spaces in form data' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => 'text=hello+world+test') + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ 'text' => 'hello world test' }) + end + + it 'handles null bytes in input' do + env = Rack::MockRequest.env_for('/') + env['QUERY_STRING'] = "null=\x00&text=hello" + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + # Should preserve null bytes that are valid + expect(response['query_string']).to include('text=hello') + end + + it 'handles Safari ajax POST body with null terminator' do + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => "foo=bar&quux=bla\0") + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to eq({ 'foo' => 'bar', 'quux' => 'bla' }) + end + end + + describe 'complex real-world scenarios' do + it 'handles a request with multiple invalid encodings across different parts' do + env = Rack::MockRequest.env_for('/pathinfo', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + 'HTTP_REFERER' => 'http://example.com/page%FFtest', + 'HTTP_USER_AGENT' => "Bot\xFF\xF8/1.0", + 'HTTP_COOKIE' => 'id=%FFbad', + :input => 'field=%FFvalue') + + env['PATH_INFO'] = '/path%FFinfo' + env['QUERY_STRING'] = 'param=%FFquery' + env['REQUEST_URI'] = '/path%FFinfo?param=%FFquery' + env['REQUEST_PATH'] = '/path%FFinfo' + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + + expect(response['path_info']).to eq('/pathinfo') + expect(response['query_string']).to eq('param=query') + expect(response['request_uri']).to eq('/pathinfo?param=query') + expect(response['request_path']).to eq('/pathinfo') + expect(response['http_referer']).to eq('http://example.com/pagetest') + expect(response['http_user_agent']).to eq("Bot\u00FF\u00F8/1.0") + expect(response['http_cookie']).to eq('id=bad') + expect(response['params']).to eq({ 'field' => 'value' }) + end + + it 'handles requests with byte sequences at buffer boundaries' do + # Test sequences that might be split at typical buffer sizes + # This tests the middleware's ability to handle partial sequences + env = Rack::MockRequest.env_for('/') + # Create a query string that puts invalid bytes at common buffer boundaries + env['QUERY_STRING'] = "#{'a' * 8190}%FF#{'b' * 10}" + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['query_string']).to eq(('a' * 8190) + ('b' * 10)) + end + end + + describe 'middleware composition' do + let(:app) do + Rack::Builder.new do + use UTF8Cleaner::Middleware + run ->(_env) { [200, { 'content_type' => 'text/plain' }, ['OK']] } + end + end + + it 'works correctly with other middleware' do + env = Rack::MockRequest.env_for('/?test=%FFbad') + status, _headers, body = app.call(env) + expect(status).to eq(200) + expect(body).to eq(['OK']) + end + end + + describe 'rack.input rewind behavior' do + it 'ensures rack.input can be read multiple times' do + test_app = Rack::Builder.new do + use UTF8Cleaner::Middleware + run lambda { |env| + # First read + env['rack.input'].rewind + first_read = env['rack.input'].read + + # Second read + env['rack.input'].rewind + second_read = env['rack.input'].read + + [ + 200, + { 'Content-Type' => 'application/json' }, + [{ first: first_read, second: second_read, same: first_read == second_read }.to_json] + ] + } + end + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => 'test=%FFdata') + + _status, _headers, body = test_app.call(env) + response = JSON.parse(body.first) + expect(response['first']).to eq('test=data') + expect(response['second']).to eq('test=data') + expect(response['same']).to be true + end + end + + describe 'performance considerations' do + it 'handles large payloads efficiently' do + # Create a large form payload with some invalid sequences + large_data = (1..1000).map { |i| "field#{i}=value#{i}" }.join('&') + large_data += '&invalid=%FFtest' + + env = Rack::MockRequest.env_for('/', + 'REQUEST_METHOD' => 'POST', + 'CONTENT_TYPE' => 'application/x-www-form-urlencoded', + :input => large_data) + + _status, _headers, body = app.call(env) + response = JSON.parse(body.first) + expect(response['params']).to include('field1' => 'value1') + expect(response['params']).to include('field1000' => 'value1000') + expect(response['params']).to include('invalid' => 'test') + end + end +end From a5dfaa41e23efd499c288da9298a29664b56583c Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Wed, 6 Aug 2025 20:37:24 +0900 Subject: [PATCH 8/9] Skip some specs --- spec/integration_spec.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spec/integration_spec.rb b/spec/integration_spec.rb index 254b5b7..faa5556 100644 --- a/spec/integration_spec.rb +++ b/spec/integration_spec.rb @@ -255,6 +255,8 @@ end it 'handles valid UTF-8 JSON unchanged' do + skip if RUBY_VERSION < '3.1' || defined?(JRuby) || defined?(TruffleRuby) + json = { 'name' => 'José', 'emoji' => '😀', 'check' => '✓' }.to_json env = Rack::MockRequest.env_for('/', @@ -294,6 +296,8 @@ end it 'does not modify binary content in multipart' do + skip if RUBY_VERSION < '3.1' || defined?(TruffleRuby) + # Create binary content with invalid UTF-8 binary_content = (+"file\xFF\xF8content").force_encoding('BINARY') From 979585add34515f603d2e94973bc4f7721827459 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Wed, 6 Aug 2025 20:40:01 +0900 Subject: [PATCH 9/9] Fix TruffleRuby --- spec/integration_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/integration_spec.rb b/spec/integration_spec.rb index faa5556..012895e 100644 --- a/spec/integration_spec.rb +++ b/spec/integration_spec.rb @@ -219,7 +219,7 @@ context 'with JSON data' do it 'tidies invalid UTF-8 bytes in JSON' do - pending 'This case is not currently handled correctly' + pending 'This case is not currently handled correctly' unless defined?(TruffleRuby) # Create JSON with invalid UTF-8 bytes json_with_invalid = "{\"name\": \"test\xFF\xF8\", \"valid\": \"✓\"}"