From f7b3235cfeec7d33152130e954e562c171c099ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C5=8Dan?=
Date: Sat, 14 Feb 2026 00:01:04 -0700
Subject: [PATCH] feat: make truncate filter Character Entity Reference aware
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The truncate filter now correctly handles HTML Character Entity References
(CERs) like &hellip; &#8230; &#x2026; in both the suffix string and the
input text.

Previously, &hellip; was counted as 8 characters by length(), causing the
truncate filter to remove more text than intended. Now CERs are counted by
their visual width (1 character each), and CERs in the input text are never
split mid-reference. A suffix that is visually longer than the requested
length is clipped on CER boundaries as well, so it can no longer be cut down
to a dangling "&".

Adds _visual_length() and _truncate_visual() helper functions.

Fixes RT#95707.
Ref: https://github.com/abw/Template2/pull/188

Co-Authored-By: Kōan
---
Reviewer note (not part of the commit message): this patch was rebuilt from
a copy whose line breaks and indentation had been collapsed. Context-line
whitespace and the blob ids on the "index" lines are therefore unverified;
try `git apply --ignore-whitespace` if a hunk does not match.

 lib/Template/Filters.pm         |  46 +++++++-
 lib/Template/Manual/Filters.pod |   9 +-
 t/truncate_cer.t                | 194 ++++++++++++++++++++++++++++++++
 3 files changed, 242 insertions(+), 7 deletions(-)
 create mode 100644 t/truncate_cer.t

diff --git a/lib/Template/Filters.pm b/lib/Template/Filters.pm
index deeb2f20..be8bb6b8 100644
--- a/lib/Template/Filters.pm
+++ b/lib/Template/Filters.pm
@@ -521,8 +521,10 @@ sub truncate_filter_factory {
     $len = $TRUNCATE_LENGTH unless defined $len;
     $char = $TRUNCATE_ADDON unless defined $char;

-    # Length of char is the minimum length
-    my $lchar = length $char;
+    # Calculate visual length of the addon, treating HTML character entity
+    # references (e.g. &hellip; &#8230; &#x2026;) as single characters
+    my $lchar = _visual_length($char);
     if ($len < $lchar) {
-        $char = substr($char, 0, $len);
+        # Clip by visual characters so a reference in the addon is never cut
+        $char = _truncate_visual($char, $len);
         $lchar = $len;
@@ -530,11 +532,45 @@ sub truncate_filter_factory {

     return sub {
         my $text = shift;
-        return $text if length $text <= $len;
-        return substr($text, 0, $len - $lchar) . $char;
+        return $text if _visual_length($text) <= $len;
+        return _truncate_visual($text, $len - $lchar) . $char;
+    }
+}
+
+
+#------------------------------------------------------------------------
+# _visual_length($str)
+#
+# Returns the "visual" length of a string, counting each HTML character
+# entity reference (&name; &#digits; &#xhex;) as a single character.
+#------------------------------------------------------------------------
+
+sub _visual_length {
+    my $str = shift;
+    (my $copy = $str) =~ s/&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);/_/g;
+    return length $copy;
+}
+
+
+#------------------------------------------------------------------------
+# _truncate_visual($str, $maxlen)
+#
+# Truncates $str to at most $maxlen visual characters, treating HTML
+# character entity references as single characters and never splitting
+# one in the middle.
+#------------------------------------------------------------------------
+sub _truncate_visual {
+    my ($str, $maxlen) = @_;
+    my $result = '';
+    my $vlen = 0;
+    while ($str =~ /\G(&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);|.)/gs) {
+        last if $vlen >= $maxlen;
+        $result .= $1;
+        $vlen++;
     }
+    return $result;
 }


diff --git a/lib/Template/Manual/Filters.pod b/lib/Template/Manual/Filters.pod
index 6c9fb571..1f5b0b7c 100644
--- a/lib/Template/Manual/Filters.pod
+++ b/lib/Template/Manual/Filters.pod
@@ -319,14 +319,19 @@ Output:
 If you want to use something other than 'C<...>' you can pass that as a
 second argument.

-    [% FILTER truncate(26, '&hellip;') %]
+    [% FILTER truncate(27, '&hellip;') %]
     I have much to say on this matter that has previously
     been said on more than one occasion.
     [% END %]

 Output:

-    I have much to say&hellip;
+    I have much to say on this&hellip;
+
+HTML character entity references (e.g. C<&hellip;>, C<&#8230;>, C<&#x2026;>)
+in the suffix are counted by their visual width (one character each), not by
+their string length. Entity references in the input text are also treated as
+single characters, so they will not be split in the middle.

 =head1 repeat(iterations)

diff --git a/t/truncate_cer.t b/t/truncate_cer.t
new file mode 100644
index 00000000..bc1f02e9
--- /dev/null
+++ b/t/truncate_cer.t
@@ -0,0 +1,194 @@
+#============================================================= -*-perl-*-
+#
+# t/truncate_cer.t
+#
+# Tests for truncate filter with HTML Character Entity Reference awareness.
+# Ensures that CERs like &hellip; &#8230; &#x2026; in the suffix (and in
+# the input text) are counted as single visual characters.
+#
+# Related: https://github.com/abw/Template2/pull/188
+#          RT#95707
+#
+#========================================================================
+
+use strict;
+use warnings;
+use lib qw( ./lib ../lib );
+use Template;
+use Template::Filters;
+use Test::More;
+
+my $tt = Template->new({ INTERPOLATE => 0 });
+
+my @tests = (
+    #--------------------------------------------------------------------
+    # Named entity in suffix (&hellip;)
+    #--------------------------------------------------------------------
+    {
+        name   => 'named entity suffix: &hellip; counts as 1 char',
+        input  => 'I have much to say on this matter that has previously been said.',
+        tmpl   => '[% text | truncate(27, "&hellip;") %]',
+        expect => 'I have much to say on this&hellip;',
+    },
+    {
+        name   => 'named entity suffix: &amp; counts as 1 char',
+        input  => 'The quick brown fox jumps over the lazy dog.',
+        tmpl   => '[% text | truncate(15, "&amp;more") %]',
+        # suffix visual length: &amp; (1) + more (4) = 5
+        # text visual chars: 15 - 5 = 10 → "The quick "
+        expect => 'The quick &amp;more',
+    },
+
+    #--------------------------------------------------------------------
+    # Numeric (decimal) entity in suffix (&#8230;)
+    #--------------------------------------------------------------------
+    {
+        name   => 'decimal entity suffix: &#8230; counts as 1 char',
+        input  => 'I have much to say on this matter that has previously been said.',
+        tmpl   => '[% text | truncate(27, "&#8230;") %]',
+        expect => 'I have much to say on this&#8230;',
+    },
+
+    #--------------------------------------------------------------------
+    # Numeric (hex) entity in suffix (&#x2026;)
+    #--------------------------------------------------------------------
+    {
+        name   => 'hex entity suffix: &#x2026; counts as 1 char',
+        input  => 'I have much to say on this matter that has previously been said.',
+        tmpl   => '[% text | truncate(27, "&#x2026;") %]',
+        expect => 'I have much to say on this&#x2026;',
+    },
+
+    #--------------------------------------------------------------------
+    # Plain suffix (no entities) — regression check
+    #--------------------------------------------------------------------
+    {
+        name   => 'plain suffix: ... still works (3 chars)',
+        input  => 'The cat sat on the mat and wondered.',
+        tmpl   => '[% text | truncate(10) %]',
+        expect => 'The cat...',
+    },
+    {
+        name   => 'no truncation needed',
+        input  => 'Short',
+        tmpl   => '[% text | truncate(10) %]',
+        expect => 'Short',
+    },
+    {
+        name   => 'exact length — no truncation',
+        input  => 'Hello World',
+        tmpl   => '[% text | truncate(11) %]',
+        expect => 'Hello World',
+    },
+    {
+        name   => 'len less than suffix — suffix itself truncated',
+        input  => 'Hello World',
+        tmpl   => '[% text | truncate(2) %]',
+        expect => '..',
+    },
+
+    #--------------------------------------------------------------------
+    # Multiple entities in suffix
+    #--------------------------------------------------------------------
+    {
+        name   => 'two entities in suffix count as 2 chars',
+        input  => 'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
+        tmpl   => '[% text | truncate(10, "&lt;&gt;") %]',
+        expect => 'ABCDEFGH&lt;&gt;',
+    },
+
+    #--------------------------------------------------------------------
+    # Mixed plain + entity chars in suffix
+    #--------------------------------------------------------------------
+    {
+        name   => 'mixed suffix: "...&hellip;" counts as 4 chars (3 dots + 1 entity)',
+        input  => 'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
+        tmpl   => '[% text | truncate(10, "...&hellip;") %]',
+        expect => 'ABCDEF...&hellip;',
+    },
+
+    #--------------------------------------------------------------------
+    # Entity in input text — should not be split
+    #--------------------------------------------------------------------
+    {
+        name   => 'entity in input text not split mid-reference',
+        input  => 'AB&amp;CDEFGHIJ',
+        tmpl   => '[% text | truncate(5, "...") %]',
+        # visual: A B &amp; C D E F G H I J = 11 visual chars
+        # truncate to 5: 5 - 3 = 2 visual chars of text
+        # visual chars: A, B → "AB..."
+        expect => 'AB...',
+    },
+    {
+        name   => 'entity in input text counted as 1 visual char',
+        input  => 'A&amp;B&lt;C&gt;DEFGHIJKLMNO',
+        tmpl   => '[% text | truncate(6, "...") %]',
+        # visual: A &amp; B &lt; C &gt; D E F G H I J K L M N O = 18 visual chars
+        # truncate to 6: 6 - 3 = 3 visual chars of text
+        # A, &amp;, B → "A&amp;B..."
+        expect => 'A&amp;B...',
+    },
+    {
+        name   => 'input with entity exactly at boundary',
+        input  => 'ABCD&hellip;FGHIJ',
+        tmpl   => '[% text | truncate(7, "...") %]',
+        # visual: A B C D &hellip; F G H I J = 10 visual chars
+        # truncate to 7: 7 - 3 = 4 visual chars of text
+        # A, B, C, D → "ABCD..."
+        expect => 'ABCD...',
+    },
+    {
+        name   => 'input short enough with entities — no truncation',
+        input  => 'A&amp;B',
+        tmpl   => '[% text | truncate(10) %]',
+        # visual length = 3, less than 10
+        expect => 'A&amp;B',
+    },
+
+    #--------------------------------------------------------------------
+    # Edge cases
+    #--------------------------------------------------------------------
+    {
+        name   => 'empty string — no truncation',
+        input  => '',
+        tmpl   => '[% text | truncate(10) %]',
+        expect => '',
+    },
+    {
+        name   => 'entity-only suffix with len=1',
+        input  => 'ABCDEFGHIJ',
+        tmpl   => '[% text | truncate(1, "&hellip;") %]',
+        # &hellip; is 1 visual char, len=1, so no room for text
+        expect => '&hellip;',
+    },
+    {
+        name   => 'suffix longer than len is clipped on an entity boundary',
+        input  => 'ABCDEFGHIJ',
+        tmpl   => '[% text | truncate(1, "&hellip;&hellip;") %]',
+        # suffix is 2 visual chars but len=1: keep one whole entity, not "&"
+        expect => '&hellip;',
+    },
+    {
+        name   => 'ampersand not part of entity — counts normally',
+        input  => 'Tom & Jerry go to the park and have fun',
+        tmpl   => '[% text | truncate(15, "...") %]',
+        # "Tom & Jerry " = 12 chars (no entity), "..." = 3 → 15
+        expect => 'Tom & Jerry ...',
+    },
+    {
+        name   => 'incomplete entity reference in input — not treated as CER',
+        input  => 'AB&notanentity CDEFGH',
+        tmpl   => '[% text | truncate(10, "...") %]',
+        # "&notanentity" without semicolon is NOT a CER, counts as individual chars
+        expect => 'AB&nota...',
+    },
+);
+
+plan tests => scalar @tests;
+
+for my $t (@tests) {
+    my $output = '';
+    $tt->process(\$t->{tmpl}, { text => $t->{input} }, \$output)
+        || die $tt->error();
+    is($output, $t->{expect}, $t->{name});
+}