diff --git a/lib/Template/Filters.pm b/lib/Template/Filters.pm index deeb2f20..be8bb6b8 100644 --- a/lib/Template/Filters.pm +++ b/lib/Template/Filters.pm @@ -521,8 +521,9 @@ sub truncate_filter_factory { $len = $TRUNCATE_LENGTH unless defined $len; $char = $TRUNCATE_ADDON unless defined $char; - # Length of char is the minimum length - my $lchar = length $char; + # Calculate visual length of the addon, treating HTML character entity + # references (e.g. &hellip; &#8230; &#x2026;) as single characters + my $lchar = _visual_length($char); if ($len < $lchar) { - $char = substr($char, 0, $len); + $char = _truncate_visual($char, $len); $lchar = $len; @@ -530,11 +531,45 @@ sub truncate_filter_factory { return sub { my $text = shift; - return $text if length $text <= $len; - return substr($text, 0, $len - $lchar) . $char; + return $text if _visual_length($text) <= $len; + return _truncate_visual($text, $len - $lchar) . $char; + } +} + + +#------------------------------------------------------------------------ +# _visual_length($str) +# +# Returns the "visual" length of a string, counting each HTML character +# entity reference (&name; &#digits; &#xhex;) as a single character. +#------------------------------------------------------------------------ + +sub _visual_length { + my $str = shift; + (my $copy = $str) =~ s/&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);/_/g; + return length $copy; +} + + +#------------------------------------------------------------------------ +# _truncate_visual($str, $maxlen) +# +# Truncates $str to at most $maxlen visual characters, treating HTML +# character entity references as single characters and never splitting +# one in the middle. 
+#------------------------------------------------------------------------ +sub _truncate_visual { + my ($str, $maxlen) = @_; + my $result = ''; + my $vlen = 0; + while ($str =~ /\G(&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);|.)/gs) { + last if $vlen >= $maxlen; + $result .= $1; + $vlen++; } + return $result; } diff --git a/lib/Template/Manual/Filters.pod b/lib/Template/Manual/Filters.pod index 6c9fb571..1f5b0b7c 100644 --- a/lib/Template/Manual/Filters.pod +++ b/lib/Template/Manual/Filters.pod @@ -319,14 +319,19 @@ Output: If you want to use something other than 'C<...>' you can pass that as a second argument. - [% FILTER truncate(26, '&hellip;') %] + [% FILTER truncate(27, '&hellip;') %] I have much to say on this matter that has previously been said on more than one occasion. [% END %] Output: - I have much to say&hellip; + I have much to say on this&hellip; + +HTML character entity references (e.g. C<&hellip;>, C<&#8230;>, C<&#x2026;>) +in the suffix are counted by their visual width (one character each), not by +their string length. Entity references in the input text are also treated as +single characters, so they will not be split in the middle. =head1 repeat(iterations) diff --git a/t/truncate_cer.t b/t/truncate_cer.t new file mode 100644 index 00000000..bc1f02e9 --- /dev/null +++ b/t/truncate_cer.t @@ -0,0 +1,187 @@ +#============================================================= -*-perl-*- +# +# t/truncate_cer.t +# +# Tests for truncate filter with HTML Character Entity Reference awareness. +# Ensures that CERs like &hellip; &#8230; &#x2026; in the suffix (and in +# the input text) are counted as single visual characters. 
+# +# Related: https://github.com/abw/Template2/pull/188 +# RT#95707 +# +#======================================================================== + +use strict; +use warnings; +use lib qw( ./lib ../lib ); +use Template; +use Template::Filters; +use Test::More; + +my $tt = Template->new({ INTERPOLATE => 0 }); + +my @tests = ( + #-------------------------------------------------------------------- + # Named entity in suffix (&hellip;) + #-------------------------------------------------------------------- + { + name => 'named entity suffix: &hellip; counts as 1 char', + input => 'I have much to say on this matter that has previously been said.', + tmpl => '[% text | truncate(27, "&hellip;") %]', + expect => 'I have much to say on this&hellip;', + }, + { + name => 'named entity suffix: &amp; counts as 1 char', + input => 'The quick brown fox jumps over the lazy dog.', + tmpl => '[% text | truncate(15, "&amp;more") %]', + # suffix visual length: &amp; (1) + more (4) = 5 + # text visual chars: 15 - 5 = 10 → "The quick " + expect => 'The quick &amp;more', + }, + + #-------------------------------------------------------------------- + # Numeric (decimal) entity in suffix (&#8230;) + #-------------------------------------------------------------------- + { + name => 'decimal entity suffix: &#8230; counts as 1 char', + input => 'I have much to say on this matter that has previously been said.', + tmpl => '[% text | truncate(27, "&#8230;") %]', + expect => 'I have much to say on this&#8230;', + }, + + #-------------------------------------------------------------------- + # Numeric (hex) entity in suffix (&#x2026;) + #-------------------------------------------------------------------- + { + name => 'hex entity suffix: &#x2026; counts as 1 char', + input => 'I have much to say on this matter that has previously been said.', + tmpl => '[% text | truncate(27, "&#x2026;") %]', + expect => 'I have much to say on this&#x2026;', + }, + + #-------------------------------------------------------------------- + # Plain suffix (no entities) — regression check + 
#-------------------------------------------------------------------- + { + name => 'plain suffix: ... still works (3 chars)', + input => 'The cat sat on the mat and wondered.', + tmpl => '[% text | truncate(10) %]', + expect => 'The cat...', + }, + { + name => 'no truncation needed', + input => 'Short', + tmpl => '[% text | truncate(10) %]', + expect => 'Short', + }, + { + name => 'exact length — no truncation', + input => 'Hello World', + tmpl => '[% text | truncate(11) %]', + expect => 'Hello World', + }, + { + name => 'len less than suffix — suffix itself truncated', + input => 'Hello World', + tmpl => '[% text | truncate(2) %]', + expect => '..', + }, + + #-------------------------------------------------------------------- + # Multiple entities in suffix + #-------------------------------------------------------------------- + { + name => 'two entities in suffix count as 2 chars', + input => 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', + tmpl => '[% text | truncate(10, "&lt;&gt;") %]', + expect => 'ABCDEFGH&lt;&gt;', + }, + + #-------------------------------------------------------------------- + # Mixed plain + entity chars in suffix + #-------------------------------------------------------------------- + { + name => 'mixed suffix: "...&hellip;" counts as 4 chars (3 dots + 1 entity)', + input => 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', + tmpl => '[% text | truncate(10, "...&hellip;") %]', + expect => 'ABCDEF...&hellip;', + }, + + #-------------------------------------------------------------------- + # Entity in input text — should not be split + #-------------------------------------------------------------------- + { + name => 'entity in input text not split mid-reference', + input => 'AB&amp;CDEFGHIJ', + tmpl => '[% text | truncate(5, "...") %]', + # visual: A B &amp; C D E F G H I J = 11 visual chars + # truncate to 5 visual: 2 text + "..." = 5? No: 5 - 3 = 2 visual chars of text + # visual chars: A, B → "AB..." 
+ expect => 'AB...', + }, + { + name => 'entity in input text counted as 1 visual char', + input => 'A&amp;B&lt;C&gt;DEFGHIJKLMNO', + tmpl => '[% text | truncate(6, "...") %]', + # visual: A &amp; B &lt; C &gt; D E F G H I J K L M N O = 18 visual chars + # truncate to 6: 6 - 3 = 3 visual chars of text + # A, &amp;, B → "A&amp;B..." + expect => 'A&amp;B...', + }, + { + name => 'input with entity exactly at boundary', + input => 'ABCD&hellip;FGHIJ', + tmpl => '[% text | truncate(7, "...") %]', + # visual: A B C D &hellip; F G H I J = 10 visual chars + # truncate to 7: 7 - 3 = 4 visual chars of text + # A, B, C, D → "ABCD..." + expect => 'ABCD...', + }, + { + name => 'input short enough with entities — no truncation', + input => 'A&amp;B', + tmpl => '[% text | truncate(10) %]', + # visual length = 3, less than 10 + expect => 'A&amp;B', + }, + + #-------------------------------------------------------------------- + # Edge cases + #-------------------------------------------------------------------- + { + name => 'empty string — no truncation', + input => '', + tmpl => '[% text | truncate(10) %]', + expect => '', + }, + { + name => 'entity-only suffix with len=1', + input => 'ABCDEFGHIJ', + tmpl => '[% text | truncate(1, "&hellip;") %]', + # &hellip; is 1 visual char, len=1, so no room for text + expect => '&hellip;', + }, + { + name => 'ampersand not part of entity — counts normally', + input => 'Tom & Jerry go to the park and have fun', + tmpl => '[% text | truncate(15, "...") %]', + # "Tom & Jerry " = 12 chars (no entity), "..." = 3 → 15 + expect => 'Tom & Jerry ...', + }, + { + name => 'incomplete entity reference in input — not treated as CER', + input => 'AB&notanentity CDEFGH', + tmpl => '[% text | truncate(10, "...") %]', + # "&notanentity" without semicolon is NOT a CER, counts as individual chars + expect => 'AB&nota...', + }, +); + +plan tests => scalar @tests; + +for my $t (@tests) { + my $output = ''; + $tt->process(\$t->{tmpl}, { text => $t->{input} }, \$output) + || die $tt->error(); + is($output, $t->{expect}, $t->{name}); +}