From 741d1f9aa11c928c8feca66d63740ab98b0b81f1 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Mon, 9 Feb 2026 21:48:06 +0000 Subject: [PATCH 1/2] Support escaping of pipe characters within multi-valued slots. Currently, because the pipe character (`|`) is used to separate the different values of a multi-valued slot in the TSV serialisation, said values cannot contain a pipe character themselves. There is no real reason for the serialisation format to mandate such a restriction. This commit implements a mechanism that allows pipe characters to be encoded in the values of a multi-valued slot. Briefly, and only in the context of a multi-valued slot in TSV format: * a `\|` sequence (backslash + pipe) is interpreted as a `|` character that is part of the current value, NOT as the value separator; * a `\\` sequence (double backslash) is interpreted as a `\` character that is part of the current value; * any other occurrence of a backslash character is treated as a normal character that is part of the current value. When writing a multi-valued slot in TSV, the writer must: * escape any `|` character that is part of the value; * escape a `\` character that is part of the value iff (1) the character is followed by another backslash or a pipe, or (2) the character would be followed by the pipe used to separate values (which would happen if the backslash is the last character of any of the values in the slot except the last one). This commit also fixes a (slightly) unrelated bug in the way multi-valued slots are quoted: if any of the values of a multi-valued slot needs quoting, then it is the entire slot that must be quoted, NOT the individual value. 
--- .../incenp/obofoundry/sssom/TSVWriter.java | 140 ++++++++++++++---- .../obofoundry/sssom/YAMLConverter.java | 45 +++++- .../obofoundry/sssom/TSVReaderTest.java | 22 +++ .../obofoundry/sssom/TSVWriterTest.java | 19 +++ .../sets/test-escaping-pipe.sssom.tsv | 8 + .../sets/test-escaping-tsv.sssom.tsv | 4 +- 6 files changed, 207 insertions(+), 31 deletions(-) create mode 100644 core/src/test/resources/sets/test-escaping-pipe.sssom.tsv diff --git a/core/src/main/java/org/incenp/obofoundry/sssom/TSVWriter.java b/core/src/main/java/org/incenp/obofoundry/sssom/TSVWriter.java index d677116e..bbaa8925 100644 --- a/core/src/main/java/org/incenp/obofoundry/sssom/TSVWriter.java +++ b/core/src/main/java/org/incenp/obofoundry/sssom/TSVWriter.java @@ -32,7 +32,6 @@ import java.util.List; import java.util.Map; import java.util.Set; -import java.util.regex.Pattern; import org.incenp.obofoundry.sssom.model.ExtensionDefinition; import org.incenp.obofoundry.sssom.model.ExtensionValue; @@ -72,9 +71,6 @@ */ public class TSVWriter extends SSSOMWriter { - private static final Pattern tsvSpecialChars = Pattern.compile("[\t\n\r\"]"); - private static final Pattern csvSpecialChars = Pattern.compile("[,\n\r\"]"); - private BufferedWriter tsvWriter, metaWriter; private Set usedPrefixes = new HashSet(); private boolean isCSV = false; @@ -630,16 +626,7 @@ public void visit(StringSlot slot, Mapping object, List values) results.add(""); return; } - - StringBuilder sb = new StringBuilder(); - for ( int i = 0, n = values.size(); i < n; i++ ) { - String value = values.get(i); - sb.append(escapeTSV(value)); - if ( i < n - 1 ) { - sb.append('|'); - } - } - results.add(sb.toString()); + results.add(escapeTSV(values)); } @Override @@ -649,15 +636,7 @@ public void visit(EntityReferenceSlot slot, Mapping object, List slot, Mapping object, Mapindividual values here, it is the entire |-separated multivalue that + * must be quoted if any single value within it contains quote-triggering + * characters. 
+ */ + private String escapeTSV(List values) { + StringBuilder sb = new StringBuilder(); + boolean quotesNeeded = false; + int nValues = values.size(); + for ( int i = 0; i < nValues; i++ ) { + String value = values.get(i); + if ( i > 0 ) { + sb.append('|'); + } + + int len = value.length(); + for ( int j = 0; j < len; j++ ) { + char c = value.charAt(j); + switch ( c ) { + case ',': + if ( isCSV ) { + quotesNeeded = true; + } + break; + + case '\t': + if ( !isCSV ) { + quotesNeeded = true; + } + break; + + case '\r': + case '\n': + quotesNeeded = true; + break; + + case '"': + quotesNeeded = true; + sb.append('"'); + break; + + case '\\': + // The backslash needs escaping only if (1) it is followed by another backslash + // or a pipe, or (2) it is the last character of the current value and there are + // more values to follow. + if ( j < len - 1 ) { + char next = value.charAt(j + 1); + if ( next == '\\' || next == '|' ) { + sb.append('\\'); + } + } else if ( i < nValues - 1 ) { + sb.append('\\'); + } + break; + + case '|': + sb.append('\\'); + break; + } + sb.append(c); + } + } + + if ( quotesNeeded ) { + sb.insert(0, '"'); + sb.append('"'); + } + + return sb.toString(); } } } diff --git a/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java b/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java index 70b21295..4396eee8 100644 --- a/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java +++ b/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java @@ -400,7 +400,7 @@ private List getListOfStrings(String slotName, Object rawValue) throws S * they were single-valued, which is strictly speaking but happens in the wild * (including in the examples shown in the SSSOM documentation!). 
*/ - for ( String item : rawValue.toString().split("\\|") ) { + for ( String item : splitString(rawValue.toString()) ) { value.add(item); } } else { @@ -409,6 +409,49 @@ private List getListOfStrings(String slotName, Object rawValue) throws S return value; } + /* + * Splits a string along pipe (`|`) characters, with support for backslash + * escaping. + */ + private List splitString(String value) { + ArrayList list = new ArrayList<>(); + StringBuilder sb = new StringBuilder(); + int len = value.length(); + boolean escaped = false; + for ( int i = 0; i < len; i++ ) { + char c = value.charAt(i); + if ( escaped ) { + sb.append(c); + escaped = false; + } else if ( c == '\\' ) { + // The backslash is treated as an escape character only if it is followed by + // another backslash or a pipe. + if ( i < len - 1 ) { + char next = value.charAt(i + 1); + if ( next == '\\' || next == '|' ) { + escaped = true; + } + } + // Otherwise it is a normal character. + if ( !escaped ) { + sb.append(c); + } + } else if ( c == '|' ) { + if ( sb.length() > 0 ) { + list.add(sb.toString()); + sb.delete(0, sb.length()); + } + } else { + sb.append(c); + } + } + if ( sb.length() > 0 ) { + list.add(sb.toString()); + } + + return list; + } + /* * Parses the "extension_definitions" key. 
*/ diff --git a/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java b/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java index 89f39ff9..50351522 100644 --- a/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java +++ b/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java @@ -493,6 +493,28 @@ void testEscapedTSV() throws IOException, SSSOMFormatException { Assertions.assertEquals("Value\u0009with\u0009tab\u0009characters", m.getComment()); Assertions.assertEquals("Value with \"quote\" characters", m.getObjectLabel()); Assertions.assertEquals("Value with\nnew line character", m.getIssueTrackerItem()); + Assertions.assertEquals("Alice", m.getAuthorLabel().get(0)); + Assertions.assertEquals("Bob\tand\tCharlie", m.getAuthorLabel().get(1)); + } + + /* + * Test that the parser can handle escaped pipe characters in multi-valued + * slots. + */ + @Test + void testEscapedPipe() throws IOException, SSSOMFormatException { + TSVReader reader = new TSVReader("src/test/resources/sets/test-escaping-pipe.sssom.tsv"); + MappingSet ms = reader.read(); + Mapping m = ms.getMappings().get(0); + + Assertions.assertEquals("Alice|Bob", m.getAuthorLabel().get(0)); + Assertions.assertEquals("Charlie", m.getAuthorLabel().get(1)); + + m = ms.getMappings().get(1); + Assertions.assertEquals(3, m.getAuthorLabel().size()); + Assertions.assertEquals("Alice\\Bob", m.getAuthorLabel().get(0)); + Assertions.assertEquals("Charlie\\", m.getAuthorLabel().get(1)); + Assertions.assertEquals("David\\|Eve\\", m.getAuthorLabel().get(2)); } /* diff --git a/core/src/test/java/org/incenp/obofoundry/sssom/TSVWriterTest.java b/core/src/test/java/org/incenp/obofoundry/sssom/TSVWriterTest.java index 2fa0ca3c..19e9f19a 100644 --- a/core/src/test/java/org/incenp/obofoundry/sssom/TSVWriterTest.java +++ b/core/src/test/java/org/incenp/obofoundry/sssom/TSVWriterTest.java @@ -333,10 +333,29 @@ void testEscapingTSV() throws IOException, SSSOMFormatException { 
ms.getMappings().get(0).setComment("Value\twith\ttab\tcharacters"); ms.getMappings().get(0).setObjectLabel("Value with \"quote\" characters"); ms.getMappings().get(0).setIssueTrackerItem("Value with\nnew line character"); + ms.getMappings().get(0).getAuthorLabel(true).add("Alice"); + ms.getMappings().get(0).getAuthorLabel().add("Bob\tand\tCharlie"); assertWrittenAsExpected(ms, "test-escaping-tsv", null, null, null); } + @Test + void testEscapingPipeCharacter() throws IOException, SSSOMFormatException { + MappingSet ms = getTestSet(); + ms.setMappingSetId("https://example.org/sets/test-escaping-pipe"); + ms.getMappings().get(0).getAuthorLabel(true).add("Alice|Bob"); + ms.getMappings().get(0).getAuthorLabel().add("Charlie"); + + Mapping m2 = ms.getMappings().get(0).toBuilder().subjectId("https://example.org/entities/0002") + .authorLabel(new ArrayList<>()).build(); + m2.getAuthorLabel().add("Alice\\Bob"); + m2.getAuthorLabel().add("Charlie\\"); + m2.getAuthorLabel().add("David\\|Eve\\"); + ms.getMappings().add(m2); + + assertWrittenAsExpected(ms, "test-escaping-pipe", null, null, null); + } + @Test void testWritingEnumValuesInYAML() throws IOException, SSSOMFormatException { MappingSet ms = getTestSet(); diff --git a/core/src/test/resources/sets/test-escaping-pipe.sssom.tsv b/core/src/test/resources/sets/test-escaping-pipe.sssom.tsv new file mode 100644 index 00000000..54b49396 --- /dev/null +++ b/core/src/test/resources/sets/test-escaping-pipe.sssom.tsv @@ -0,0 +1,8 @@ +#curie_map: +# COMENT: https://example.com/entities/ +# ORGENT: https://example.org/entities/ +#mapping_set_id: https://example.org/sets/test-escaping-pipe +#license: https://creativecommons.org/licenses/by/4.0/ +subject_id subject_label predicate_id object_id object_label mapping_justification author_label +ORGENT:0001 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration Alice\|Bob|Charlie +ORGENT:0002 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration 
Alice\Bob|Charlie\\|David\\\|Eve\ diff --git a/core/src/test/resources/sets/test-escaping-tsv.sssom.tsv b/core/src/test/resources/sets/test-escaping-tsv.sssom.tsv index 47a5174e..9a7e9a72 100644 --- a/core/src/test/resources/sets/test-escaping-tsv.sssom.tsv +++ b/core/src/test/resources/sets/test-escaping-tsv.sssom.tsv @@ -3,6 +3,6 @@ # ORGENT: https://example.org/entities/ #mapping_set_id: https://example.org/sets/test-escaping-tsv #license: https://creativecommons.org/licenses/by/4.0/ -subject_id subject_label predicate_id object_id object_label mapping_justification issue_tracker_item comment -ORGENT:0001 Value with , characters skos:closeMatch COMENT:0011 "Value with ""quote"" characters" semapv:ManualMappingCuration "Value with +subject_id subject_label predicate_id object_id object_label mapping_justification author_label issue_tracker_item comment +ORGENT:0001 Value with , characters skos:closeMatch COMENT:0011 "Value with ""quote"" characters" semapv:ManualMappingCuration "Alice|Bob and Charlie" "Value with new line character" "Value with tab characters" From 4cc453584f5a49cf7f93350fc23997825b2f436e Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Tue, 10 Feb 2026 19:44:33 +0000 Subject: [PATCH 2/2] Disable support for pipe escaping in SSSOM 1.0 mode. When reading a set in SSSOM 1.0 compliance mode, we should not try to interpret a `\|` sequence as a literal `|` character, since the escaping mechanism was not described in version 1.0 of the spec. 
--- .../org/incenp/obofoundry/sssom/YAMLConverter.java | 4 +++- .../org/incenp/obofoundry/sssom/TSVReaderTest.java | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java b/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java index 4396eee8..16eed27a 100644 --- a/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java +++ b/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java @@ -56,6 +56,7 @@ public class YAMLConverter { private ExtensionSlotManager extensionManager; private ExtraMetadataPolicy extraPolicy = ExtraMetadataPolicy.NONE; private Version assumedVersion = Version.SSSOM_1_0; + private boolean supportEscapedPipes = false; /** * Creates a new YAML converter. @@ -184,6 +185,7 @@ public MappingSet convertMappingSet(Map rawMap) throws SSSOMForm if ( version == Version.UNKNOWN ) { version = Version.LATEST; } + supportEscapedPipes = version != Version.SSSOM_1_0; // Process the CURIE map, so that we can expand CURIEs as soon as possible Object rawCurieMap = rawMap.getOrDefault("curie_map", new HashMap()); @@ -423,7 +425,7 @@ private List splitString(String value) { if ( escaped ) { sb.append(c); escaped = false; - } else if ( c == '\\' ) { + } else if ( c == '\\' && supportEscapedPipes ) { // The backslash is treated as an escape character only if it is followed by // another backslash or a pipe. 
if ( i < len - 1 ) { diff --git a/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java b/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java index 50351522..c484690a 100644 --- a/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java +++ b/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java @@ -504,6 +504,7 @@ void testEscapedTSV() throws IOException, SSSOMFormatException { @Test void testEscapedPipe() throws IOException, SSSOMFormatException { TSVReader reader = new TSVReader("src/test/resources/sets/test-escaping-pipe.sssom.tsv"); + reader.setAssumedVersion(Version.SSSOM_1_1); MappingSet ms = reader.read(); Mapping m = ms.getMappings().get(0); @@ -515,6 +516,15 @@ void testEscapedPipe() throws IOException, SSSOMFormatException { Assertions.assertEquals("Alice\\Bob", m.getAuthorLabel().get(0)); Assertions.assertEquals("Charlie\\", m.getAuthorLabel().get(1)); Assertions.assertEquals("David\\|Eve\\", m.getAuthorLabel().get(2)); + + // Try again in SSSOM 1.0 compliance mode; there should be no escaping + reader = new TSVReader("src/test/resources/sets/test-escaping-pipe.sssom.tsv"); + reader.setAssumedVersion(Version.SSSOM_1_0); + m = reader.read().getMappings().get(0); + + Assertions.assertEquals("Alice\\", m.getAuthorLabel().get(0)); + Assertions.assertEquals("Bob", m.getAuthorLabel().get(1)); + Assertions.assertEquals("Charlie", m.getAuthorLabel().get(2)); } /*