diff --git a/core/src/main/java/org/incenp/obofoundry/sssom/TSVWriter.java b/core/src/main/java/org/incenp/obofoundry/sssom/TSVWriter.java index d677116..bbaa892 100644 --- a/core/src/main/java/org/incenp/obofoundry/sssom/TSVWriter.java +++ b/core/src/main/java/org/incenp/obofoundry/sssom/TSVWriter.java @@ -32,7 +32,6 @@ import java.util.List; import java.util.Map; import java.util.Set; -import java.util.regex.Pattern; import org.incenp.obofoundry.sssom.model.ExtensionDefinition; import org.incenp.obofoundry.sssom.model.ExtensionValue; @@ -72,9 +71,6 @@ */ public class TSVWriter extends SSSOMWriter { - private static final Pattern tsvSpecialChars = Pattern.compile("[\t\n\r\"]"); - private static final Pattern csvSpecialChars = Pattern.compile("[,\n\r\"]"); - private BufferedWriter tsvWriter, metaWriter; private Set usedPrefixes = new HashSet(); private boolean isCSV = false; @@ -630,16 +626,7 @@ public void visit(StringSlot slot, Mapping object, List values) results.add(""); return; } - - StringBuilder sb = new StringBuilder(); - for ( int i = 0, n = values.size(); i < n; i++ ) { - String value = values.get(i); - sb.append(escapeTSV(value)); - if ( i < n - 1 ) { - sb.append('|'); - } - } - results.add(sb.toString()); + results.add(escapeTSV(values)); } @Override @@ -649,15 +636,7 @@ public void visit(EntityReferenceSlot slot, Mapping object, List slot, Mapping object, Mapindividual values here, it is the entire |-separated multivalue that + * must be quoted if any single value within it contains quote-triggering + * characters. + */ + private String escapeTSV(List values) { + StringBuilder sb = new StringBuilder(); + boolean quotesNeeded = false; + int nValues = values.size(); + for ( int i = 0; i < nValues; i++ ) { + String value = values.get(i); + if ( i > 0 ) { + sb.append('|'); + } + + int len = value.length(); + for ( int j = 0; j < len; j++ ) { + char c = value.charAt(j); + switch ( c ) { + case ',': + if ( isCSV ) { + quotesNeeded = true; + } + break; + + case '\t': + if ( !isCSV ) { + quotesNeeded = true; + } + break; + + case '\r': + case '\n': + quotesNeeded = true; + break; + + case '"': + quotesNeeded = true; + sb.append('"'); + break; + + case '\\': + // The backslash needs escaping only if (1) it is followed by another backslash + // or a pipe, or (2) it is the last character of the current value and there are + // more values to follow. + if ( j < len - 1 ) { + char next = value.charAt(j + 1); + if ( next == '\\' || next == '|' ) { + sb.append('\\'); + } + } else if ( i < nValues - 1 ) { + sb.append('\\'); + } + break; + + case '|': + sb.append('\\'); + break; + } + sb.append(c); + } + } + + if ( quotesNeeded ) { + sb.insert(0, '"'); + sb.append('"'); + } + + return sb.toString(); } } } diff --git a/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java b/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java index 70b2129..16eed27 100644 --- a/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java +++ b/core/src/main/java/org/incenp/obofoundry/sssom/YAMLConverter.java @@ -56,6 +56,7 @@ public class YAMLConverter { private ExtensionSlotManager extensionManager; private ExtraMetadataPolicy extraPolicy = ExtraMetadataPolicy.NONE; private Version assumedVersion = Version.SSSOM_1_0; + private boolean supportEscapedPipes = false; /** * Creates a new YAML converter. @@ -184,6 +185,7 @@ public MappingSet convertMappingSet(Map rawMap) throws SSSOMForm if ( version == Version.UNKNOWN ) { version = Version.LATEST; } + supportEscapedPipes = version != Version.SSSOM_1_0; // Process the CURIE map, so that we can expand CURIEs as soon as possible Object rawCurieMap = rawMap.getOrDefault("curie_map", new HashMap()); @@ -400,7 +402,7 @@ private List getListOfStrings(String slotName, Object rawValue) throws S * they were single-valued, which is strictly speaking but happens in the wild * (including in the examples shown in the SSSOM documentation!). */ - for ( String item : rawValue.toString().split("\\|") ) { + for ( String item : splitString(rawValue.toString()) ) { value.add(item); } } else { @@ -409,6 +411,49 @@ private List getListOfStrings(String slotName, Object rawValue) throws S return value; } + /* + * Splits a string along pipe (`|`) characters, with support for backslash + * escaping. + */ + private List splitString(String value) { + ArrayList list = new ArrayList<>(); + StringBuilder sb = new StringBuilder(); + int len = value.length(); + boolean escaped = false; + for ( int i = 0; i < len; i++ ) { + char c = value.charAt(i); + if ( escaped ) { + sb.append(c); + escaped = false; + } else if ( c == '\\' && supportEscapedPipes ) { + // The backslash is treated as an escape character only if it is followed by + // another backslash or a pipe. + if ( i < len - 1 ) { + char next = value.charAt(i + 1); + if ( next == '\\' || next == '|' ) { + escaped = true; + } + } + // Otherwise it is a normal character. + if ( !escaped ) { + sb.append(c); + } + } else if ( c == '|' ) { + if ( sb.length() > 0 ) { + list.add(sb.toString()); + sb.delete(0, sb.length()); + } + } else { + sb.append(c); + } + } + if ( sb.length() > 0 ) { + list.add(sb.toString()); + } + + return list; + } + /* * Parses the "extension_definitions" key. */ diff --git a/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java b/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java index 89f39ff..c484690 100644 --- a/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java +++ b/core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java @@ -493,6 +493,38 @@ void testEscapedTSV() throws IOException, SSSOMFormatException { Assertions.assertEquals("Value\u0009with\u0009tab\u0009characters", m.getComment()); Assertions.assertEquals("Value with \"quote\" characters", m.getObjectLabel()); Assertions.assertEquals("Value with\nnew line character", m.getIssueTrackerItem()); + Assertions.assertEquals("Alice", m.getAuthorLabel().get(0)); + Assertions.assertEquals("Bob\tand\tCharlie", m.getAuthorLabel().get(1)); + } + + /* + * Test that the parser can handle escaped pipe characters in multi-valued + * slots. + */ + @Test + void testEscapedPipe() throws IOException, SSSOMFormatException { + TSVReader reader = new TSVReader("src/test/resources/sets/test-escaping-pipe.sssom.tsv"); + reader.setAssumedVersion(Version.SSSOM_1_1); + MappingSet ms = reader.read(); + Mapping m = ms.getMappings().get(0); + + Assertions.assertEquals("Alice|Bob", m.getAuthorLabel().get(0)); + Assertions.assertEquals("Charlie", m.getAuthorLabel().get(1)); + + m = ms.getMappings().get(1); + Assertions.assertEquals(3, m.getAuthorLabel().size()); + Assertions.assertEquals("Alice\\Bob", m.getAuthorLabel().get(0)); + Assertions.assertEquals("Charlie\\", m.getAuthorLabel().get(1)); + Assertions.assertEquals("David\\|Eve\\", m.getAuthorLabel().get(2)); + + // Try again in SSSOM 1.0 compliance mode; there should be no escaping + reader = new TSVReader("src/test/resources/sets/test-escaping-pipe.sssom.tsv"); + reader.setAssumedVersion(Version.SSSOM_1_0); + m = reader.read().getMappings().get(0); + + Assertions.assertEquals("Alice\\", m.getAuthorLabel().get(0)); + Assertions.assertEquals("Bob", m.getAuthorLabel().get(1)); + Assertions.assertEquals("Charlie", m.getAuthorLabel().get(2)); } /* diff --git a/core/src/test/java/org/incenp/obofoundry/sssom/TSVWriterTest.java b/core/src/test/java/org/incenp/obofoundry/sssom/TSVWriterTest.java index 2fa0ca3..19e9f19 100644 --- a/core/src/test/java/org/incenp/obofoundry/sssom/TSVWriterTest.java +++ b/core/src/test/java/org/incenp/obofoundry/sssom/TSVWriterTest.java @@ -333,10 +333,29 @@ void testEscapingTSV() throws IOException, SSSOMFormatException { ms.getMappings().get(0).setComment("Value\twith\ttab\tcharacters"); ms.getMappings().get(0).setObjectLabel("Value with \"quote\" characters"); ms.getMappings().get(0).setIssueTrackerItem("Value with\nnew line character"); + ms.getMappings().get(0).getAuthorLabel(true).add("Alice"); + ms.getMappings().get(0).getAuthorLabel().add("Bob\tand\tCharlie"); assertWrittenAsExpected(ms, "test-escaping-tsv", null, null, null); } + @Test + void testEscapingPipeCharacter() throws IOException, SSSOMFormatException { + MappingSet ms = getTestSet(); + ms.setMappingSetId("https://example.org/sets/test-escaping-pipe"); + ms.getMappings().get(0).getAuthorLabel(true).add("Alice|Bob"); + ms.getMappings().get(0).getAuthorLabel().add("Charlie"); + + Mapping m2 = ms.getMappings().get(0).toBuilder().subjectId("https://example.org/entities/0002") + .authorLabel(new ArrayList<>()).build(); + m2.getAuthorLabel().add("Alice\\Bob"); + m2.getAuthorLabel().add("Charlie\\"); + m2.getAuthorLabel().add("David\\|Eve\\"); + ms.getMappings().add(m2); + + assertWrittenAsExpected(ms, "test-escaping-pipe", null, null, null); + } + @Test void testWritingEnumValuesInYAML() throws IOException, SSSOMFormatException { MappingSet ms = getTestSet(); diff --git a/core/src/test/resources/sets/test-escaping-pipe.sssom.tsv b/core/src/test/resources/sets/test-escaping-pipe.sssom.tsv new file mode 100644 index 0000000..54b4939 --- /dev/null +++ b/core/src/test/resources/sets/test-escaping-pipe.sssom.tsv @@ -0,0 +1,8 @@ +#curie_map: +# COMENT: https://example.com/entities/ +# ORGENT: https://example.org/entities/ +#mapping_set_id: https://example.org/sets/test-escaping-pipe +#license: https://creativecommons.org/licenses/by/4.0/ +subject_id subject_label predicate_id object_id object_label mapping_justification author_label +ORGENT:0001 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration Alice\|Bob|Charlie +ORGENT:0002 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration Alice\Bob|Charlie\\|David\\\|Eve\ diff --git a/core/src/test/resources/sets/test-escaping-tsv.sssom.tsv b/core/src/test/resources/sets/test-escaping-tsv.sssom.tsv index 47a5174..9a7e9a7 100644 --- a/core/src/test/resources/sets/test-escaping-tsv.sssom.tsv +++ b/core/src/test/resources/sets/test-escaping-tsv.sssom.tsv @@ -3,6 +3,6 @@ # ORGENT: https://example.org/entities/ #mapping_set_id: https://example.org/sets/test-escaping-tsv #license: https://creativecommons.org/licenses/by/4.0/ -subject_id subject_label predicate_id object_id object_label mapping_justification issue_tracker_item comment -ORGENT:0001 Value with , characters skos:closeMatch COMENT:0011 "Value with ""quote"" characters" semapv:ManualMappingCuration "Value with +subject_id subject_label predicate_id object_id object_label mapping_justification author_label issue_tracker_item comment +ORGENT:0001 Value with , characters skos:closeMatch COMENT:0011 "Value with ""quote"" characters" semapv:ManualMappingCuration "Alice|Bob and Charlie" "Value with new line character" "Value with tab characters"