diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6df7bd1..01f5298 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -16,17 +16,22 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Set up Scala uses: actions/setup-java@v4 with: distribution: temurin - java-version: 11 + java-version: 21 cache: sbt + + - uses: sbt/setup-sbt@v1 + - name: Show output of sed to build.sbt command for logs run: | VERS_TAG=$(echo $GIT_REF | sed 's/refs\/tags\/v//g') VERS_TAG="\"$VERS_TAG"\" sed "s/version := *.*/version := $VERS_TAG/" build.sbt + - name: Write Tag to Scala Build Version run: | VERS_TAG=$(echo $GIT_REF | sed 's/refs\/tags\/v//g') @@ -44,11 +49,15 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Submit SBT dependencies to the GitHub dependency graph. + - name: Sbt Dependency Submission + uses: scalacenter/sbt-dependency-submission@v3.1.0 + - name: Build Universal Artifact run: sbt Universal/packageBin - name: Archive package and documentation - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: csvw-check-universal path: target/universal/csvw-check-*.zip @@ -68,7 +77,7 @@ jobs: rel_cand_substring='rc' vers_num_lower_case=${vers_num,,} - local_param="csvwcheck:$vers_num" + local_param="csvw-check:$vers_num" remote_param="gsscogs/csvw-check" push_param="$remote_param:v$vers_num" latest_param="$remote_param:latest" diff --git a/.github/workflows/reusable-test.yaml b/.github/workflows/reusable-test.yaml index b1b5e1e..4400596 100644 --- a/.github/workflows/reusable-test.yaml +++ b/.github/workflows/reusable-test.yaml @@ -10,7 +10,7 @@ jobs: test_in_environments: runs-on: ${{ inputs.os }} container: - image: sbtscala/scala-sbt:graalvm-ce-22.3.0-b2-java17_1.8.3_2.13.10 + image: sbtscala/scala-sbt:graalvm-ce-22.3.3-b1-java17_1.10.7_3.6.3 steps: - uses: actions/checkout@v3 with: @@ -24,7 +24,7 @@ jobs: - name: Archive test results from xml files if: always() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ inputs.os }} test results path: | @@ -35,7 +35,7 @@ jobs: runs-on: ubuntu-latest if: always() steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: name: ${{ inputs.os }} test results diff --git a/build.sbt b/build.sbt index ccc7fbf..32ecf9c 100644 --- a/build.sbt +++ b/build.sbt @@ -1,43 +1,48 @@ +import com.typesafe.sbt.packager.docker.Cmd + name := "csvw-check" organization := "io.github.gss-cogs" version := "0.0.3" maintainer := "csvcubed@gsscogs.uk" -scalaVersion := "2.13.4" +scalaVersion := "2.13.16" scalacOptions ++= Seq("-deprecation", "-feature") autoCompilerPlugins := true enablePlugins(JavaAppPackaging) -enablePlugins(DockerPlugin) enablePlugins(UniversalPlugin) +enablePlugins(DockerPlugin) +enablePlugins(AshScriptPlugin) -dockerBaseImage := "openjdk:11" -dockerEntrypoint := Seq("bash") +dockerBaseImage := "eclipse-temurin:23-jre-alpine" +dockerEntrypoint := Seq("/opt/docker/bin/csvw-check") dockerEnvVars := Map("PATH" -> "$PATH:/opt/docker/bin") -Docker / packageName := "csvwcheck" +Docker / packageName := "csvw-check" -libraryDependencies += "io.cucumber" %% "cucumber-scala" % "8.14.1" % Test -libraryDependencies += "io.cucumber" % "cucumber-junit" % "7.11.1" % Test +libraryDependencies += "io.cucumber" %% "cucumber-scala" % "8.26.1" % Test +libraryDependencies += "io.cucumber" % "cucumber-junit" % "7.21.1" % Test libraryDependencies += "com.novocode" % "junit-interface" % "0.11" % Test -libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.15" % Test +libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.19" % Test libraryDependencies += "com.github.pathikrit" %% "better-files" % "3.9.2" % Test libraryDependencies += "org.scala-lang.modules" %% "scala-parser-combinators" % "1.1.2" libraryDependencies += "io.spray" %% "spray-json" % "1.3.6" -libraryDependencies += "org.apache.jena" % "jena-arq" % "4.4.0" -libraryDependencies += "joda-time" % "joda-time" % "2.12.2" +libraryDependencies += "org.apache.jena" % "jena-arq" % "5.3.0" +libraryDependencies += "joda-time" % "joda-time" % "2.13.1" libraryDependencies += "com.github.scopt" %% "scopt" % "4.1.0" +// Past version 2.6.21 AKKA starts requiring a license key which doesn't fit the use-case of this OSS project at all. +// Unfortunately it looks like we will want to stop using AKKA entirely, which would require quite a bit of work. libraryDependencies += "com.typesafe.akka" %% "akka-stream" % "2.6.21" -libraryDependencies += "ch.qos.logback" % "logback-classic" % "1.5.3" +libraryDependencies += "ch.qos.logback" % "logback-classic" % "1.5.16" libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging" % "3.9.5" -libraryDependencies += "com.fasterxml.jackson.core" % "jackson-databind" % "2.14.2" -libraryDependencies += "com.fasterxml.jackson.core" % "jackson-annotations" % "2.14.2" -libraryDependencies += "com.fasterxml.jackson.core" % "jackson-core" % "2.14.2" -libraryDependencies += "com.softwaremill.sttp.client3" %% "core" % "3.8.15" -libraryDependencies += "com.ibm.icu" % "icu4j" % "72.1" -libraryDependencies += "org.apache.commons" % "commons-csv" % "1.10.0" -libraryDependencies += "com.chuusai" %% "shapeless" % "2.3.10" +libraryDependencies += "com.fasterxml.jackson.core" % "jackson-databind" % "2.18.2" +libraryDependencies += "com.fasterxml.jackson.core" % "jackson-annotations" % "2.18.2" +libraryDependencies += "com.fasterxml.jackson.core" % "jackson-core" % "2.18.2" +libraryDependencies += "com.softwaremill.sttp.client3" %% "core" % "3.10.3" +libraryDependencies += "com.ibm.icu" % "icu4j" % "76.1" +libraryDependencies += "org.apache.commons" % "commons-csv" % "1.13.0" +libraryDependencies += "com.chuusai" %% "shapeless" % "2.3.12" publishTo := Some("GitHub Maven package repo for GSS-Cogs" at "https://maven.pkg.github.com/gss-cogs/csvw-check") publishMavenStyle := true @@ -50,4 +55,4 @@ credentials += Credentials( organizationName := "Crown Copyright (Office for National Statistics)" startYear := Some(2020) -licenses += ("Apache-2.0", new URL("https://www.apache.org/licenses/LICENSE-2.0.txt")) \ No newline at end of file +licenses += ("Apache-2.0", new URI("https://www.apache.org/licenses/LICENSE-2.0.txt").toURL) \ No newline at end of file diff --git a/project/build.properties b/project/build.properties index e64c208..73df629 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=1.5.8 +sbt.version=1.10.7 diff --git a/project/plugins.sbt b/project/plugins.sbt index 05e65d8..e8e0708 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,2 +1,2 @@ addSbtPlugin("org.jetbrains" % "sbt-ide-settings" % "1.1.0") -addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.8.0") \ No newline at end of file +addSbtPlugin("com.github.sbt" % "sbt-native-packager" % "1.11.1") diff --git a/src/main/scala/csvwcheck/ConfiguredObjectMapper.scala b/src/main/scala/csvwcheck/ConfiguredObjectMapper.scala index 1380d82..11892f8 100644 --- a/src/main/scala/csvwcheck/ConfiguredObjectMapper.scala +++ b/src/main/scala/csvwcheck/ConfiguredObjectMapper.scala @@ -16,14 +16,19 @@ package csvwcheck -import com.fasterxml.jackson.databind.node.JsonNodeFactory +import com.fasterxml.jackson.databind.cfg.JsonNodeFeature import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} object ConfiguredObjectMapper { val objectMapper = new ObjectMapper() - objectMapper.setNodeFactory(JsonNodeFactory.withExactBigDecimals(true)) + objectMapper.configure( - DeserializationFeature.USE_BIG_DECIMAL_FOR_FLOATS, + JsonNodeFeature.STRIP_TRAILING_BIGDECIMAL_ZEROES, true ) + + objectMapper.configure( + DeserializationFeature.USE_BIG_DECIMAL_FOR_FLOATS, + true + ) } diff --git a/src/main/scala/csvwcheck/Main.scala b/src/main/scala/csvwcheck/Main.scala index 0aebb55..1be41a8 100644 --- a/src/main/scala/csvwcheck/Main.scala +++ b/src/main/scala/csvwcheck/Main.scala @@ -69,7 +69,6 @@ object Main extends App { implicit val actorSystem: ActorSystem = ActorSystem("actor-system") - val numParallelThreads: Int = sys.env.get("PARALLELISM") match { case Some(value) => value.toInt case None => Runtime.getRuntime.availableProcessors() @@ -116,6 +115,14 @@ object Main extends App { val rootLogger = Logger("ROOT") val underlyingLogger = rootLogger.underlying.asInstanceOf[ch.qos.logback.classic.Logger] underlyingLogger.setLevel(logLevel) + + val akkaLogLevel = logLevel match { + case Level.WARN => "WARNING" + case Level.TRACE => "DEBUG" + case Level.OFF|Level.INFO|Level.DEBUG|Level.ERROR => logLevel.toString + } + System.setProperty("akka.loglevel", akkaLogLevel) + rootLogger } @@ -123,14 +130,20 @@ object Main extends App { errorMessage: MessageWithCsvContext ): String = { val message = new StringBuilder() + + message.append(s"Type: ${errorMessage.`type`}") + + errorMessage.csvFilePath + .foreach(csvFilePath => message.append(s" in CSV '$csvFilePath'")) + if (errorMessage.row.nonEmpty) { - message.append(s"Row: ${errorMessage.row}$newLine") + message.append(s", Row: ${errorMessage.row}") } if (errorMessage.column.nonEmpty) { - message.append(s", Column: ${errorMessage.column}$newLine") + message.append(s", Column: '${errorMessage.column}'") } if (errorMessage.content.nonEmpty) { - message.append(s": ${errorMessage.content}$newLine") + message.append(s"$newLine${errorMessage.content}$newLine") } message.toString() diff --git a/src/main/scala/csvwcheck/Validator.scala b/src/main/scala/csvwcheck/Validator.scala index 63803d8..16d0243 100644 --- a/src/main/scala/csvwcheck/Validator.scala +++ b/src/main/scala/csvwcheck/Validator.scala @@ -53,10 +53,8 @@ class Validator( val metadataJsonLocation = csvwLinkHeaderRegEx.replaceAllIn(header, "$1") // Now make the URL absolute if it isn't already. - new URL( - new URL(getUriWithoutQueryString(csvUri).toString), - metadataJsonLocation - ).toURI + getUriWithoutQueryString(csvUri) + .resolve(metadataJsonLocation) }) } else { None diff --git a/src/main/scala/csvwcheck/errors/MessageWithCsvContext.scala b/src/main/scala/csvwcheck/errors/MessageWithCsvContext.scala index 2cb3751..ed3a0b1 100644 --- a/src/main/scala/csvwcheck/errors/MessageWithCsvContext.scala +++ b/src/main/scala/csvwcheck/errors/MessageWithCsvContext.scala @@ -31,6 +31,8 @@ abstract class MessageWithCsvContext { def content: String def constraints: String + + def csvFilePath: Option[String] } case class ErrorWithCsvContext( @@ -39,7 +41,8 @@ case class ErrorWithCsvContext( row: String, column: String, content: String, - constraints: String + constraints: String, + csvFilePath: Option[String] = None ) extends MessageWithCsvContext {} case class WarningWithCsvContext( @@ -48,5 +51,6 @@ case class WarningWithCsvContext( row: String, column: String, content: String, - constraints: String + constraints: String, + csvFilePath: Option[String] = None ) extends MessageWithCsvContext {} diff --git a/src/main/scala/csvwcheck/models/Column.scala b/src/main/scala/csvwcheck/models/Column.scala index f908345..41ab3e4 100644 --- a/src/main/scala/csvwcheck/models/Column.scala +++ b/src/main/scala/csvwcheck/models/Column.scala @@ -23,6 +23,7 @@ import csvwcheck.errors.{ErrorWithCsvContext, ErrorWithoutContext, MetadataError import csvwcheck.models import csvwcheck.models.Column._ import csvwcheck.models.ParseResult.ParseResult +import csvwcheck.models.Values.ColumnValue import csvwcheck.normalisation.Constants.undefinedLanguage import csvwcheck.normalisation.Utils.parseNodeAsText import csvwcheck.numberformatparser.LdmlNumberFormatParser @@ -33,6 +34,7 @@ import org.joda.time.{DateTime, DateTimeZone} import java.math.BigInteger import java.time.{LocalDateTime, Month, ZoneId, ZonedDateTime} +import java.util.regex.Pattern import scala.collection.mutable.ArrayBuffer import scala.jdk.CollectionConverters.IteratorHasAsScala import scala.math.BigInt.javaBigInteger2bigInt @@ -1135,7 +1137,7 @@ case class Column private( def validate( value: String - ): (Array[ErrorWithoutContext], List[Any]) = { + ): (Array[ErrorWithoutContext], ColumnValue) = { val errors = ArrayBuffer.empty[ErrorWithoutContext] if (nullParam.contains(value)) { // Since the cell value is among the null values specified for this CSV-W, it can be considered as the default null value which is "" @@ -1145,9 +1147,9 @@ case class Column private( } (errors.toArray, List.empty) } else { - val valuesArrayToReturn = ArrayBuffer.empty[Any] + val parsedColumnValues = ArrayBuffer.empty[Any] val values = separator match { - case Some(separator) => value.split(separator) + case Some(separator) => value.split(Pattern.quote(separator)) case None => Array[String](value) } val parserForDataType = datatypeParser(baseDataType) @@ -1160,7 +1162,7 @@ case class Column private( s"'$v' - ${errorMessageContent.content} (${format.flatMap(_.pattern).getOrElse("no format provided")})" ) ) - valuesArrayToReturn.addOne(s"invalid - $v") + parsedColumnValues.addOne(s"invalid - $v") case Right(s) => errors.addAll(validateLength(s.toString)) errors.addAll(validateValue(s)) @@ -1174,11 +1176,11 @@ case class Column private( } if (errors.isEmpty) { - valuesArrayToReturn.addOne(s) + parsedColumnValues.addOne(s) } } } - (errors.toArray, valuesArrayToReturn.toList) + (errors.toArray, parsedColumnValues.toList) } } diff --git a/src/main/scala/csvwcheck/models/ForeignKeyDefinition.scala b/src/main/scala/csvwcheck/models/ForeignKeyDefinition.scala index 78f1bff..e081737 100644 --- a/src/main/scala/csvwcheck/models/ForeignKeyDefinition.scala +++ b/src/main/scala/csvwcheck/models/ForeignKeyDefinition.scala @@ -21,7 +21,7 @@ import com.fasterxml.jackson.databind.node.ObjectNode case class ForeignKeyDefinition( jsonObject: ObjectNode, localColumns: Array[Column] - ) { + ) extends Key { override def toString: String = s"ForeignKeyDefinition([${localColumns.map(_.name.getOrElse("unnamed column")).mkString(", ")}])" diff --git a/src/main/scala/csvwcheck/models/Key.scala b/src/main/scala/csvwcheck/models/Key.scala new file mode 100644 index 0000000..1f16f4d --- /dev/null +++ b/src/main/scala/csvwcheck/models/Key.scala @@ -0,0 +1,8 @@ +package csvwcheck.models + +/** + * Represents a primary or foreign key. + */ +abstract class Key { + +} diff --git a/src/main/scala/csvwcheck/models/KeyWithContext.scala b/src/main/scala/csvwcheck/models/KeyValueWithContext.scala similarity index 72% rename from src/main/scala/csvwcheck/models/KeyWithContext.scala rename to src/main/scala/csvwcheck/models/KeyValueWithContext.scala index a5ea2e9..c14d2e4 100644 --- a/src/main/scala/csvwcheck/models/KeyWithContext.scala +++ b/src/main/scala/csvwcheck/models/KeyValueWithContext.scala @@ -16,10 +16,12 @@ package csvwcheck.models -case class KeyWithContext( - rowNumber: Long, - keyValues: List[Any], - var isDuplicate: Boolean = false +import csvwcheck.models.Values.KeyValue + +case class KeyValueWithContext( + rowNumber: Long, + keyValue: KeyValue, + var isDuplicate: Boolean = false ) { /** @@ -29,13 +31,13 @@ case class KeyWithContext( */ override def equals(obj: Any): Boolean = obj != null && - obj.isInstanceOf[KeyWithContext] && - this.keyValues.equals(obj.asInstanceOf[KeyWithContext].keyValues) + obj.isInstanceOf[KeyValueWithContext] && + this.keyValue.equals(obj.asInstanceOf[KeyValueWithContext].keyValue) - override def hashCode(): Int = this.keyValues.hashCode() + override def hashCode(): Int = this.keyValue.hashCode() def keyValuesToString(): String = { - val stringList = keyValues.map { + val stringList = keyValue.map { case listOfAny: List[Any] => listOfAny.map(s => s.toString).mkString(",") case i => i.toString diff --git a/src/main/scala/csvwcheck/models/ReferencedTableForeignKeyReference.scala b/src/main/scala/csvwcheck/models/ReferencedTableForeignKeyReference.scala index 7e2484a..0833f65 100644 --- a/src/main/scala/csvwcheck/models/ReferencedTableForeignKeyReference.scala +++ b/src/main/scala/csvwcheck/models/ReferencedTableForeignKeyReference.scala @@ -29,7 +29,7 @@ case class ReferencedTableForeignKeyReference( * The table the foreign key was defined on. */ definitionTable: Table - ) { + ) extends Key { override def toString: String = s"ReferencedTableForeignKeyReference($definitionTable.[${ foreignKeyDefinition.localColumns diff --git a/src/main/scala/csvwcheck/models/Table.scala b/src/main/scala/csvwcheck/models/Table.scala index e449cca..1a7e84b 100644 --- a/src/main/scala/csvwcheck/models/Table.scala +++ b/src/main/scala/csvwcheck/models/Table.scala @@ -24,6 +24,7 @@ import csvwcheck.errors.{ErrorWithCsvContext, MetadataError, WarningWithCsvConte import csvwcheck.models import csvwcheck.models.ParseResult.ParseResult import csvwcheck.models.Table.csvDownloadsTemporaryDirectory +import csvwcheck.models.Values.{ColumnValue, KeyComponentValue, KeyValue} import csvwcheck.normalisation.InheritedProperties import csvwcheck.normalisation.Utils.parseNodeAsText import csvwcheck.traits.JavaIteratorExtensions.IteratorHasAsScalaArray @@ -46,9 +47,9 @@ import scala.util.control.NonFatal object Table { type MapForeignKeyDefinitionToValues = - Map[ForeignKeyDefinition, Set[KeyWithContext]] + Map[ForeignKeyDefinition, Set[KeyValueWithContext]] type MapForeignKeyReferenceToValues = - Map[ReferencedTableForeignKeyReference, Set[KeyWithContext]] + Map[ReferencedTableForeignKeyReference, Set[KeyValueWithContext]] type PrimaryKeysAndErrors = (mutable.Set[List[Any]], ArrayBuffer[ErrorWithCsvContext]) @@ -201,8 +202,8 @@ case class Table private( ( WarningsAndErrors(warnings = warnings), - Map[ForeignKeyDefinition, Set[KeyWithContext]](), - Map[ReferencedTableForeignKeyReference, Set[KeyWithContext]]() + Map[ForeignKeyDefinition, Set[KeyValueWithContext]](), + Map[ReferencedTableForeignKeyReference, Set[KeyValueWithContext]]() ) } case Left(warningsAndErrors) => @@ -210,8 +211,8 @@ case class Table private( List( ( warningsAndErrors, - Map[ForeignKeyDefinition, Set[KeyWithContext]](), - Map[ReferencedTableForeignKeyReference, Set[KeyWithContext]]() + Map[ForeignKeyDefinition, Set[KeyValueWithContext]](), + Map[ReferencedTableForeignKeyReference, Set[KeyValueWithContext]]() ) ) ) @@ -235,7 +236,7 @@ case class Table private( formatBuilder.setEscape('\\') } - formatBuilder.build() + formatBuilder.get() } private def validateHeader( @@ -532,9 +533,9 @@ case class Table private( existingPrimaryKeyValues: mutable.Set[List[Any]], validateRowOutput: ValidateRowOutput ): Option[ErrorWithCsvContext] = { - val primaryKeyValues = validateRowOutput.primaryKeyValues + val primaryKeyValues = validateRowOutput.primaryKeyValue if ( - validateRowOutput.primaryKeyValues.nonEmpty && existingPrimaryKeyValues + validateRowOutput.primaryKeyValue.nonEmpty && existingPrimaryKeyValues .contains( primaryKeyValues ) @@ -619,8 +620,8 @@ case class Table private( mapPrimaryKeyHashToRowNumbers: Map[Int, Array[Long]], validateRowOutput: ValidateRowOutput ): Map[Int, Array[Long]] = { - val primaryKeyValueHash = validateRowOutput.primaryKeyValues.hashCode() - if (validateRowOutput.primaryKeyValues.nonEmpty) { + val primaryKeyValueHash = validateRowOutput.primaryKeyValue.hashCode() + if (validateRowOutput.primaryKeyValue.nonEmpty) { val existingRowsMatchingHash = mapPrimaryKeyHashToRowNumbers.getOrElse( primaryKeyValueHash, Array[Long]() @@ -640,15 +641,15 @@ case class Table private( ): MapForeignKeyReferenceToValues = validateRowOutput.parentTableForeignKeyReferences .foldLeft(mapReferencedTableToForeignKeyValues)({ - case (acc, (keyReference, keyValues)) => + case (acc, (keyReference, keyValue)) => val existingValues = acc.getOrElse(keyReference, Set()) - if (existingValues.contains(keyValues)) { + if (existingValues.contains(keyValue)) { acc.updated( keyReference, - existingValues - keyValues + keyValues.copy(isDuplicate = true) + existingValues - keyValue + keyValue.copy(isDuplicate = true) ) } else { - acc.updated(keyReference, existingValues + keyValues) + acc.updated(keyReference, existingValues + keyValue) } }) @@ -659,29 +660,27 @@ case class Table private( validateRowOutput.childTableForeignKeys.foldLeft( foreignKeyDefinitionsWithValues ) { - case (acc, (keyDefinition, keyValues)) => + case (acc, (keyDefinition, keyValue)) => val valuesForKey = acc .getOrElse(keyDefinition, Set()) - acc.updated(keyDefinition, valuesForKey + keyValues) + acc.updated(keyDefinition, valuesForKey + keyValue) } private def validateRow(row: CSVRecord): ValidateRowOutput = { var errors = Array[ErrorWithCsvContext]() - val primaryKeyValues = ArrayBuffer.empty[Any] + val primaryKeyValue = ArrayBuffer.empty[KeyComponentValue] val foreignKeyReferenceValues = ArrayBuffer.empty[ - (ReferencedTableForeignKeyReference, List[Any]) + (ReferencedTableForeignKeyReference, ColumnValue) ] // to store the validated referenced Table Columns values in each row - val foreignKeyValues = { - ArrayBuffer.empty[ + val foreignKeyValues = ArrayBuffer.empty[ (ForeignKeyDefinition, List[Any]) ] // to store the validated foreign key values in each row - } schema.foreach(s => { for ((value, column) <- row.iterator.asScalaArray.zip(s.columns)) { //catch any exception here, possibly outOfBounds and set warning too many values - val (es, newValue) = column.validate(value) + val (es, parsedColumnValue) = column.validate(value) errors = errors ++ es.map(e => ErrorWithCsvContext( e.`type`, @@ -693,7 +692,15 @@ case class Table private( ) ) if (s.primaryKey.contains(column)) { - primaryKeyValues.addAll(newValue) + // If you choose to reference a list-column in a primary key, then we treat each list item as contributing + // to the uniqueness of your primary key; this is consistent with how you would be able to use said values + // to define an `aboutURL` for the row; i.e. you are forced to concatenate the string values of your list + // items to create an amalgamated identifier. + // See . + // Also see code reference 7da26b34-efa5-11ef-8180-57883ec4256f. + // + // The below `addAll` is consistent with that approach. + primaryKeyValue.addAll(parsedColumnValue) } for (foreignKeyReferenceObject <- foreignKeyReferences) { @@ -703,14 +710,14 @@ case class Table private( ) ) { foreignKeyReferenceValues.addOne( - (foreignKeyReferenceObject, newValue) + (foreignKeyReferenceObject, parsedColumnValue) ) } } for (foreignKeyWrapperObject <- s.foreignKeys) { if (foreignKeyWrapperObject.localColumns.contains(column)) { - foreignKeyValues.addOne((foreignKeyWrapperObject, newValue)) + foreignKeyValues.addOne((foreignKeyWrapperObject, parsedColumnValue)) } } } @@ -719,42 +726,35 @@ case class Table private( ValidateRowOutput( row.getRecordNumber, WarningsAndErrors(Array(), errors), - primaryKeyValues.toList, - getForeignKeyReferencesWithPossibleValues( - foreignKeyReferenceValues.toList, - row - ), - getForeignKeyDefinitionsWithValues(foreignKeyValues.toList, row) + primaryKeyValue.toList, + assembleValuesForEachKey(foreignKeyReferenceValues.toList, row), + assembleValuesForEachKey(foreignKeyValues.toList, row) ) } - private def getForeignKeyDefinitionsWithValues( - foreignKeyValues: List[(ForeignKeyDefinition, List[Any])], - row: CSVRecord - ): Predef.Map[ForeignKeyDefinition, KeyWithContext] = { - foreignKeyValues + /** + * Returns a map from the key to the values + * @param keysWithColumnValues - A list of all of the keys and their column values + * @param row - The row in the CSV file being parsed. + * @tparam TKey - Must inherit from the abstract `csvwcheck.models.Key` case class. + * @return + */ + private def assembleValuesForEachKey[TKey <: Key]( + keysWithColumnValues: List[(TKey, ColumnValue)], + row: CSVRecord + ): Predef.Map[TKey, KeyValueWithContext] = { + keysWithColumnValues .groupBy { - case (k, _) => k + case (keyDefinition, _) => keyDefinition } .map { - case (k, values) => - (k, KeyWithContext(row.getRecordNumber, values.map(v => v._2))) - } - } + case (keyDefinition, keyDefinitionsWithValues) => + val columnValuesForKey = keyDefinitionsWithValues.map({case (_, columnValue) => columnValue}) + // Purposefully ignore the fact that a column may have more than one value as this is undefined behaviour. + // Code reference: 7da26b34-efa5-11ef-8180-57883ec4256f + val solitaryKeyValue = columnValuesForKey.map(columnValue => columnValue.mkString.asInstanceOf[KeyComponentValue]) - private def getForeignKeyReferencesWithPossibleValues( - foreignKeyReferenceValues: List[ - (ReferencedTableForeignKeyReference, List[Any]) - ], - row: CSVRecord - ): Predef.Map[ReferencedTableForeignKeyReference, KeyWithContext] = { - foreignKeyReferenceValues - .groupBy { - case (k, _) => k - } - .map { - case (k, values) => - (k, KeyWithContext(row.getRecordNumber, values.map(v => v._2))) + (keyDefinition, KeyValueWithContext(row.getRecordNumber, solitaryKeyValue)) } } diff --git a/src/main/scala/csvwcheck/models/TableGroup.scala b/src/main/scala/csvwcheck/models/TableGroup.scala index b63259d..fdce10d 100644 --- a/src/main/scala/csvwcheck/models/TableGroup.scala +++ b/src/main/scala/csvwcheck/models/TableGroup.scala @@ -57,11 +57,10 @@ object TableGroup { } parseTables(standardisedTableGroupNode) - .flatMap({ - case tablesWithWarningsAndErrors => + .flatMap(tablesWithWarningsAndErrors => linkForeignKeysToReferencedTables(tablesWithWarningsAndErrors.component) .map(tables => tablesWithWarningsAndErrors.copy(component = tables)) - }) + ) .flatMap(tablesWithWarningsAndErrors => { for { dialect <- parseDialect(standardisedTableGroupNode) @@ -452,7 +451,7 @@ case class TableGroup private( foreignKeyDefinitionsByTable: MapTableToForeignKeyDefinitions, referencedTable: Table, foreignKeyReference: ReferencedTableForeignKeyReference, - allValidValues: Set[KeyWithContext] + allValidValues: Set[KeyValueWithContext] ): TolerableErrors = { val keysReferencesInOriginTable = getKeysReferencedInOriginTable( foreignKeyDefinitionsByTable, @@ -461,18 +460,21 @@ case class TableGroup private( ) getUnmatchedForeignKeyReferences( + foreignKeyReference, allValidValues, keysReferencesInOriginTable ) ++ getDuplicateKeysInReferencedTable( + foreignKeyReference, allValidValues, keysReferencesInOriginTable ) } private def getDuplicateKeysInReferencedTable( - allPossibleParentTableValues: Set[KeyWithContext], - keysReferencesInOriginTable: Set[KeyWithContext] + referencedTableForeignKeyReference: ReferencedTableForeignKeyReference, + allPossibleParentTableValues: Set[KeyValueWithContext], + keysReferencesInOriginTable: Set[KeyValueWithContext] ): TolerableErrors = { val duplicateKeysInParent = allPossibleParentTableValues .intersect(keysReferencesInOriginTable) @@ -487,7 +489,8 @@ case class TableGroup private( k.rowNumber.toString, "", k.keyValuesToString(), - "" + "", + csvFilePath = Some(referencedTableForeignKeyReference.referencedTable.url) ) ) .toArray @@ -497,8 +500,9 @@ case class TableGroup private( } private def getUnmatchedForeignKeyReferences( - allPossibleParentTableValues: Set[KeyWithContext], - keysReferencesInOriginTable: Set[KeyWithContext] + referencedTableForeignKeyReference: ReferencedTableForeignKeyReference, + allPossibleParentTableValues: Set[KeyValueWithContext], + keysReferencesInOriginTable: Set[KeyValueWithContext] ): TolerableErrors = { val keyValuesNotDefinedInParent = keysReferencesInOriginTable.diff(allPossibleParentTableValues) @@ -509,9 +513,10 @@ case class TableGroup private( "unmatched_foreign_key_reference", "schema", k.rowNumber.toString, - "", + referencedTableForeignKeyReference.foreignKeyDefinition.localColumns.map(_.name.getOrElse("")).mkString, k.keyValuesToString(), - "" + "", + csvFilePath = Some(referencedTableForeignKeyReference.definitionTable.url) ) ) .toArray @@ -524,7 +529,7 @@ case class TableGroup private( foreignKeyDefinitionsByTable: MapTableToForeignKeyDefinitions, referencedTable: Table, parentTableForeignKeyReference: ReferencedTableForeignKeyReference - ): Set[KeyWithContext] = { + ): Set[KeyValueWithContext] = { val foreignKeyDefinitionsOnTable = foreignKeyDefinitionsByTable .getOrElse( parentTableForeignKeyReference.definitionTable, diff --git a/src/main/scala/csvwcheck/models/TableSchema.scala b/src/main/scala/csvwcheck/models/TableSchema.scala index d555e07..4d70b64 100644 --- a/src/main/scala/csvwcheck/models/TableSchema.scala +++ b/src/main/scala/csvwcheck/models/TableSchema.scala @@ -261,11 +261,18 @@ object TableSchema { val columnReference = columnReferenceElementTextNode.asText columns .find(col => col.name.contains(columnReference)) - .map(Right(_)) + .map(col => { + if (col.separator.isEmpty) { + Right(col) + } else { + // Code reference: 7da26b34-efa5-11ef-8180-57883ec4256f + Left(MetadataError(s"foreign key references list column '$columnReference'; behaviour is undefined")) + } + }) .getOrElse( Left( MetadataError( - s"foreignKey references non-existent column - $columnReference" + s"foreignKey references non-existent column '$columnReference'" ) ) ) diff --git a/src/main/scala/csvwcheck/models/ValidateRowOutput.scala b/src/main/scala/csvwcheck/models/ValidateRowOutput.scala index 50d6410..c88b773 100644 --- a/src/main/scala/csvwcheck/models/ValidateRowOutput.scala +++ b/src/main/scala/csvwcheck/models/ValidateRowOutput.scala @@ -16,13 +16,15 @@ package csvwcheck.models +import csvwcheck.models.Values.KeyValue + case class ValidateRowOutput( recordNumber: Long, warningsAndErrors: WarningsAndErrors = WarningsAndErrors(), - primaryKeyValues: List[Any] = List(), + primaryKeyValue: KeyValue = List(), parentTableForeignKeyReferences: Map[ ReferencedTableForeignKeyReference, - KeyWithContext + KeyValueWithContext ] = Map(), - childTableForeignKeys: Map[ForeignKeyDefinition, KeyWithContext] = Map() + childTableForeignKeys: Map[ForeignKeyDefinition, KeyValueWithContext] = Map() ) diff --git a/src/main/scala/csvwcheck/models/Values.scala b/src/main/scala/csvwcheck/models/Values.scala new file mode 100644 index 0000000..fa7aaeb --- /dev/null +++ b/src/main/scala/csvwcheck/models/Values.scala @@ -0,0 +1,21 @@ +package csvwcheck.models + +object Values { + /** + * Part of a KeyValue. Whereas a KeyValue may take values from multiple columns, a KeyComponent value is from + * a single column. + */ + type KeyComponentValue = Any + + /** + * The value associated with a Key in a given row. + */ + type KeyValue = List[KeyComponentValue] + + /** + * Represents the value from one particular column. + * + * May contain multiple values if the column represents a list (i.e. has a separator set). + */ + type ColumnValue = List[Any] +} diff --git a/src/main/scala/csvwcheck/normalisation/Utils.scala b/src/main/scala/csvwcheck/normalisation/Utils.scala index 42b5d92..d725e67 100644 --- a/src/main/scala/csvwcheck/normalisation/Utils.scala +++ b/src/main/scala/csvwcheck/normalisation/Utils.scala @@ -575,11 +575,13 @@ object Utils { .map((_, noWarnings, csvwPropertyType)) } - def toAbsoluteUrl(possiblyRelativeUrl: String, baseUrl: String): String = + private def toAbsoluteUrl(possiblyRelativeUrl: String, baseUrl: String): String = if (baseUrl.isEmpty) { possiblyRelativeUrl } else { - new URL(new URL(baseUrl), possiblyRelativeUrl).toString + new URI(baseUrl) + .resolve(possiblyRelativeUrl) + .toString } def normaliseNaturalLanguageProperty( diff --git a/src/test/resources/csvwExamples/foreign-key-with-separator.csv b/src/test/resources/csvwExamples/foreign-key-with-separator.csv new file mode 100644 index 0000000..05a48ba --- /dev/null +++ b/src/test/resources/csvwExamples/foreign-key-with-separator.csv @@ -0,0 +1,3 @@ +countryRefs,info +EU|UK,The European Union and the UK +UK|EU|UK,The European Union and the UK and then the EU again \ No newline at end of file diff --git a/src/test/resources/csvwExamples/foreign-key-with-separator.csv-metadata.json b/src/test/resources/csvwExamples/foreign-key-with-separator.csv-metadata.json new file mode 100644 index 0000000..0b6d841 --- /dev/null +++ b/src/test/resources/csvwExamples/foreign-key-with-separator.csv-metadata.json @@ -0,0 +1,62 @@ +{ + "@context": "http://www.w3.org/ns/csvw", + "tables": [ + { + "url": "countries.csv", + "tableSchema": { + "columns": [ + { + "name": "countryCode", + "titles": "countryCode", + "datatype": "string", + "propertyUrl": "http://www.geonames.org/ontology{#_name}" + }, + { + "name": "latitude", + "titles": "latitude", + "datatype": "number" + }, + { + "name": "longitude", + "titles": "longitude", + "datatype": "number" + }, + { + "name": "name", + "titles": "name", + "datatype": "string" + } + ], + "aboutUrl": "http://example.org/countries.csv{#countryCode}", + "propertyUrl": "http://schema.org/{_name}", + "primaryKey": "countryCode" + } + }, + { + "url": "foreign-key-with-separator.csv", + "tableSchema": { + "columns": [ + { + "name": "countryRefs", + "titles": "countryRefs", + "separator": "|", + "ordered": false + }, + { + "name": "info", + "titles": "info" + } + ], + "foreignKeys": [ + { + "columnReference": "countryRefs", + "reference": { + "resource": "countries.csv", + "columnReference": "countryCode" + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/src/test/scala/csvwcheck/TestUtils.scala b/src/test/scala/csvwcheck/TestUtils.scala new file mode 100644 index 0000000..eb59d57 --- /dev/null +++ b/src/test/scala/csvwcheck/TestUtils.scala @@ -0,0 +1,21 @@ +package csvwcheck + +import akka.actor.ActorSystem +import akka.stream.scaladsl.Sink +import csvwcheck.models.WarningsAndErrors + +import scala.concurrent.Await +import scala.concurrent.duration.DurationInt + +object TestUtils { + implicit final val system: ActorSystem = ActorSystem("actor-system") + + def RunValidationInAkka(validator: Validator): WarningsAndErrors = { + var warningsAndErrors = WarningsAndErrors() + val akkaStream = + validator.validate().map(wAndE => warningsAndErrors = wAndE) + Await.ready(akkaStream.runWith(Sink.ignore), 10.hours) + + warningsAndErrors + } +} diff --git a/src/test/scala/csvwcheck/ValidatorTests.scala b/src/test/scala/csvwcheck/ValidatorTests.scala index 67210c1..9a3eda9 100644 --- a/src/test/scala/csvwcheck/ValidatorTests.scala +++ b/src/test/scala/csvwcheck/ValidatorTests.scala @@ -1,23 +1,19 @@ package csvwcheck -import akka.actor.ActorSystem -import akka.stream.scaladsl.Sink -import csvwcheck.models.{KeyWithContext, WarningsAndErrors} +import csvwcheck.models.KeyValueWithContext +import csvwcheck.TestUtils.RunValidationInAkka import org.scalatest.funsuite.AnyFunSuite import java.io.File import java.time.{ZoneId, ZonedDateTime} import scala.collection.mutable -import scala.concurrent.Await -import scala.concurrent.duration.DurationInt class ValidatorTests extends AnyFunSuite { val csvwExamplesBaseDir = "src/test/resources/csvwExamples/" - implicit val system: ActorSystem = ActorSystem("actor-system") test("Non-mocked over-HTTP CSV-W Validation") { val validator = new Validator(Some("https://w3c.github.io/csvw/tests/test011/tree-ops.csv-metadata.json")) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) assert(warningsAndErrors.errors.isEmpty) assert(warningsAndErrors.warnings.isEmpty) } @@ -31,21 +27,12 @@ class ValidatorTests extends AnyFunSuite { // assert(warningsAndErrors.warnings.isEmpty) // } - def runValidationInAkka(validator: Validator): WarningsAndErrors = { - var warningsAndErrors = WarningsAndErrors() - val akkaStream = - validator.validate().map(wAndE => warningsAndErrors = wAndE) - Await.ready(akkaStream.runWith(Sink.ignore), 10.seconds) - - warningsAndErrors - } - test("set warning when title is empty for a column") { val uri = s"file://${new File(s"${csvwExamplesBaseDir}observations_missing_headers.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) assert(warningsAndErrors.warnings.length === 1) val warning = warningsAndErrors.warnings(0) @@ -60,7 +47,7 @@ class ValidatorTests extends AnyFunSuite { val uri = s"file://${new File(s"${csvwExamplesBaseDir}observations_missing_headers.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) assert(warningsAndErrors.errors.length === 1) val error = warningsAndErrors.errors(0) assert(error.`type` === "Invalid Header") @@ -74,7 +61,7 @@ class ValidatorTests extends AnyFunSuite { val uri = s"file://${new File(s"${csvwExamplesBaseDir}observations_duplicate_headers.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) assert(warningsAndErrors.warnings.length === 1) val warning = warningsAndErrors.warnings(0) assert(warning.`type` === "Duplicate column name") @@ -88,7 +75,7 @@ class ValidatorTests extends AnyFunSuite { val uri = s"file://${new File(s"${csvwExamplesBaseDir}observations_duplicate_headers.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) assert(warningsAndErrors.errors.length === 1) val error = warningsAndErrors.errors(0) assert(error.`type` === "Invalid Header") @@ -100,7 +87,7 @@ class ValidatorTests extends AnyFunSuite { val uri = s"file://${new File(s"${csvwExamplesBaseDir}observations_duplicate_primary_key.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) assert(warningsAndErrors.errors.length === 1) val error = warningsAndErrors.errors(0) assert(error.`type` === "duplicate_key") @@ -116,7 +103,7 @@ class ValidatorTests extends AnyFunSuite { val uri = s"file://${new File(s"${csvwExamplesBaseDir}observations_primary_key_datetime.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) assert(warningsAndErrors.errors.length === 0) } @@ -126,7 +113,7 @@ class ValidatorTests extends AnyFunSuite { val uri = s"file://${new File(s"${csvwExamplesBaseDir}observations_primary_key_datetime_violation.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) assert(warningsAndErrors.errors.length === 1) val error = warningsAndErrors.errors(0) assert( @@ -140,7 +127,7 @@ class ValidatorTests extends AnyFunSuite { val uri = s"file://${new File(s"${csvwExamplesBaseDir}obs_decimal_primary_key_vio.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) assert(warningsAndErrors.errors.length === 1) val error = warningsAndErrors.errors(0) assert(error.`type` === "duplicate_key") @@ -156,7 +143,7 @@ class ValidatorTests extends AnyFunSuite { val uri = s"file://${new File(s"${csvwExamplesBaseDir}foreignKeyValidationTest.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) assert(warningsAndErrors.errors.length === 0) } @@ -166,7 +153,7 @@ class ValidatorTests extends AnyFunSuite { val uri = s"file://${new File(s"${csvwExamplesBaseDir}foreignKeyViolationTest.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) val errors = warningsAndErrors.errors assert(errors.length === 1) assert(errors(0).`type` === "unmatched_foreign_key_reference") @@ -179,7 +166,7 @@ class ValidatorTests extends AnyFunSuite { val uri = s"file://${new File(s"${csvwExamplesBaseDir}foreignKeyValidationTestmultiple_parent_rows_matched.csv-metadata.json").getAbsolutePath}" val validator = new Validator(Some(uri)) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) val errors = warningsAndErrors.errors assert(errors.length === 1) assert(errors(0).`type` === "multiple_matched_rows") @@ -239,9 +226,9 @@ class ValidatorTests extends AnyFunSuite { "it should not add second key-value with the exact same values and different row number" ) { - val set = mutable.Set[KeyWithContext]() - val val1 = KeyWithContext(1, List(1, 2, 3)) - val val2 = KeyWithContext(89, List(1, 2, 3)) + val set = mutable.Set[KeyValueWithContext]() + val val1 = KeyValueWithContext(1, List(1, 2, 3)) + val val2 = KeyValueWithContext(89, List(1, 2, 3)) set += val1 set += val2 // Since the hashcode method is overridden in KeyWithContext class val1 is equal to val2 assert(set.size == 1) @@ -257,7 +244,7 @@ class ValidatorTests extends AnyFunSuite { val validator = new Validator(Some(s"file://${testCaseFile.getAbsolutePath}")) - val warningsAndErrors = runValidationInAkka(validator) + val warningsAndErrors = RunValidationInAkka(validator) val errors = warningsAndErrors.errors assert(errors.isEmpty) } diff --git a/src/test/scala/csvwcheck/bugs/ForeignKeys.scala b/src/test/scala/csvwcheck/bugs/ForeignKeys.scala new file mode 100644 index 0000000..0cf7b88 --- /dev/null +++ b/src/test/scala/csvwcheck/bugs/ForeignKeys.scala @@ -0,0 +1,23 @@ +package csvwcheck.bugs + +import csvwcheck.TestPaths.resourcesPath +import csvwcheck.TestUtils.RunValidationInAkka +import csvwcheck.Validator +import org.scalatest.funsuite.AnyFunSuite + +class ForeignKeys extends AnyFunSuite { + + test("A foreign key definition referencing a list column should return an error") { + // Code reference 7da26b34-efa5-11ef-8180-57883ec4256f + val csvwInput = resourcesPath./("csvwExamples")./("foreign-key-with-separator.csv-metadata.json") + val validator = new Validator(Some(csvwInput.path.toString)) + val warningsAndErrors = RunValidationInAkka(validator) + + assert(warningsAndErrors.warnings.isEmpty) + assert(warningsAndErrors.errors.length == 1) + val error = warningsAndErrors.errors.head + + assert(error.`type` == "metadata") + assert(error.content.contains("foreign key references list column")) + } +}