From ba2b6633e4136a878d342a97536665cd3825a5a9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 1 Dec 2025 07:09:23 +0000 Subject: [PATCH 1/4] Initial plan From 3df3f6ef4335804a4ea017b3c22cc8127f7bd9c0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 1 Dec 2025 07:16:49 +0000 Subject: [PATCH 2/4] Add validation for sourceCode maximum length of 50 characters Co-authored-by: javier-gracia-tabuenca-tuni <54809193+javier-gracia-tabuenca-tuni@users.noreply.github.com> --- R/validateUsagiFile.R | 2 ++ .../VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv | 1 + tests/testthat/test-validateUsagiFile.R | 8 ++++++++ 3 files changed, 11 insertions(+) diff --git a/R/validateUsagiFile.R b/R/validateUsagiFile.R index eee9cb8..f7ee515 100644 --- a/R/validateUsagiFile.R +++ b/R/validateUsagiFile.R @@ -5,6 +5,7 @@ #' - Check if all default Usagi columns are present: #' - Check if sourceCode and conceptId are unique #' - Check if sourceCode is not empty +#' - Check if sourceCode is at most 50 characters #' - Check if sourceName is not empty #' - Check if sourceName is less than 255 characters #' If usagi file has C&CR columns: @@ -96,6 +97,7 @@ validateUsagiFile <- function( validationRules <- validate::validator( SourceCode.is.empty = is_complete(sourceCode), SourceCode.and.conceptId.are.not.unique = is_unique(sourceCode, conceptId), + SourceCode.is.more.than.50.characters = field_length(sourceCode, min = 0, max = 50), SourceName.is.empty = is_complete(sourceName), SourceName.is.more.than.255.characters = field_length(sourceName, min = 0, max = 255), SourceFrequency.is.not.empty = is_complete(sourceFrequency), diff --git a/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv b/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv index f990856..15f463b 100644 --- a/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv +++
b/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv @@ -70,3 +70,4 @@ A18.7+E35.1,Tuberculosis of adrenal glands,-1,,2000500142,Lisämunuaistuberkuloo A18.8+D77,Tuberculosis of spleen,-1,,2000500143,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.8|D77,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.88,Tuberculosis of spleen,-1,,2000500200,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18,ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.88+D77,Tuberculosis of spleen,-1,,2000500201,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.88|D77,ICD10||ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, +A01234567890123456789012345678901234567890123456789X,[SourceCode is more than 50 characters]Test entry for sourceCode length validation,-1,,2000500999,Test entry,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A01,ICD10,0,APPROVED,EQUAL,PKo,1666805697461,4100102,Test concept,Condition,MAPS_TO,,TAYS,1623974400000,,, diff --git a/tests/testthat/test-validateUsagiFile.R b/tests/testthat/test-validateUsagiFile.R index f207ea6..cc7a34d 100644 --- a/tests/testthat/test-validateUsagiFile.R +++ b/tests/testthat/test-validateUsagiFile.R @@ -88,6 +88,14 @@ test_that("test validateUsagiFile returns errors with the errored usagi file", { validatedUsagiFile |> dplyr::filter(is.na(sourceName)) |> dplyr::pull(mappingStatus) |> expect_equal("FLAGGED") + # SourceCode is more than 50 characters + validationsSummary |> dplyr::filter(step == "SourceCode is more than 50 characters") |> nrow() |> expect_equal(1) + validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceCode is more than 50 characters")) |> nrow() |> expect_equal(1) + validatedUsagiFile |> 
dplyr::filter(stringr::str_detect(sourceName, "SourceCode is more than 50 characters")) |> dplyr::pull(`ADD_INFO:validationMessages`) |> + expect_equal("ERROR: SourceCode is more than 50 characters") + validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceCode is more than 50 characters")) |> dplyr::pull(mappingStatus) |> + expect_equal("FLAGGED") + # SourceName is more than 255 characters validationsSummary |> dplyr::filter(step == "SourceName is more than 255 characters") |> nrow() |> expect_equal(1) validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceName is more than 255 characters")) |> nrow() |> expect_equal(1) From 66d955a7c3cb1a3cbb5547130ac21e5ac083c682 Mon Sep 17 00:00:00 2001 From: javier-gracia-tabuenca-tuni Date: Mon, 1 Dec 2025 09:53:20 +0200 Subject: [PATCH 3/4] updated test file --- .../testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv b/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv index 15f463b..b75b5d6 100644 --- a/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv +++ b/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv @@ -4,6 +4,7 @@ A01.0+G01,[SourceCode and conceptId are not unique]Meningitis (in) typhoid fever A01.0+G01,[SourceCode and conceptId are not unique]Meningitis (in) typhoid fever,-1,,2000500101,Lavantautiin liittyvä aivokalvotulehdus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A01|A01.0|G01,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666794379045,4100102,Meningitis due to typhoid fever,Condition,MAPS_TO,,TAYS,1623974400000,,, A01.0+J17.0,,-1,,2000500103,Lavantautiin liittyvä keuhkokuume,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A01|A01.0|J17.0,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666794388143,4166072,Pneumonia in typhoid fever,Condition,MAPS_TO,,TAYS,1623974400000,,, A01.4+M01.3,[SourceName 
is more than 255 characters]Arthritis in typhoid or paratyphoid fever ad [SourceName is more than 255 characters]Arthritis in typhoid or paratyphoid fever ad [SourceName is more than 255 characters]Arthritis in typhoid or paratyphoid fever ad ddd,-1,,2000500104,Lavantautiin tai pikkulavantautiin liittyvä nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A01|A01.4|M01.3,ICD10|ICD10|ICD10,0.78,APPROVED,EQUIVALENT,PKo,1666806100347,80316,Salmonella arthritis,Condition,MAPS_TO,,PKo,1666806094598,,, +A01234567890123456789012345678901234567890123456789X,[SourceCode is more than 50 characters]Test entry for sourceCode length validation,-1,,2000500999,Test entry,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A01,ICD10,0,APPROVED,EQUAL,PKo,1666805697461,4100102,Meningitis due to typhoid fever,Condition,MAPS_TO,,TAYS,1623974400000,,, A02.2+G01,[APPROVED mappingStatus conceptId is 0]Salmonella meningitis,-1,,2000500105,Salmonellan aiheuttama aivokalvotulehdus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A02|A02.2|G01,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666794600409,0,Salmonella meningitis,Condition,MAPS_TO,,TAYS,1623974400000,,, A17.0+G01,[APPROVED mappingStatus with concepts outdated]Tuberculous meningitis,-1,,2000500115,Tuberkuloottinen meningiitti,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.0|G01,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304049249,1234,Tuberculosis of meninges,Condition,MAPS_TO,,PKo,1666804429398,,, A17.1+G07,[APPROVED mappingStatus with concepts outdated]Meningeal tuberculoma,-1,,2000500116,Aivokalvojen tuberkulooma,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.1|G07,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304054597,1234,Tuberculoma of meninges,Condition,MAPS_TO,,TAYS,1623974400000,,, @@ -70,4 +71,3 @@ A18.7+E35.1,Tuberculosis of adrenal glands,-1,,2000500142,Lisämunuaistuberkuloo A18.8+D77,Tuberculosis of spleen,-1,,2000500143,Pernan tuberkuloosi,ICD10fi 
Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.8|D77,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.88,Tuberculosis of spleen,-1,,2000500200,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18,ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.88+D77,Tuberculosis of spleen,-1,,2000500201,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.88|D77,ICD10||ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, -A01234567890123456789012345678901234567890123456789X,[SourceCode is more than 50 characters]Test entry for sourceCode length validation,-1,,2000500999,Test entry,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A01,ICD10,0,APPROVED,EQUAL,PKo,1666805697461,4100102,Test concept,Condition,MAPS_TO,,TAYS,1623974400000,,, From 0f5ed944c1630d33c3be391ee696a4c8ab145006 Mon Sep 17 00:00:00 2001 From: javier-gracia-tabuenca-tuni Date: Mon, 1 Dec 2025 09:59:38 +0200 Subject: [PATCH 4/4] Refactor SQL generation in OMOP vocabulary functions to improve formatting and add QUOTE option for CSV export --- R/databasesFromAndToCSV.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/databasesFromAndToCSV.R b/R/databasesFromAndToCSV.R index 06b9aa5..4016d82 100644 --- a/R/databasesFromAndToCSV.R +++ b/R/databasesFromAndToCSV.R @@ -24,7 +24,7 @@ omopVocabularyCSVsToDuckDB <- function( "CONCEPT_RELATIONSHIP", "CONCEPT_SYNONYM", "DOMAIN", - "DRUG_STRENGTH", + "DRUG_STRENGTH", "RELATIONSHIP", "VOCABULARY" ) @@ -55,7 +55,7 @@ omopVocabularyCSVsToDuckDB <- function( sql = sql, targetDialect = "duckdb" ) - + # Fix DuckDB data type issues: replace NUMERIC with DOUBLE for float columns # This prevents precision errors when importing large numeric values sql <- gsub("NUMERIC NULL", "DOUBLE NULL", sql) @@ -130,14 
+130,14 @@ duckdbToOMOPVocabularyCSVs <- function( for (table_name in OMOPVocabularyTableNames) { message("Exporting table: ", table_name) out_path <- file.path(pathToOMOPVocabularyCSVsFolder, paste0(table_name, ".csv")) - + col_info <- DBI::dbGetQuery( connection, paste0("PRAGMA table_info(", table_name, ");") ) cols <- col_info$name date_cols <- col_info$name[grepl("^date$", tolower(col_info$type))] - + select_cols <- sapply(cols, function(col) { if (col %in% date_cols) { paste0("STRFTIME('%Y%m%d', ", col, ") AS ", col) @@ -147,7 +147,7 @@ duckdbToOMOPVocabularyCSVs <- function( }) select_sql <- paste(select_cols, collapse = ", ") - sql <- paste0("COPY (SELECT ", select_sql, " FROM ", table_name, ") TO '", out_path, "' (HEADER, DELIM '\t');") + sql <- paste0("COPY (SELECT ", select_sql, " FROM ", table_name, ") TO '", out_path, "' (HEADER, DELIM '\t', QUOTE '');") DatabaseConnector::dbExecute(connection, sql) }