From a493b3b6bbd01c611277c95abb3f4a3689e8dcaa Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Wed, 22 Nov 2023 13:32:59 +0100 Subject: [PATCH 1/4] should change arrow to show R-arrow --- _benchplot/benchplot-dict.R | 14 +++++++------- _control/solutions.csv | 4 ++-- _report/report.R | 12 +++++++++--- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/_benchplot/benchplot-dict.R b/_benchplot/benchplot-dict.R index f1351858..cb625f90 100644 --- a/_benchplot/benchplot-dict.R +++ b/_benchplot/benchplot-dict.R @@ -42,7 +42,7 @@ solution.dict = {list( "juliads" = list(name=c(short="IMD.jl", long="InMemoryDatasets.jl"), color=c(strong="#b80000", light="#ff1f1f")), "clickhouse" = list(name=c(short="clickhouse", long="ClickHouse"), color=c(strong="hotpink4", light="hotpink1")), "polars" = list(name=c(short="polars", long="Polars"), color=c(strong="deepskyblue4", light="deepskyblue3")), - "arrow" = list(name=c(short="arrow", long="Arrow"), color=c(strong="aquamarine3", light="aquamarine1")), + "R-arrow" = list(name=c(short="R-arrow", long="R-arrow"), color=c(strong="aquamarine3", light="aquamarine1")), "duckdb" = list(name=c(short="duckdb", long="DuckDB"), color=c(strong="#ddcd07", light="#fff100")), "duckdb-latest" = list(name=c(short="duckdb-latest", long="duckdb-latest"), color=c(strong="#ddcd07", light="#fff100")), "datafusion" = list(name=c(short="datafusion", long="Datafusion"), color=c(strong="deepskyblue4", light="deepskyblue3")) @@ -199,7 +199,7 @@ groupby.syntax.dict = {list( "regression v1 v2 by id2 id4" = "DF.groupby(['id2','id4']).agg((pl.pearson_corr('v1','v2')**2).alias('r2')).collect()", "sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6']).agg([pl.sum('v3').alias('v3'), pl.count('v1').alias('count')]).collect()" )}, - "arrow" = {c( + "R-arrow" = {c( "sum v1 by id1" = "AT %>% group_by(id1) %>% summarise(v1=sum(v1, na.rm=TRUE))", "sum v1 by id1:id2" = "AT %>% group_by(id1, id2) %>% summarise(v1=sum(v1, na.rm=TRUE))", "sum v1 mean v3 by id3" = "AT %>% group_by(id3) %>% summarise(v1=sum(v1, na.rm=TRUE), v3=mean(v3, na.rm=TRUE))", @@ -260,7 +260,7 @@ groupby.syntax.dict = {list( "juliads" = list(), "clickhouse" = list(), "polars" = list(), - "arrow" = list("Expression row_number() <= 2L not supported in Arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in arrow; pulling data into R" = "regression v1 v2 by id2 id4"), + "R-arrow" = list("Expression row_number() <= 2L not supported in R-arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in R-arrow; pulling data into R" = "regression v1 v2 by id2 id4"), "duckdb" = list(), "duckdb-latest" = list(), "datafusion" = list() @@ -309,7 +309,7 @@ groupby.data.exceptions = {list( "polars" = {list( # "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0") # q10 )}, - "arrow" = {list( + "R-arrow" = {list( # "timeout" = c(), # q10 "internal error" = c("G1_1e8_2e0_0_0", "G1_1e8_1e2_0_1", "G1_1e8_1e2_5_0", "G1_1e9_1e2_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0","G1_1e9_1e1_0_0", # inherits from dplyr "G1_1e9_2e0_0_0"), # #190 @@ -413,7 +413,7 @@ join.syntax.dict = {list( "medium inner on factor" = "DF.merge(medium, on='id5')", "big inner on int" = "DF.merge(big, on='id3')" )}, - "arrow" = {c( + "R-arrow" = {c( "small inner on int" = "inner_join(DF, small, by='id1')", "medium inner on int" = "inner_join(DF, medium, by='id2')", "medium outer on int" = "left_join(DF, medium, by='id2')", @@ -454,7 +454,7 @@ join.query.exceptions = {list( "juliads" = list(), "clickhouse" = list(), "polars" = list(), - "arrow" = list(), + "R-arrow" = list(), "duckdb" = list(), "duckdb-latest" = list(), "datafusion" = list() @@ -496,7 +496,7 @@ join.data.exceptions = {list( "polars" = {list( "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") )}, - "arrow" = {list( + "R-arrow" = {list( "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1" )#, # "not yet implemented: #189" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1","J1_1e8_NA_0_0","J1_1e8_NA_5_0","J1_1e8_NA_0_1","J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") )}, diff --git a/_control/solutions.csv b/_control/solutions.csv index c96f07cf..89009a06 100644 --- a/_control/solutions.csv +++ b/_control/solutions.csv @@ -25,8 +25,8 @@ clickhouse,groupby clickhouse,join polars,groupby polars,join -arrow,groupby -arrow,join +R-arrow,groupby +R-arrow,join duckdb,groupby duckdb,join duckdb-latest,groupby diff --git a/_report/report.R b/_report/report.R index 35082113..c56ed4e4 100644 --- a/_report/report.R +++ b/_report/report.R @@ -6,7 +6,7 @@ get_report_status_file = function(path=getwd()) { file.path(path, "report-done") } get_report_solutions = function() { - c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars","arrow","duckdb", "duckdb-latest", "datafusion") + c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "duckdb-latest", "datafusion", "arrow", "R-arrow") } get_data_levels = function() { ## groupby @@ -243,9 +243,15 @@ transform = function(ld) { # all ---- time_logs = function(path=getwd()) { - ct = clean_time(load_time(path=getwd())) + lt <- load_time(path=getwd()) + # replace arrow with R-arrow (see https://github.com/duckdblabs/db-benchmark/pull/66) + lt$solution[lt$solution == "arrow"] <- "R-arrow" + + ct = clean_time(lt) d = model_time(ct) - l = model_logs(clean_logs(load_logs(path=path))) + ll <- load_logs(path=path) + ll$solution[ll$solution == "arrow"] <- "R-arrow" + l = model_logs(clean_logs(ll)) q = model_questions(clean_questions(load_questions(path=path))) lq = merge_logs_questions(l, q) From cf48b23bc42f04c3f8f50b1fd62a2a7f1989cffb Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 23 Nov 2023 10:33:01 +0100 Subject: [PATCH 2/4] new arrow benchmarks report solution as R-arrow --- arrow/groupby-arrow.R => R-arrow/groupby-R-arrow.R | 2 +- arrow/join-arrow.R => R-arrow/join-R-arrow.R | 2 +- R-arrow/setup-R-arrow.sh | 6 ++++++ arrow/upg-arrow.sh => R-arrow/upg-R-arrow.sh | 2 +- R-arrow/ver-R-arrow.sh | 4 ++++ _report/report.R | 5 +++-- arrow/setup-arrow.sh | 6 ------ arrow/ver-arrow.sh | 4 ---- 8 files changed, 16 insertions(+), 15 deletions(-) rename arrow/groupby-arrow.R => R-arrow/groupby-R-arrow.R (99%) rename arrow/join-arrow.R => R-arrow/join-R-arrow.R (99%) create mode 100755 R-arrow/setup-R-arrow.sh rename arrow/upg-arrow.sh => R-arrow/upg-R-arrow.sh (55%) create mode 100755 R-arrow/ver-R-arrow.sh delete mode 100755 arrow/setup-arrow.sh delete mode 100755 arrow/ver-arrow.sh diff --git a/arrow/groupby-arrow.R b/R-arrow/groupby-R-arrow.R similarity index 99% rename from arrow/groupby-arrow.R rename to R-arrow/groupby-R-arrow.R index 950bcff0..77b2c5cc 100755 --- a/arrow/groupby-arrow.R +++ b/R-arrow/groupby-R-arrow.R @@ -13,7 +13,7 @@ suppressPackageStartupMessages({ ver = packageVersion("arrow") git = "" task = "groupby" -solution = "arrow" +solution = "R-arrow" fun = "group_by" cache = TRUE on_disk = FALSE diff --git a/arrow/join-arrow.R b/R-arrow/join-R-arrow.R similarity index 99% rename from arrow/join-arrow.R rename to R-arrow/join-R-arrow.R index 69df274d..7ac4f165 100755 --- a/arrow/join-arrow.R +++ b/R-arrow/join-R-arrow.R @@ -12,7 +12,7 @@ suppressPackageStartupMessages({ ver = packageVersion("arrow") git = "" task = "join" -solution = "arrow" +solution = "R-arrow" cache = TRUE on_disk = FALSE diff --git a/R-arrow/setup-R-arrow.sh b/R-arrow/setup-R-arrow.sh new file mode 100755 index 00000000..db589344 --- /dev/null +++ b/R-arrow/setup-R-arrow.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# install stable arrow +mkdir -p ./arrow/r-arrow +Rscript -e 'install.packages(c("arrow","dplyr"), lib="./R-arrow/r-arrow")' diff --git a/arrow/upg-arrow.sh b/R-arrow/upg-R-arrow.sh similarity index 55% rename from arrow/upg-arrow.sh rename to R-arrow/upg-R-arrow.sh index d2fb9de5..d1743c85 100755 --- a/arrow/upg-arrow.sh +++ b/R-arrow/upg-R-arrow.sh @@ -3,4 +3,4 @@ set -e # upgrade all packages in arrow library only if new arrow is out echo 'upgrading arrow...' -Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./arrow/r-arrow")) update.packages(lib.loc="./arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' +Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' diff --git a/R-arrow/ver-R-arrow.sh b/R-arrow/ver-R-arrow.sh new file mode 100755 index 00000000..b920df45 --- /dev/null +++ b/R-arrow/ver-R-arrow.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' diff --git a/_report/report.R b/_report/report.R index c56ed4e4..4b0db151 100644 --- a/_report/report.R +++ b/_report/report.R @@ -69,6 +69,9 @@ clean_time = function(d) { if (nrow(d[!nzchar(version) | is.na(version)])) stop("timings data contains NA or '' as version field, that should not happen") old_advanced_groupby_questions = c("median v3 sd v3 by id2 id4","max v1 - min v2 by id2 id4","largest two v3 by id2 id4","regression v1 v2 by id2 id4","sum v3 count by id1:id6") + + # replace arrow with R-arrow (see https://github.com/duckdblabs/db-benchmark/pull/66) + d[solution == "arrow", solution := "R-arrow"] d[!nzchar(git), git := NA_character_ ][,"on_disk" := as.logical(on_disk) ][task=="groupby" & solution%in%c("pandas","dask","spark") & batch<1558106628, "out_cols" := NA_integer_ @@ -244,8 +247,6 @@ transform = function(ld) { time_logs = function(path=getwd()) { lt <- load_time(path=getwd()) - # replace arrow with R-arrow (see https://github.com/duckdblabs/db-benchmark/pull/66) - lt$solution[lt$solution == "arrow"] <- "R-arrow" ct = clean_time(lt) d = model_time(ct) diff --git a/arrow/setup-arrow.sh b/arrow/setup-arrow.sh deleted file mode 100755 index dcad2ad3..00000000 --- a/arrow/setup-arrow.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e - -# install stable arrow -mkdir -p ./arrow/r-arrow -Rscript -e 'install.packages(c("arrow","dplyr"), lib="./arrow/r-arrow")' diff --git a/arrow/ver-arrow.sh b/arrow/ver-arrow.sh deleted file mode 100755 index 44bb8ede..00000000 --- a/arrow/ver-arrow.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -set -e - -Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' From 5227ea5d5a5e78c31bffb0be4bcbca58deedeaf6 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 24 Nov 2023 08:56:41 +0100 Subject: [PATCH 3/4] update arrow to R-arrow in a few more places --- .github/workflows/regression.yml | 2 +- run.conf | 2 +- run.sh | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 53a7684e..12a955cf 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion] + solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, duckdb-latest, datafusion] name: Regression Tests solo solutions runs-on: ubuntu-20.04 env: diff --git a/run.conf b/run.conf index 14e0f435..c019b15f 100644 --- a/run.conf +++ b/run.conf @@ -1,7 +1,7 @@ # task, used in init-setup-iteration.R export RUN_TASKS="groupby join" # solution, used in init-setup-iteration.R -export RUN_SOLUTIONS="collapse data.table juliads juliadf dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb duckdb-latest datafusion" +export RUN_SOLUTIONS="collapse data.table juliads juliadf dplyr pandas pydatatable spark dask clickhouse polars R-arrow duckdb duckdb-latest datafusion" # flag to upgrade tools, used in run.sh on init export DO_UPGRADE=false diff --git a/run.sh b/run.sh index 8afc679c..04399708 100755 --- a/run.sh +++ b/run.sh @@ -71,8 +71,8 @@ if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/upg-h2o. if [[ "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/ver-h2o.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/upg-polars.sh; fi; if [[ "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/ver-polars.sh; fi; -if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "arrow" ]]; then ./arrow/upg-arrow.sh; fi; -if [[ "$RUN_SOLUTIONS" =~ "arrow" ]]; then ./arrow/ver-arrow.sh; fi; +if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/R-upg-arrow.sh; fi; +if [[ "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/R-ver-arrow.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb" ]]; then ./duckdb/upg-duckdb.sh; fi; if [[ "$RUN_SOLUTIONS" == "duckdb" ]]; then ./duckdb/ver-duckdb.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb-latest" ]]; then ./duckdb-latest/setup-duckdb-latest.sh; fi; From 3d136f869a197852d56daedde4209d6bd2a9e434 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 1 Dec 2023 06:49:37 -0900 Subject: [PATCH 4/4] Fix remaining issues in https://github.com/Tmonster/db-benchmark/pull/10 (#13) * Fix remaining issues in arrow -> R-arrow rename * Fix bug in rename code in report.R The previous code was causing something wild to happen. The changed code is idiomatic code for replacing values in a data.frame based on a condition. --- R-arrow/groupby-R-arrow.R | 4 ++-- R-arrow/join-R-arrow.R | 4 ++-- R-arrow/setup-R-arrow.sh | 2 +- R-arrow/upg-R-arrow.sh | 2 +- R-arrow/ver-R-arrow.sh | 2 +- _launcher/launcher.R | 2 +- _launcher/solution.R | 2 +- _report/report.R | 2 +- run.sh | 2 +- 9 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R-arrow/groupby-R-arrow.R b/R-arrow/groupby-R-arrow.R index 77b2c5cc..100d3dec 100755 --- a/R-arrow/groupby-R-arrow.R +++ b/R-arrow/groupby-R-arrow.R @@ -7,8 +7,8 @@ source("./_helpers/helpers.R") stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns .libPaths("./arrow/r-arrow") # tidyverse/dplyr#4641 ## leave it like here in case if this affects arrow pkg as well suppressPackageStartupMessages({ - library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE) - library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE) + library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE) + library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE) }) ver = packageVersion("arrow") git = "" diff --git a/R-arrow/join-R-arrow.R b/R-arrow/join-R-arrow.R index 7ac4f165..559d05c9 100755 --- a/R-arrow/join-R-arrow.R +++ b/R-arrow/join-R-arrow.R @@ -6,8 +6,8 @@ source("./_helpers/helpers.R") .libPaths("./arrow/r-arrow") # tidyverse/dplyr#4641 ## leave it like here in case if this affects arrow pkg as well suppressPackageStartupMessages({ - library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE) - library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE) + library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE) + library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE) }) ver = packageVersion("arrow") git = "" diff --git a/R-arrow/setup-R-arrow.sh b/R-arrow/setup-R-arrow.sh index db589344..e5ff947a 100755 --- a/R-arrow/setup-R-arrow.sh +++ b/R-arrow/setup-R-arrow.sh @@ -2,5 +2,5 @@ set -e # install stable arrow -mkdir -p ./arrow/r-arrow +mkdir -p ./R-arrow/r-arrow Rscript -e 'install.packages(c("arrow","dplyr"), lib="./R-arrow/r-arrow")' diff --git a/R-arrow/upg-R-arrow.sh b/R-arrow/upg-R-arrow.sh index d1743c85..4d677d3e 100755 --- a/R-arrow/upg-R-arrow.sh +++ b/R-arrow/upg-R-arrow.sh @@ -3,4 +3,4 @@ set -e # upgrade all packages in arrow library only if new arrow is out echo 'upgrading arrow...' -Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' +Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./R-arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' diff --git a/R-arrow/ver-R-arrow.sh b/R-arrow/ver-R-arrow.sh index b920df45..8c24e043 100755 --- a/R-arrow/ver-R-arrow.sh +++ b/R-arrow/ver-R-arrow.sh @@ -1,4 +1,4 @@ #!/bin/bash set -e -Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' +Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("R-arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' diff --git a/_launcher/launcher.R b/_launcher/launcher.R index 0a7bc36c..57e7f962 100644 --- a/_launcher/launcher.R +++ b/_launcher/launcher.R @@ -14,7 +14,7 @@ readret = function(x) { file.ext = function(x) { ans = switch( x, - "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", + "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R", "pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl", diff --git a/_launcher/solution.R b/_launcher/solution.R index f66b4311..35d3a6a2 100755 --- a/_launcher/solution.R +++ b/_launcher/solution.R @@ -110,7 +110,7 @@ if ("quiet" %in% names(args)) { file.ext = function(x) { ans = switch( x, - "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", + "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R", "pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl" diff --git a/_report/report.R b/_report/report.R index 4b0db151..29405a0d 100644 --- a/_report/report.R +++ b/_report/report.R @@ -71,7 +71,7 @@ clean_time = function(d) { old_advanced_groupby_questions = c("median v3 sd v3 by id2 id4","max v1 - min v2 by id2 id4","largest two v3 by id2 id4","regression v1 v2 by id2 id4","sum v3 count by id1:id6") # replace arrow with R-arrow (see https://github.com/duckdblabs/db-benchmark/pull/66) - d[solution == "arrow", solution := "R-arrow"] + d[which(solution == "arrow"),c("solution")] == "R-arrow" d[!nzchar(git), git := NA_character_ ][,"on_disk" := as.logical(on_disk) ][task=="groupby" & solution%in%c("pandas","dask","spark") & batch<1558106628, "out_cols" := NA_integer_ diff --git a/run.sh b/run.sh index 04399708..e834a09e 100755 --- a/run.sh +++ b/run.sh @@ -72,7 +72,7 @@ if [[ "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/ver-h2o.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/upg-polars.sh; fi; if [[ "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/ver-polars.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/R-upg-arrow.sh; fi; -if [[ "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/R-ver-arrow.sh; fi; +if [[ "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/ver-R-arrow.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb" ]]; then ./duckdb/upg-duckdb.sh; fi; if [[ "$RUN_SOLUTIONS" == "duckdb" ]]; then ./duckdb/ver-duckdb.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb-latest" ]]; then ./duckdb-latest/setup-duckdb-latest.sh; fi;