-
Notifications
You must be signed in to change notification settings - Fork 1
Open
Description
Keeping a record here incase anything goes wrong.
factor.R
#' Reorder levels of an ordinal factor
#'
#' This function reorders the levels of a factor based on the sentiment scores of the levels.
#' Using this function can be expensive (depending on the LLM used and user's computer spec)
#' so users may wish to use this interactively only and copy the output into their script.
#'
#'
#' @param .f A character vector that is assumed to be an ordinal factor.
#' @param copy A logical value to indicate whether the output should be copied into the
#' user's clipboard.
#' @examples
#' # to get the new level order
#' # users should check if the new order
#' emend_lvl_order(likerts$likert1)
#' # `copy = TRUE` copies the output into clipboard in a format that can be
#' # entered easiliy in the user's script
#' emend_lvl_order(likerts$likert1, copy = TRUE)
#' # to apply the new levels directly to the input
#' emend_fct_reorder(likerts$likert1)
#'
#' @export
emend_lvl_order <- function(.f, copy = FALSE, ...) {
lvls <- unique(.f)
res <- reorder_3(lvls, ...)
if(copy) clipr::write_clip(paste0(deparse(res), collapse = ""))
res
}
#' @rdname emend_lvl_order
#' @export
emend_fct_reorder <- function(.f) {
lvls <- emend_lvl_order(.f)
factor(.f, levels = lvls)
}
# This seems to be better
reorder_2 <- function(lvls, ...) {
out <- emend_assist("Give the sentiment score, where negative score means negative sentiment,
for each level of the input: {lvls*}",
format = "json")
res <- names(sort(unlist(out)))
if(length(res) != length(lvls) | !all(res %in% lvls)) {
cli::cli_warn("Could not reorder the levels meaningfully.")
return(lvls)
}
res
}
# works for all
reorder_3 <- function(lvls, ...) {
dots <- list(...)
out <- emend_assist(c(list("Rank the sentiment scores for each level of the input: {lvls*}.
Positive connotations like satisfied or likely should have positive scores.
Negative connotations like unsatisfied or unlikely should have negative scores.
Satisfied should have a higher score than somewhat satisfied.
Agree should have a higher score than somewhat agree.
Neutral elements should have a score of 0.
Just give the scores.
If 'not applicable' or 'not answered', assign NA."), dots),
format = "json")
vec <- unlist(out)
# break ties
if(any(duplicated(vec))) {
dups <- vec[duplicated(vec)]
out2 <- emend_assist(c(list("Rank the sentiment scores for each level of the input: {lvls[vec %in% dups]*}.
Positive connotations like satisfied or likely should have positive scores.
Negative connotations like unsatisfied or unlikely should have negative scores.
Satisfied should have a higher score than somewhat satisfied.
Agree should have a higher score than somewhat agree.
Neutral elements should have a score of 0.
Just give the scores for each element.
If 'not applicable' or 'not answered', assign NA."), dots),
format = "json")
vec2 <- unlist(out2)
vec2 <- vec2 / sum(vec2)
vec[vec %in% dups] <- vec[vec %in% dups] + vec2
}
res <- names(sort(vec))
if(length(vec) != length(lvls)) {
cli::cli_warn("Could not reorder the levels meaningfully.")
return(lvls)
}
if(!all(res %in% lvls)) {
res <- names(sort(setNames(vec, lvls)))
}
res
}
# Another strategy above is to compare two at a time
# Or if there are many categories, split them and compare like any sorting algorithm
#' Sweep factor levels to group similar levels together
#'
#' This function attempts to automatically standardise input labels that should
#' have been the same by making a few assumptions. The assumptions include that
#' the levels with high frequency are correct and low frequency levels may contain
#' typos or alternative representation of other existing levels.
#'
#' Be warned that this function is experimental and may not work as intended.
#'
#' @param .f A factor
#' @param known A character vector of the levels that are known to be correct. If none
#' are provided, it is assumed that no correct values are known. If an element has a name
#' associated with it, it is assumed that the name is what is recorded and the value is
#' what the actual label should be.
#' @param wrong A character vector of the levels known to be wrong and should be
#' grouped with another level.
#' @param nlevels_max The maximum number of levels.
#' @param nlevels_min The minimum number of levels.
#' @param nlevels_top The number of levels that are correct based on the top frequencies, excluding
#' levels that have observations less than `n_min`.
#' @param nlevels_bottom The number of levels that are incorrect based on the bottom frequencies,
#' excluding those that have observation less than `n_min`.
#' @param n_min The minimum of observations for each level. The default is 1.
#' @seealso [emend_lvl_match()]
#' @export
emend_fct_sweep <- function(.f,
known = NULL,
wrong = NULL,
nlevels_max = length(unique(.f)) - length(wrong),
nlevels_min = length(unique(known)) + 1,
nlevels_top = round(nlevels_max * 0.25),
nlevels_bottom = 0,
n_min = 1L,
...) {
abort_if_not_chr(.f)
if(!is.null(known)) abort_if_not_chr(known)
abort_if_not_single_numeric(n_min)
abort_if_not_single_numeric(nlevels_max)
abort_if_not_single_numeric(nlevels_min)
if(is.null(known)) {
lvls_known <- character(0)
f <- .f
} else {
lvls_known <- unique(known)
nms_known <- names(known)
nms_known[nms_known == ""] <- known[nms_known == ""]
lvls_missing <- setdiff(unique(.f), nms_known)
dict_all <- c(nms_known, setNames(lvls_missing, lvls_missing))
# fix up all the known ones
f <- dict_all[.f]
}
tt <- table(f)
wrong <- unique(c(wrong, setdiff(names(tt)[tt < n_min], known)))
if(nlevels_top > 0) {
top <- setdiff(setdiff(names(tt[rank(-tt) <= nlevels_top]), wrong), known)
ntop <- max(nlevels_max - length(unique(known)), length(top), 0)
if(length(top)) top <- top[seq(ntop)]
known <- unique(c(known, top))
}
if(nlevels_bottom > 0) {
wrong <- unique(c(wrong, setdiff(names(tt[rank(tt) <= nlevels_bottom]), known)))
}
unknown <- setdiff(unique(f), c(wrong, known))
if(length(wrong)) {
dict <- emend_lvl_match(wrong, levels = c(unknown, known), "Only match if supremely confident using trusted sources.")
dict <- na.omit(dict)
} else {
dict <- NULL
unknown_set <- unknown
for(x in unknown) {
d <- emend_lvl_match(x, levels = c(known, setdiff(unknown_set, x)), "Only match if supremely confident using trusted sources.")
if(!is.na(d)) {
unknown_set <- setdiff(unknown_set, names(d))
dict <- c(dict, d)
}
}
}
lvl_unmatched <- setdiff(unique(f), names(dict))
dict_all <- c(dict, setNames(lvl_unmatched, lvl_unmatched))
known_updated <- c(known, setdiff(dict, known))
new_f <- dict_all[f]
nl <- length(unique(f))
if(nl <= nlevels_max & nl >= nlevels_min) return(factor(new_f, levels = unique(new_f)))
out <- emend_fct_sweep(unname(new_f), known = known_updated, nlevels_max = nlevels_max, nlevels_min = nlevels_min,
nlevels_top = nlevels_top, nlevels_bottom = nlevels_bottom, n_min = n_min)
setNames(out, .f)
}
#' Match the input factor to supplied levels
#'
#' @param .f A factor.
#' @param levels The levels of the factor.
#' @param ... Other prompts to the LLM.
#'
#' @seealso [emend_lvl_sweep()]
#' @export
emend_lvl_match <- function(.f, levels = NULL, ...) {
if(is.null(levels)) cli::cli_abort("Please provide the levels of the factor.")
lvls_unmatched <- setdiff(unique(.f), levels)
lvls_intersect <- intersect(unique(.f), levels)
matched <- lapply(lvls_unmatched, function(x) {
emend_assist(list(prompt_user("For '{x}' (which may be an acronym) return the best match from {levels*}.
Return 'NA' if no match, not confident or not sure.
"),
...),
format = "json")[[1]]
})
out <- unname(unlist(matched))
out[!out %in% levels] <- NA
dict <- setNames(c(out, lvls_intersect), c(lvls_unmatched, lvls_intersect))
structure(dict, class = c("emend_lvl_match", class(dict)))
}
#' @export
format.emend_lvl_match <- function(x, ...) {
out <- data.frame(original = names(x), converted = unname(unclass(x))) |>
subset(is.na(converted) | original != converted)
out <- out[order(out$converted), ]
rownames(out) <- NULL
out
}
#' @export
print.emend_lvl_match <- function(x, ...) {
print(unclass(x))
cli::cli_h1("Converted by emend:")
out <- format(x)
print(out, ...)
}
#' @rdname emend_lvl_match
#' @export
emend_fct_match <- function(.f, levels = NULL, ...) {
dict <- emend_lvl_match(.f, levels, ...)
factor(unname(unclass(dict)[.f]), levels = levels)
}
#' @rdname emend_fct_sweep
#' @export
emend_lvl_sweep <- function(.f,
known = NULL,
wrong = NULL,
nlevels_max = length(unique(.f)) - length(wrong),
nlevels_min = length(unique(known)) + 1,
nlevels_top = round(nlevels_max * 0.25),
nlevels_bottom = 0,
n_min = 1L,
...) {
dict <- emend_fct_sweep(.f,
known = known,
wrong = wrong,
nlevels_max = nlevels_max,
nlevels_min = nlevels_min,
nlevels_top = nlevels_top,
nlevels_bottom = nlevels_bottom,
n_min = n_min,
...)
res <- setNames(as.character(dict), .f)
structure(res[!duplicated(paste0(res, names(res)))], class = c("emend_lvl_match", class(res)))
}
translate.R
#' Translate text from one language to another
#'
#' @param text The text to translate.
#' @param from The language to translate from. The default is NULL (not specified).
#' @param to The language to translate to. The default is "English". The default
#' can be modified using `emend_set_language()`.
#'
#'
#' @family translate
#'
#' @examples
#' # example code
#' emend_translate(c("猿も木から落ちる", "你好", "bon appetit"))
#'
#' @export
emend_translate <- function(text, from = NULL, to = emend_get_option("language")) {
abort_if_not_text(text)
map_chr(text, function(x) {
emend_assist(list(prompt_user(x),
if(is.null(from)) prompt_user("Translate the above text to {to}. Just return the translated text.")
else prompt_user("Translate the above text from {from} to {to}. Just return the translated text.")))
})
}
#' Determine if the input is a particular language
#'
#' @param text The text.
#' @param language The language to check. The default is "English".
#' @examples
#' emend_is_language(c("猿も木から落ちる", "你好", "bon appetit"),
#' language = "Japanese")
#' @family translate
#' @export
emend_is_language <- function(text, language = emend_get_option("language")) {
abort_if_not_text(text)
# TODO: check what language and see if this is matched with above
map_lgl(text, function(x) {
emend_yes_no(list(prompt_user(x),
prompt_user("Is the above text in {language}?")))
})
}
#' Determine the language of the input text
#'
#' @param text The text.
#' @family translate
#' @examples
#' emend_what_language(c("猿も木から落ちる", "你好", "bon appetit"))
#' @export
emend_what_language <- function(text) {
abort_if_not_text(text)
map_chr(text, function(x) {
emend_assist(list(prompt_user(x),
prompt_user("What language is the above text in?")),
format = "json")[[1]]
})
}
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels