16 changes: 10 additions & 6 deletions community_tasks/_template.py
@@ -30,10 +30,13 @@
"""

import numpy as np
from aenum import extend_enum

from lighteval.metrics.metrics import SampleLevelMetric
from lighteval.metrics.metrics import Metrics, SampleLevelMetric
from lighteval.metrics.utils.metric_utils import MetricCategory, MetricUseCase
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.tasks.requests import Doc


# DEFINE YOUR PROMPT FUNCTIONS
@@ -46,7 +49,7 @@ def prompt_fn(line, task_name: str = None):
return Doc(
task_name=task_name,
query="",
choices=[""],
choices="",
gold_index=0,
instruction="",
)
@@ -65,7 +68,7 @@ def prompt_fn(line, task_name: str = None):
evaluation_splits=[],
few_shots_split="",
few_shots_select="",
metrics=[], # select your metric in Metrics
metric=[], # select your metric in Metrics
)

# EVALS WITH SUBSET
@@ -88,7 +91,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
hf_repo="",
metrics=[custom_metric], # select your metric in Metrics or use your custom_metric
metric=[custom_metric], # select your metric in Metrics or use your custom_metric
hf_avail_splits=[],
evaluation_splits=[],
few_shots_split="",
@@ -108,7 +111,8 @@ def __init__(
custom_metric = SampleLevelMetric(
metric_name="my_custom_metric_name",
higher_is_better=True,
category=SamplingMethod.GENERATIVE, # or LOGPROBS, PERPLEXITY, etc.
category=MetricCategory.IGNORED,
use_case=MetricUseCase.NONE,
sample_level_fn=lambda x: x, # how to compute score for one sample
corpus_level_fn=np.mean, # aggregation
)
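
For reference, a minimal end-to-end sketch of the template pattern this diff converges on: a custom metric declared through `MetricCategory`/`MetricUseCase` and attached to a task with the singular `metric=` keyword. The prompt function body, the column names (`question`, `choices`, `answer`), the task name, and the `hf_repo` value are illustrative placeholders filled in for the example, not values taken from the PR.

```python
import numpy as np

from lighteval.metrics.metrics import SampleLevelMetric
from lighteval.metrics.utils.metric_utils import MetricCategory, MetricUseCase
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def prompt_fn(line, task_name: str = None):
    # Map one dataset row to a Doc; the column names are hypothetical.
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=line["choices"],
        gold_index=line["answer"],
        instruction="",
    )


# Custom metric declared with the MetricCategory/MetricUseCase fields used in this PR.
custom_metric = SampleLevelMetric(
    metric_name="my_custom_metric_name",
    higher_is_better=True,
    category=MetricCategory.IGNORED,
    use_case=MetricUseCase.NONE,
    sample_level_fn=lambda x: x,  # score for one sample
    corpus_level_fn=np.mean,      # aggregation over samples
)

# Task configuration uses the singular `metric=` keyword that this PR switches to.
task = LightevalTaskConfig(
    name="mytask",
    prompt_function=prompt_fn,
    suite=["community"],
    hf_repo="org/dataset",  # placeholder repository
    hf_subset="default",
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
    few_shots_split=None,
    few_shots_select=None,
    metric=[custom_metric],
)
```

The remaining files below apply the same `metrics=` to `metric=` rename to their existing task configurations.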
48 changes: 25 additions & 23 deletions community_tasks/arabic_evals.py
@@ -32,10 +32,11 @@
from typing import Any, Dict, List, Optional, Union

from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.metrics.metrics import Metric, Metrics
from lighteval.metrics.metrics import Metric, MetricCategory, Metrics
from lighteval.metrics.utils.metric_utils import MetricUseCase
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.tasks.requests import Doc


# fmt: off
@@ -103,7 +104,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=arabic_mmlu_pfn,
hf_repo="MBZUAI/ArabicMMLU",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=["dev"],
@@ -165,7 +166,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=arabic_mmlu_ht_pfn,
hf_repo="MBZUAI/human_translated_arabic_mmlu",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=None,
@@ -230,7 +231,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=arabic_mmlu_mt_pfn,
hf_repo="OALL/Arabic_MMLU",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "dev"],
evaluation_splits=["test"],
few_shots_split="dev",
@@ -286,7 +287,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=acva_pfn,
hf_repo="OALL/ACVA",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
@@ -343,7 +344,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=aratrust_pfn,
hf_repo="asas-ai/AraTrust-categorized",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["train"],
evaluation_splits=["train"],
few_shots_split=None,
@@ -392,7 +393,7 @@ def arabic_exams_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -443,7 +444,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=alghafa_pfn,
hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
@@ -470,7 +471,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -487,7 +488,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -504,7 +505,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -521,7 +522,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -538,7 +539,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -555,7 +556,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -593,7 +594,7 @@ def boolq_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -628,7 +629,7 @@ def copa_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -672,7 +673,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -709,7 +710,7 @@ def toxigen_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -760,7 +761,7 @@ def sciq_arabic_pfn(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -818,7 +819,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=madinah_qa_pfn,
hf_repo="MBZUAI/MadinahQA",
metrics=[Metrics.loglikelihood_acc_norm],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=["dev"],
@@ -848,10 +849,11 @@ def __init__(self, judge: JudgeLM):
"""
self.judge = judge
self.metric_name = "llm_as_judge"
self.category = SamplingMethod.GENERATIVE
self.category = MetricCategory.LLM_AS_JUDGE
self.corpus_level_fn = self.aggregate_scores
self.sample_level_fn = self._sample_level_fn
self.higher_is_better = True # Fixed tuple syntax
self.use_case = MetricUseCase.NONE

def compute(self, responses: list[str], formatted_docs: list[Doc], **kwargs) -> dict[str, float]:
"""
@@ -1037,7 +1039,7 @@ def process_judge_response(response) -> float:
hf_subset=None,
hf_avail_splits=["train"],
evaluation_splits=["train"],
metrics=[wrapped_judge],
metric=[wrapped_judge],
trust_dataset=True,
generation_size=200,
stop_sequence=[],
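
The judge-wrapper hunk above shows only the attribute assignments, so here is a hedged sketch of how such a wrapper can look as a complete class under the `MetricCategory`/`MetricUseCase` API. The class name, the base-class contract, the `compute` body, and the helper methods are assumptions added for illustration; only the attributes set in `__init__` come from the diff.

```python
import numpy as np

from lighteval.metrics.metrics import Metric, MetricCategory
from lighteval.metrics.utils.metric_utils import MetricUseCase
from lighteval.tasks.requests import Doc


class JudgeMetricWrapper(Metric):  # class name assumed for illustration
    """Adapts an LLM judge so it can be attached to a task via `metric=[...]`."""

    def __init__(self, judge):
        self.judge = judge
        self.metric_name = "llm_as_judge"
        self.category = MetricCategory.LLM_AS_JUDGE  # replaces SamplingMethod.GENERATIVE
        self.use_case = MetricUseCase.NONE
        self.sample_level_fn = self._sample_level_fn
        self.corpus_level_fn = self.aggregate_scores
        self.higher_is_better = True

    def compute(self, responses: list[str], formatted_docs: list[Doc], **kwargs) -> dict[str, float]:
        # Placeholder: a real implementation would call self.judge to score each
        # response against its formatted Doc and return a per-sample score dict.
        return {self.metric_name: 0.0}

    def _sample_level_fn(self, *args, **kwargs):
        return self.compute(*args, **kwargs)

    def aggregate_scores(self, scores: list[float]) -> float:
        return float(np.mean(scores))
```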
17 changes: 13 additions & 4 deletions community_tasks/french_evals.py
@@ -32,7 +32,16 @@

import random

from lighteval.metrics.metrics import Metrics
import numpy as np
from aenum import extend_enum

import lighteval.tasks.extended.ifeval.instructions_registry as instructions_registry
from lighteval.metrics.metrics import Metrics, SampleLevelMetric
from lighteval.metrics.utils.metric_utils import (
MetricCategory,
MetricUseCase,
SampleLevelMetricGrouping,
)
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.extended.ifeval.main import ifeval_metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
@@ -97,7 +106,7 @@ def prompt_bac_fr(line, task_name: str = None):
suite=["community"],
hf_repo="fr-gouv-coordination-ia/IFEval-fr",
hf_subset="default",
metrics=[ifeval_metrics],
metric=[ifeval_metrics],
hf_avail_splits=["train"],
evaluation_splits=["train"],
few_shots_split="train",
@@ -119,7 +128,7 @@ def prompt_bac_fr(line, task_name: str = None):
few_shots_split=None,
few_shots_select="random_sampling",
generation_size=1,
metrics=[Metrics.loglikelihood_acc],
metric=[Metrics.loglikelihood_acc],
stop_sequence=["\n"],
trust_dataset=True,
version=0,
@@ -137,7 +146,7 @@ def prompt_bac_fr(line, task_name: str = None):
few_shots_split=None,
few_shots_select="random_sampling",
generation_size=1,
metrics=[Metrics.quasi_exact_match_math, Metrics.exact_match],
metric=[Metrics.quasi_exact_match_math, Metrics.exact_match],
stop_sequence=["\n"],
trust_dataset=True,
version=0,
2 changes: 1 addition & 1 deletion community_tasks/german_rag_evals.py
@@ -186,7 +186,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):

# Task 3: Question-answer match.
# Given is a question and an answer.
# The task is to decide whether the answer actually answers the question.
# The task is to decide whether the answer actualy answers the question.
task3 = LightevalTaskConfig(
name="german_rag_eval:question_answer_match",
prompt_function=prompt_fn_question_answer_match,