diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md
index e69de29bb2d1..9b5d5df495c7 100644
--- a/evaluation/discoverybench/README.md
+++ b/evaluation/discoverybench/README.md
@@ -0,0 +1,38 @@
+# DiscoveryBench with OpenHands
+
+[DiscoveryBench](https://github.com/allenai/discoverybench/) [(Paper)](https://arxiv.org/abs/2407.01725v1) contains 264 tasks collected across 6 diverse domains, such as biology, economics, and sociology. It incorporates discovery workflows from published papers to approximate the real-world challenges faced by researchers.
+
+
+
+
+
+
+
+
+## Setup Environment and LLM Configuration
+
+1. Please follow instructions mentioned [here](https://github.com/openlocus/OpenHands/blob/discoverybench-openhands-integration/evaluation/README.md#setup) to set up the OpenHands development environment and LLMs locally
+
+2. Execute the bash script to start DiscoveryBench Evaluation
+
+```
+./evaluation/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG]
+```
+Replace `[YOUR MODEL CONFIG]` with any model that you have set up in `config.toml`
+
+
+## Run Inference on DiscoveryBench Instances
+
+When the `run_infer.sh` script is started, it will automatically pull the latest DiscoveryBench instances & set up the agent environment. The OpenHands agent is invoked to process the task within this environment, producing a hypothesis. We then evaluate it against the “gold” hypothesis provided by DiscoveryBench. The evaluation result, along with the agent chat history, is logged to `output.jsonl` under `evaluation_outputs`.
+
+
+```
+./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS]
+```
+
+- `MODEL_CONFIG`: Name of the model you want to evaluate with
+- `GIT_COMMIT`: This should be the git commit hash or release tag for OpenHands, e.g., HEAD or a specific tag like 0.6.2.
+- `AGENT`: Use CodeActAgent; it is currently the only supported agent.
+- `EVAL_LIMIT`: Number of samples to evaluate.
+- `NUM_WORKERS`: Number of workers to parallelize the evaluation process.
+
diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/discoverybench/eval_utils/README.md
new file mode 100644
index 000000000000..0a349139907a
--- /dev/null
+++ b/evaluation/discoverybench/eval_utils/README.md
@@ -0,0 +1,7 @@
+## Evaluation Utils
+
+- **`eval_w_subhypo_gen.py`**: Implements the DiscoveryBench logic for evaluating agent-generated hypotheses.
+- **`lm_utils.py`**: Provides utility functions necessary for the evaluation process.
+- **`openai_helpers.py`**: Includes helper functions for OpenAI-related tasks.
+- **`openai_semantic_gen_prompts.py`**: Contains prompts used for semantic generation.
+- **`response_parser.py`**: Handles the parsing of agent-generated hypotheses.
diff --git a/evaluation/discoverybench/eval_utils/__init__.py b/evaluation/discoverybench/eval_utils/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/evaluation/discoverybench/eval_utils/arguments.py b/evaluation/discoverybench/eval_utils/arguments.py
deleted file mode 100644
index abf45ff7f7a0..000000000000
--- a/evaluation/discoverybench/eval_utils/arguments.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import argparse
-
-
-class Arguments(argparse.ArgumentParser):
- def __init__(self, groups=None):
- super().__init__(conflict_handler='resolve')
- # Common flags
- self.add_argument('--out_dir', type=str, default='outputs')
- self.add_argument(
- '--debug', action=argparse.BooleanOptionalAction, default=False
- )
- self.add_argument(
- '--verbose', action=argparse.BooleanOptionalAction, default=False
- )
- self.add_argument('--seed', type=int, default=17)
- self.add_argument('--run_id', type=str)
-
- if not isinstance(
- groups, list
- ): # COMMENT: changed from type check to isinstance
- groups = [groups]
-
- for group in groups:
- if group == 'eval':
- self.add_argument('--in_dir', type=str)
- self.add_argument(
- '--save', action=argparse.BooleanOptionalAction, default=True
- )
- elif group == 'generate':
- self.add_argument('--n_instances', type=int, default=None)
- self.add_argument(
- '--save', action=argparse.BooleanOptionalAction, default=False
- )
- self.add_argument(
- '--inject_semantics',
- action=argparse.BooleanOptionalAction,
- default=False,
- )
- self.add_argument('--topics_fpath', type=str)
- self.add_argument(
- '--openai_topic_model', type=str, default='gpt-3.5-turbo'
- )
- self.add_argument('--n_topics', type=int, default=50)
- self.add_argument(
- '--openai_interp_model', type=str, default='gpt-3.5-turbo'
- )
- self.add_argument('--max_generation_retries', type=int, default=3)
- self.add_argument(
- '--sample_unique_topics',
- action=argparse.BooleanOptionalAction,
- default=True,
- )
- self.add_argument('--test_set_prop', type=float, default=0.4)
- self.add_argument(
- '--eval_gold', action=argparse.BooleanOptionalAction, default=True
- )
- self.add_argument(
- '--skip_on_error',
- action=argparse.BooleanOptionalAction,
- default=False,
- )
- self.add_argument(
- '--datasets', action=argparse.BooleanOptionalAction, default=False
- )
- self.add_argument('--semantics_fpath', type=str)
- self.add_argument('--datasets_fpath', type=str)
- self.add_argument(
- '--openai_semantics_model', type=str, default='gpt-3.5-turbo'
- )
- self.add_argument(
- '--openai_datasets_model', type=str, default='gpt-3.5-turbo'
- )
- self.add_argument(
- '--openai_query_model', type=str, default='gpt-3.5-turbo'
- )
- self.add_argument('--n_rows', type=int, default=500)
- self.add_argument('--semantic_depth', type=int, default=3)
- self.add_argument('--leaf_prob', type=float, default=0.4)
- self.add_argument(
- '--benchmark', action=argparse.BooleanOptionalAction, default=False
- )
diff --git a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py
index bb8e2d08139a..a80df8279cfb 100644
--- a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py
+++ b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py
@@ -3,8 +3,8 @@
from openai import OpenAI
-from evaluation.discoverybench.eval_utils.lm_utils import run_chatgpt_query_multi_turn
-from evaluation.discoverybench.eval_utils.openai_helpers import get_response
+from .lm_utils import run_chatgpt_query_multi_turn
+from .openai_helpers import get_response
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
@@ -49,7 +49,7 @@ def get_score_from_answer(type, answer):
}
print(f'var_eval: {eval_rec}')
return eval_rec
- except Exception: # COMMENT: added "Exception"
+ except Exception: # COMMENT: added Exception
return {'p': -1.0, 'r': -1.0, 'f1': -1.0}
elif type == 'rel':
print(answer)
@@ -229,19 +229,17 @@ def get_sub_hypotheses(
):
client = OpenAI()
extraction_prompt = """\
- Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract the \
- set of sub-hypotheses that are present in the hypothesis such that each sub-hypothesis covers a separate context, is \
- self-sufficient, and operates on a coherent set of 3 dimensions: Context, Variables, and Relations. \
+ Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract three dimensions that define the hypothesis: Context, Variables, and Relations. \
Here are the definitions for these dimensions:
- - Contexts: Boundary conditions that limit the scope of a sub-hypothesis. E.g., “for men over \
+ - Contexts: Boundary conditions that limit the scope of a hypothesis. E.g., “for men over \
the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then extract the context from the dataset_descrption.
- Variables: Known concepts that interact in a meaningful way under a given context to \
- produce the sub-hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable.
+ produce the hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable.
- Relations: Interactions between a given set of variables under a given context to produce \
- the sub-hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \
+ the hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \
or "None" if there is no interacting relationship.
Make sure to only use the information present in the hypothesis and the workflow. Do not add any new information. \
- If no sub-hypotheses can be extracted, return an empty list.
+ For each dimension, be specific, and do not omit any important details.
Here is the metadata for the task:
```json
@@ -257,11 +255,10 @@ def get_sub_hypotheses(
{
"sub_hypo": [
{
- "text": the sub-hypothesis in natural language,
- "context": a short text description of the context of the sub-hypothesis,
- "variables": a list of columns involved in the sub-hypothesis,
- "relations": a short text description of the relationship between the variables of the sub-hypothesis,
- "explanation": a short text explanation for the breakdown of the sub-hypothesis
+ "text": the hypothesis in natural language,
+ "context": a short text description of the context of the hypothesis,
+ "variables": a list of columns involved in the hypothesis,
+ "relations": a short text description of the relationship between the variables of the hypothesis
},
...
]
@@ -391,11 +388,11 @@ def run_eval_gold_vs_gen_NL_hypo_workflow(
use_column_metadata=True,
):
# Input: Dataset Metadata, Query, Gold {Hg, Wg}, Predicted {Hp, Wp}
- # Output: score
+ # Output: eval_rec json includes final_score
# Procedure:
# Dataset Metadata, Query, Gold {Hg, Wg}, Pred {Hg, Wg}
- # Gold: [Hg1, Hg2] (pre-store) Hg1 is a NL form of subhypothesis
+ # Gold: [Hg1, Hg2] (compute on the fly) Hg1 is a NL form of subhypothesis
# Predicted: [Hp1, Hp2] (compute on the fly)
# Compute Intersection: [(Hg_i, Hp_j), …] # tuples of (gold,pred) that matched with context (do this w/o explicit extraction)
@@ -409,6 +406,7 @@ def run_eval_gold_vs_gen_NL_hypo_workflow(
# r_v_list ← f1_v * score_r
# accuracy_score = mean(r_v_list)
# score = [ recall_context * mean over predicted context(context_score * var_score *rel_score )]
+
# recall_context = 1.0 # COMMENT: never used
eval_rec = {
'query': query,
@@ -487,10 +485,6 @@ def run_eval_gold_vs_gen_NL_hypo_workflow(
else:
context_score = 0.0
- # question, answer, context_score = ask_dimension_question(query, gold_subh, gold_workflow,
- # gen_subh, gen_workflow, dataset_meta, llm_used,
- # dimension="context")
-
if context_score == 1.0: # match only when context_score = 1.0
gen_subh_to_gold_subh[p_id] = g_id
gold_subh_covered.append(g_id)
diff --git a/evaluation/discoverybench/eval_utils/helpers.py b/evaluation/discoverybench/eval_utils/helpers.py
deleted file mode 100644
index 4c7afa42b9e7..000000000000
--- a/evaluation/discoverybench/eval_utils/helpers.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import atexit
-import json
-import logging
-import os
-from itertools import chain, combinations
-
-import numpy as np
-from sympy import preorder_traversal
-from sympy.core.numbers import Float as SympyFloat
-
-
-def setup_logger(run_id, log_dir='./logs'):
- os.makedirs(log_dir, exist_ok=True)
- log_fname = f'{log_dir}/{run_id}.log'
- logger = logging.getLogger() # get root logger
- file_handler = logging.FileHandler(log_fname, mode='a', delay=False)
- file_handler.setFormatter(
- logging.Formatter(
- fmt='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
- datefmt='%m/%d/%Y %H:%M:%S',
- )
- )
- file_handler.setLevel(logging.INFO)
- logger.addHandler(
- file_handler
- ) # all other loggers propagate to root; write to one log file from root
- print(f'Log path: {log_fname}')
- atexit.register(lambda: print(f'Log path: {log_fname}'))
-
-
-def deep_get(obj, *keys, default):
- default = default if default is not None else {}
- rtn = obj
- if not isinstance(rtn, dict): # COMMENT: changed from type(rtn) is not dict
- return default
- for k in keys:
- rtn = rtn.get(k, default)
- if not isinstance(rtn, dict): # COMMENT: changed from type(rtn) is not dict
- return rtn
- return rtn
-
-
-def printj(obj, indent=2, logger=None):
- fn = print if logger is None else logger
- fn(json.dumps(obj, indent=indent))
-
-
-def extract_bracket_substrings(input_str):
- substrings = []
- stack = []
-
- for i, char in enumerate(input_str):
- if char == '(':
- stack.append(i)
- elif char == ')':
- if stack:
- start_index = stack.pop()
- substrings.append(input_str[start_index : i + 1])
-
- return substrings
-
-
-def extract_variable(input_str, var_prefix='x'):
- split = input_str.split()
- rtn = []
- for s in split:
- _s = s.strip().strip('(').strip(')')
- if _s.startswith(var_prefix):
- rtn.append(_s)
- return rtn
-
-
-def round_sympy_expr(expr, precision=2):
- new = expr
- for a in preorder_traversal(expr):
- if isinstance(a, SympyFloat):
- new = new.subs(a, round(a, precision))
- return new
-
-
-def powerset(iterable):
- s = list(iterable)
- return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))
-
-
-def get_const_from_sympy(sym):
- return [arg for arg in sym.args if arg not in sym.free_symbols][0]
-
-
-def safe_exp(expr, exp, default=0.0):
- if exp < 0:
- return np.where(expr != 0, np.power(expr, exp), default)
- return np.power(expr, exp)
diff --git a/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py b/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py
deleted file mode 100644
index 83cc44d7fbb7..000000000000
--- a/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py
+++ /dev/null
@@ -1,121 +0,0 @@
-PROMPT_QUERY = """\
-Given a dataset and a known true hypothesis that can be proven from it, construct a hard question \
-that tests someone's ability to find the true hypothesis using data analysis. \
-Make sure to not reveal the true hypothesis in the question. \
-Do not provide too many details. You may start your question in the following manner: \
-"What is the relationship between...", "Is there a relationship...", "How does...", "What might...".
-
-Dataset and hypothesis:
-```json
-{
- "domain": "%s",
- "description": "%s",
- "columns": %s,
- "true_hypothesis": "%s"
-}
-```
-
-Give your answer as a new JSON with the following format:
-```json
-{
- "question": "..."
-}
-```"""
-
-PROMPT_QUERY_VARIABLE = """\
-Given a dataset and a known true hypothesis that can be proven using that dataset, we want to construct questions to \
-test whether someone can find this true hypothesis given only the dataset. Generate a set of questions revealing \
-different amounts of information making sure to not reveal_in_question the true hypothesis. For each question, we will \
-provide an instruction of what information to hold back. You may start your question text in the following manner: \
-"What is the relationship between...", "Is there a relationship...", "How does...", "What might...". \
-Make sure that the question is not leading (i.e. it does not indicate what the true answer is). \
-
-Dataset and hypothesis:
-```json
-{
- "domain": "%s",
- "description": "%s",
- "columns": %s,
- "hypothesis": {
- "text": "%s",
- "target_col": "%s",
- "target_col_derivation": "%s"
- },
- "questions": [
- {
- "reveal_in_question": [],
- "hide_in_question": ["target concept", "concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"],
- "text": "..."
- },
- {
- "reveal_in_question": ["target concept"],
- "hide_in_question": ["concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"],
- "text": "..."
- },
- {
- "reveal_in_question": ["target concept", "concepts that affect the target concept"],
- "hide_in_question": ["specific sub-group(s), if any, the relationship is applicable to"],
- "text": "..."
- },
- {
- "reveal_in_question": ["target concept", "concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"],
- "hide_in_question": [],
- "text": "..."
- }
- ]
-}```
-
-Give your answer as a new JSON with the following format:
-```json
-{
- "questions": [
- {"text": "..."},
- {"text": "..."},
- ...
- ]
-}```"""
-
-
-PROMPT_QUERY_RELATIONSHIP = """\
-Given a dataset and a known true hypothesis that can be proven using that dataset, we want to construct questions to \
-test whether someone can find this true hypothesis given only the dataset. Generate a set of questions revealing \
-different amounts of information making sure to not reveal the true hypothesis. For each question, we will provide an \
-instruction of what information to hold back. You may start your question text in the following manner: "What is the \
-relationship between...", "Is there a relationship...", "How does...", "What might...". Make sure that the question is \
-not leading (i.e. it does not indicate what the true answer is).
-
-Dataset and hypothesis:
-```json
-{
- "domain": "%s",
- "description": "%s",
- "columns": %s,
- "hypothesis": {
- "text": "%s",
- "target_col": "%s",
- "target_col_derivation": "%s"
- },
- "questions": [
- {
- "reveal_in_question": [],
- "hide_in_question": ["any information about the relationship between the interacting concepts"],
- "text": "..."
- },
- {
- "reveal_in_question": ["nature of the relationship (e.g., positive/negative, increase/decrease, etc.)", "numerics of the relationship (e.g. quadratic relationship, change by x amount, etc.)"],
- "hide_in_question": [],
- "text": "..."
- }
- ]
-}```
-
-Give your answer as a new JSON with the following format:
-```json
-{
- "questions": [
- {"text": "..."},
- {"text": "..."},
- ...
- ]
-}
-```"""
diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/discoverybench/run_infer.py
index 6685e7c56c26..c41a9c902664 100644
--- a/evaluation/discoverybench/run_infer.py
+++ b/evaluation/discoverybench/run_infer.py
@@ -37,6 +37,16 @@
DATA_FILES = {}
+LIBRARIES = [
+ 'pandas',
+ 'numpy',
+ 'scipy',
+ 'matplotlib',
+ 'seaborn',
+ 'scikit-learn',
+ 'statsmodels',
+]
+
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
}
@@ -70,6 +80,22 @@ def get_config(
def get_dv_query_for_real(
datasets, question, domain_knowledge=None, workflow_tags=None
):
+ """
+ Prepare a structured query for the agent to execute on the specified datasets.
+
+ This function constructs a query by compiling metadata from the provided datasets, along with any relevant domain knowledge and workflow tags.
+
+ Args:
+ datasets: List of datasets
+ question: Query to be answered
+ domain_knowledge: Domain knowledge if any
+ workflow_tags: Workflow tags if any
+
+ Returns:
+ query_to_dv: Query to be run on the dataset
+ dataset_meta: Metadata of the dataset
+ """
+
dataset_meta = ''
for dataset_metadata in datasets:
dataset_meta += 'Dataset name: ' + dataset_metadata['name']
@@ -129,6 +155,12 @@ def initialize_runtime(runtime: Runtime, csv_file: list[str]):
'/workspace',
)
+ for lib in LIBRARIES:
+ action = CmdRunAction(command=f'pip install {lib}')
+ logger.info(action, extra={'msg_type': 'ACTION'})
+ obs = runtime.run_action(action)
+ assert obs.exit_code == 0
+
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
@@ -193,6 +225,23 @@ def process_instance(
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
+ """
+ Process and evaluate a single instance of the dataset.
+
+ This function executes the OpenHands agent
+ for a specific instance of the dataset. It retrieves
+ the agent's results and evaluates them against the gold
+ hypothesis.
+
+ Args:
+ instance: A single row of the dataset
+ metadata: Metadata for the evaluation
+ reset_logger: Whether to reset the logger
+
+ Returns:
+ output: EvalOutput object
+ """
+
config = get_config(metadata)
# use a session id for concurrent evaluation
@@ -281,11 +330,18 @@ def process_instance(
def create_dataset(repo_location: str, split: str = 'test'):
- # walk through the repository for test split
- # as soon as a metadata_{}.json file is found, load
- # it and extract domain knowledge, workflow tags, queries, datasets, gold_hypothesis,
- # and gold_workflow
- # add all these to a pandas dataframe
+ """
+ Create a dataset from the discoverybench repository
+ by walking through the repository and extracting metadata
+ from the metadata_{}.json files
+
+ Args:
+ repo_location: Location of the repository
+ split: Split of the dataset to use
+
+ Returns:
+ df: DataFrame containing the dataset instances
+ """
data_dict = {}