diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index e69de29bb2d1..9b5d5df495c7 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -0,0 +1,38 @@ +# DiscoveryBench with OpenHands + +[DiscoveryBench](https://github.com/allenai/discoverybench/) [(Paper)](https://arxiv.org/abs/2407.01725v1) contains 264 tasks collected across 6 diverse domains, such as biology, economics, and sociology. It incorporates discovery workflows from published papers to approximate the real-world challenges faced by researchers. + +

+ + DiscoveryBench Background + +

+ + +## Setup Environment and LLM Configuration + +1. Please follow instructions mentioned [here](https://github.com/openlocus/OpenHands/blob/discoverybench-openhands-integration/evaluation/README.md#setup) to set up the OpenHands development environment and LLMs locally + +2. Execute the bash script to start DiscoveryBench Evaluation + +``` +./evaluation/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] +``` +Replace `[YOUR MODEL CONFIG]` with any model that you have set up in `config.toml` + + +## Run Inference on DiscoveryBench Instances + +When the `run_infer.sh` script is started, it will automatically pull the latest DiscoveryBench instances & set up the agent environment. The OpenHands agent is invoked to process the task within this environment, producing a hypothesis. We then evaluate it against the “gold” hypothesis provided by DiscoveryBench. The evaluation result, along with the agent chat history, is logged to `output.jsonl` under `evaluation_outputs`. + + +``` +./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] +``` + +- `MODEL_CONFIG`: Name of the model you want to evaluate with +- `GIT_COMMIT`: This should be the git commit hash or release tag for OpenHands, e.g., HEAD or a specific tag like 0.6.2. +- `AGENT`: Use CodeActAgent; right now it is the only supported agent. +- `EVAL_LIMIT`: Number of samples to evaluate. +- `NUM_WORKERS`: Number of workers to parallelize the evaluation process. + diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/discoverybench/eval_utils/README.md new file mode 100644 index 000000000000..0a349139907a --- /dev/null +++ b/evaluation/discoverybench/eval_utils/README.md @@ -0,0 +1,7 @@ +## Evaluation Utils + +- **`eval_w_subhypo_gen.py`**: Implements the DiscoveryBench logic for evaluating agent-generated hypotheses. +- **`lm_utils.py`**: Provides utility functions necessary for the evaluation process. 
+- **`openai_helpers.py`**: Includes helper functions for OpenAI-related tasks. +- **`openai_semantic_gen_prompts.py`**: Contains prompts used for semantic generation. +- **`response_parser.py`**: Handles the parsing of agent-generated hypotheses. diff --git a/evaluation/discoverybench/eval_utils/__init__.py b/evaluation/discoverybench/eval_utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/discoverybench/eval_utils/arguments.py b/evaluation/discoverybench/eval_utils/arguments.py deleted file mode 100644 index abf45ff7f7a0..000000000000 --- a/evaluation/discoverybench/eval_utils/arguments.py +++ /dev/null @@ -1,81 +0,0 @@ -import argparse - - -class Arguments(argparse.ArgumentParser): - def __init__(self, groups=None): - super().__init__(conflict_handler='resolve') - # Common flags - self.add_argument('--out_dir', type=str, default='outputs') - self.add_argument( - '--debug', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument( - '--verbose', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument('--seed', type=int, default=17) - self.add_argument('--run_id', type=str) - - if not isinstance( - groups, list - ): # COMMENT: changed from type check to isinstance - groups = [groups] - - for group in groups: - if group == 'eval': - self.add_argument('--in_dir', type=str) - self.add_argument( - '--save', action=argparse.BooleanOptionalAction, default=True - ) - elif group == 'generate': - self.add_argument('--n_instances', type=int, default=None) - self.add_argument( - '--save', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument( - '--inject_semantics', - action=argparse.BooleanOptionalAction, - default=False, - ) - self.add_argument('--topics_fpath', type=str) - self.add_argument( - '--openai_topic_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--n_topics', type=int, default=50) - self.add_argument( - '--openai_interp_model', 
type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--max_generation_retries', type=int, default=3) - self.add_argument( - '--sample_unique_topics', - action=argparse.BooleanOptionalAction, - default=True, - ) - self.add_argument('--test_set_prop', type=float, default=0.4) - self.add_argument( - '--eval_gold', action=argparse.BooleanOptionalAction, default=True - ) - self.add_argument( - '--skip_on_error', - action=argparse.BooleanOptionalAction, - default=False, - ) - self.add_argument( - '--datasets', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument('--semantics_fpath', type=str) - self.add_argument('--datasets_fpath', type=str) - self.add_argument( - '--openai_semantics_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument( - '--openai_datasets_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument( - '--openai_query_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--n_rows', type=int, default=500) - self.add_argument('--semantic_depth', type=int, default=3) - self.add_argument('--leaf_prob', type=float, default=0.4) - self.add_argument( - '--benchmark', action=argparse.BooleanOptionalAction, default=False - ) diff --git a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py index bb8e2d08139a..a80df8279cfb 100644 --- a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py +++ b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py @@ -3,8 +3,8 @@ from openai import OpenAI -from evaluation.discoverybench.eval_utils.lm_utils import run_chatgpt_query_multi_turn -from evaluation.discoverybench.eval_utils.openai_helpers import get_response +from .lm_utils import run_chatgpt_query_multi_turn +from .openai_helpers import get_response logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', @@ -49,7 +49,7 @@ def get_score_from_answer(type, answer): } print(f'var_eval: {eval_rec}') return 
eval_rec - except Exception: # COMMENT: added "Exception" + except Exception: # COMMENT: added Exception return {'p': -1.0, 'r': -1.0, 'f1': -1.0} elif type == 'rel': print(answer) @@ -229,19 +229,17 @@ def get_sub_hypotheses( ): client = OpenAI() extraction_prompt = """\ - Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract the \ - set of sub-hypotheses that are present in the hypothesis such that each sub-hypothesis covers a separate context, is \ - self-sufficient, and operates on a coherent set of 3 dimensions: Context, Variables, and Relations. \ + Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract three dimensions that define the hypothesis: Context, Variables, and Relations. \ Here are the definitions for these dimensions: - - Contexts: Boundary conditions that limit the scope of a sub-hypothesis. E.g., “for men over \ + - Contexts: Boundary conditions that limit the scope of a hypothesis. E.g., “for men over \ the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then extract the context from the dataset_descrption. - Variables: Known concepts that interact in a meaningful way under a given context to \ - produce the sub-hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable. + produce the hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable. - Relations: Interactions between a given set of variables under a given context to produce \ - the sub-hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \ + the hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \ or "None" if there is no interacting relationship. Make sure to only use the information present in the hypothesis and the workflow. Do not add any new information. 
\ - If no sub-hypotheses can be extracted, return an empty list. + For each dimension, be specific, and do not omit any important details. Here is the metadata for the task: ```json @@ -257,11 +255,10 @@ def get_sub_hypotheses( { "sub_hypo": [ { - "text": the sub-hypothesis in natural language, - "context": a short text description of the context of the sub-hypothesis, - "variables": a list of columns involved in the sub-hypothesis, - "relations": a short text description of the relationship between the variables of the sub-hypothesis, - "explanation": a short text explanation for the breakdown of the sub-hypothesis + "text": the hypothesis in natural language, + "context": a short text description of the context of the hypothesis, + "variables": a list of columns involved in the hypothesis, + "relations": a short text description of the relationship between the variables of the hypothesis }, ... ] @@ -391,11 +388,11 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( use_column_metadata=True, ): # Input: Dataset Metadata, Query, Gold {Hg, Wg}, Predicted {Hp, Wp} - # Output: score + # Output: eval_rec json includes final_score # Procedure: # Dataset Metadata, Query, Gold {Hg, Wg}, Pred {Hg, Wg} - # Gold: [Hg1, Hg2] (pre-store) Hg1 is a NL form of subhypothesis + # Gold: [Hg1, Hg2] (compute on the fly) Hg1 is a NL form of subhypothesis # Predicted: [Hp1, Hp2] (compute on the fly) # Compute Intersection: [(Hg_i, Hp_j), …] # tuples of (gold,pred) that matched with context (do this w/o explicit extraction) @@ -409,6 +406,7 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( # r_v_list ← f1_v * score_r # accuracy_score = mean(r_v_list) # score = [ recall_context * mean over predicted context(context_score * var_score *rel_score )] + # recall_context = 1.0 # COMMENT: never used eval_rec = { 'query': query, @@ -487,10 +485,6 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( else: context_score = 0.0 - # question, answer, context_score = ask_dimension_question(query, gold_subh, 
gold_workflow, - # gen_subh, gen_workflow, dataset_meta, llm_used, - # dimension="context") - if context_score == 1.0: # match only when context_score = 1.0 gen_subh_to_gold_subh[p_id] = g_id gold_subh_covered.append(g_id) diff --git a/evaluation/discoverybench/eval_utils/helpers.py b/evaluation/discoverybench/eval_utils/helpers.py deleted file mode 100644 index 4c7afa42b9e7..000000000000 --- a/evaluation/discoverybench/eval_utils/helpers.py +++ /dev/null @@ -1,93 +0,0 @@ -import atexit -import json -import logging -import os -from itertools import chain, combinations - -import numpy as np -from sympy import preorder_traversal -from sympy.core.numbers import Float as SympyFloat - - -def setup_logger(run_id, log_dir='./logs'): - os.makedirs(log_dir, exist_ok=True) - log_fname = f'{log_dir}/{run_id}.log' - logger = logging.getLogger() # get root logger - file_handler = logging.FileHandler(log_fname, mode='a', delay=False) - file_handler.setFormatter( - logging.Formatter( - fmt='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - ) - ) - file_handler.setLevel(logging.INFO) - logger.addHandler( - file_handler - ) # all other loggers propagate to root; write to one log file from root - print(f'Log path: {log_fname}') - atexit.register(lambda: print(f'Log path: {log_fname}')) - - -def deep_get(obj, *keys, default): - default = default if default is not None else {} - rtn = obj - if not isinstance(rtn, dict): # COMMENT: changed from type(rtn) is not dict - return default - for k in keys: - rtn = rtn.get(k, default) - if not isinstance(rtn, dict): # COMMENT: changed from type(rtn) is not dict - return rtn - return rtn - - -def printj(obj, indent=2, logger=None): - fn = print if logger is None else logger - fn(json.dumps(obj, indent=indent)) - - -def extract_bracket_substrings(input_str): - substrings = [] - stack = [] - - for i, char in enumerate(input_str): - if char == '(': - stack.append(i) - elif char == ')': - if stack: - 
start_index = stack.pop() - substrings.append(input_str[start_index : i + 1]) - - return substrings - - -def extract_variable(input_str, var_prefix='x'): - split = input_str.split() - rtn = [] - for s in split: - _s = s.strip().strip('(').strip(')') - if _s.startswith(var_prefix): - rtn.append(_s) - return rtn - - -def round_sympy_expr(expr, precision=2): - new = expr - for a in preorder_traversal(expr): - if isinstance(a, SympyFloat): - new = new.subs(a, round(a, precision)) - return new - - -def powerset(iterable): - s = list(iterable) - return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1)) - - -def get_const_from_sympy(sym): - return [arg for arg in sym.args if arg not in sym.free_symbols][0] - - -def safe_exp(expr, exp, default=0.0): - if exp < 0: - return np.where(expr != 0, np.power(expr, exp), default) - return np.power(expr, exp) diff --git a/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py b/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py deleted file mode 100644 index 83cc44d7fbb7..000000000000 --- a/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py +++ /dev/null @@ -1,121 +0,0 @@ -PROMPT_QUERY = """\ -Given a dataset and a known true hypothesis that can be proven from it, construct a hard question \ -that tests someone's ability to find the true hypothesis using data analysis. \ -Make sure to not reveal the true hypothesis in the question. \ -Do not provide too many details. You may start your question in the following manner: \ -"What is the relationship between...", "Is there a relationship...", "How does...", "What might...". - -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "true_hypothesis": "%s" -} -``` - -Give your answer as a new JSON with the following format: -```json -{ - "question": "..." 
-} -```""" - -PROMPT_QUERY_VARIABLE = """\ -Given a dataset and a known true hypothesis that can be proven using that dataset, we want to construct questions to \ -test whether someone can find this true hypothesis given only the dataset. Generate a set of questions revealing \ -different amounts of information making sure to not reveal_in_question the true hypothesis. For each question, we will \ -provide an instruction of what information to hold back. You may start your question text in the following manner: \ -"What is the relationship between...", "Is there a relationship...", "How does...", "What might...". \ -Make sure that the question is not leading (i.e. it does not indicate what the true answer is). \ - -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "hypothesis": { - "text": "%s", - "target_col": "%s", - "target_col_derivation": "%s" - }, - "questions": [ - { - "reveal_in_question": [], - "hide_in_question": ["target concept", "concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept"], - "hide_in_question": ["concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept", "concepts that affect the target concept"], - "hide_in_question": ["specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept", "concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "hide_in_question": [], - "text": "..." - } - ] -}``` - -Give your answer as a new JSON with the following format: -```json -{ - "questions": [ - {"text": "..."}, - {"text": "..."}, - ... 
- ] -}```""" - - -PROMPT_QUERY_RELATIONSHIP = """\ -Given a dataset and a known true hypothesis that can be proven using that dataset, we want to construct questions to \ -test whether someone can find this true hypothesis given only the dataset. Generate a set of questions revealing \ -different amounts of information making sure to not reveal the true hypothesis. For each question, we will provide an \ -instruction of what information to hold back. You may start your question text in the following manner: "What is the \ -relationship between...", "Is there a relationship...", "How does...", "What might...". Make sure that the question is \ -not leading (i.e. it does not indicate what the true answer is). - -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "hypothesis": { - "text": "%s", - "target_col": "%s", - "target_col_derivation": "%s" - }, - "questions": [ - { - "reveal_in_question": [], - "hide_in_question": ["any information about the relationship between the interacting concepts"], - "text": "..." - }, - { - "reveal_in_question": ["nature of the relationship (e.g., positive/negative, increase/decrease, etc.)", "numerics of the relationship (e.g. quadratic relationship, change by x amount, etc.)"], - "hide_in_question": [], - "text": "..." - } - ] -}``` - -Give your answer as a new JSON with the following format: -```json -{ - "questions": [ - {"text": "..."}, - {"text": "..."}, - ... 
- ] -} -```""" diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/discoverybench/run_infer.py index 6685e7c56c26..c41a9c902664 100644 --- a/evaluation/discoverybench/run_infer.py +++ b/evaluation/discoverybench/run_infer.py @@ -37,6 +37,16 @@ DATA_FILES = {} +LIBRARIES = [ + 'pandas', + 'numpy', + 'scipy', + 'matplotlib', + 'seaborn', + 'scikit-learn', + 'statsmodels', +] + AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, } @@ -70,6 +80,22 @@ def get_config( def get_dv_query_for_real( datasets, question, domain_knowledge=None, workflow_tags=None ): + """ + Prepare a structured query for the agent to execute on the specified datasets. + + This function constructs a query by compiling metadata from the provided datasets, along with any relevant domain knowledge and workflow tags. + + Args: + datasets: List of datasets + question: Query to be answered + domain_knowledge: Domain knowledge if any + workflow_tags: Workflow tags if any + + Returns: + query_to_dv: Query to be run on the dataset + dataset_meta: Metadata of the dataset + """ + dataset_meta = '' for dataset_metadata in datasets: dataset_meta += 'Dataset name: ' + dataset_metadata['name'] @@ -129,6 +155,12 @@ def initialize_runtime(runtime: Runtime, csv_file: list[str]): '/workspace', ) + for lib in LIBRARIES: + action = CmdRunAction(command=f'pip install {lib}') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") @@ -193,6 +225,23 @@ def process_instance( metadata: EvalMetadata, reset_logger: bool = True, ) -> EvalOutput: + """ + Process and evaluate a single instance of the dataset. + + This function executes the OpenHands agent + for a specific instance of the dataset. It retrieves + the agent's results and evaluates them against the gold + hypothesis. 
+ + Args: + instance: A single row of the dataset + metadata: Metadata for the evaluation + reset_logger: Whether to reset the logger + + Returns: + output: EvalOutput object + """ + config = get_config(metadata) # use a session id for concurrent evaluation @@ -281,11 +330,18 @@ def process_instance( def create_dataset(repo_location: str, split: str = 'test'): - # walk through the repository for test split - # as soon as a metadata_{}.json file is found, load - # it and extract domain knowledge, workflow tags, queries, datasets, gold_hypothesis, - # and gold_workflow - # add all these to a pandas dataframe + """ + Create a dataset from the discoverybench repository + by walking through the repository and extracting metadata + from the metadata_{}.json files + + Args: + repo_location: Location of the repository + split: Split of the dataset to use + + Returns: + df: DataFrame containing the dataset instances + """ data_dict = {}