From 54949ea5800f8d676a886ccc35e65bd241426474 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Fri, 18 Oct 2024 14:28:07 +0530 Subject: [PATCH 01/15] chore: remove useless modules --- .../discoverybench/eval_utils/arguments.py | 81 ------------ .../discoverybench/eval_utils/helpers.py | 93 -------------- .../eval_utils/openai_query_gen_prompts.py | 121 ------------------ 3 files changed, 295 deletions(-) delete mode 100644 evaluation/discoverybench/eval_utils/arguments.py delete mode 100644 evaluation/discoverybench/eval_utils/helpers.py delete mode 100644 evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py diff --git a/evaluation/discoverybench/eval_utils/arguments.py b/evaluation/discoverybench/eval_utils/arguments.py deleted file mode 100644 index abf45ff7f7a0..000000000000 --- a/evaluation/discoverybench/eval_utils/arguments.py +++ /dev/null @@ -1,81 +0,0 @@ -import argparse - - -class Arguments(argparse.ArgumentParser): - def __init__(self, groups=None): - super().__init__(conflict_handler='resolve') - # Common flags - self.add_argument('--out_dir', type=str, default='outputs') - self.add_argument( - '--debug', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument( - '--verbose', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument('--seed', type=int, default=17) - self.add_argument('--run_id', type=str) - - if not isinstance( - groups, list - ): # COMMENT: changed from type check to isinstance - groups = [groups] - - for group in groups: - if group == 'eval': - self.add_argument('--in_dir', type=str) - self.add_argument( - '--save', action=argparse.BooleanOptionalAction, default=True - ) - elif group == 'generate': - self.add_argument('--n_instances', type=int, default=None) - self.add_argument( - '--save', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument( - '--inject_semantics', - action=argparse.BooleanOptionalAction, - default=False, - ) - 
self.add_argument('--topics_fpath', type=str) - self.add_argument( - '--openai_topic_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--n_topics', type=int, default=50) - self.add_argument( - '--openai_interp_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--max_generation_retries', type=int, default=3) - self.add_argument( - '--sample_unique_topics', - action=argparse.BooleanOptionalAction, - default=True, - ) - self.add_argument('--test_set_prop', type=float, default=0.4) - self.add_argument( - '--eval_gold', action=argparse.BooleanOptionalAction, default=True - ) - self.add_argument( - '--skip_on_error', - action=argparse.BooleanOptionalAction, - default=False, - ) - self.add_argument( - '--datasets', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument('--semantics_fpath', type=str) - self.add_argument('--datasets_fpath', type=str) - self.add_argument( - '--openai_semantics_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument( - '--openai_datasets_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument( - '--openai_query_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--n_rows', type=int, default=500) - self.add_argument('--semantic_depth', type=int, default=3) - self.add_argument('--leaf_prob', type=float, default=0.4) - self.add_argument( - '--benchmark', action=argparse.BooleanOptionalAction, default=False - ) diff --git a/evaluation/discoverybench/eval_utils/helpers.py b/evaluation/discoverybench/eval_utils/helpers.py deleted file mode 100644 index 4c7afa42b9e7..000000000000 --- a/evaluation/discoverybench/eval_utils/helpers.py +++ /dev/null @@ -1,93 +0,0 @@ -import atexit -import json -import logging -import os -from itertools import chain, combinations - -import numpy as np -from sympy import preorder_traversal -from sympy.core.numbers import Float as SympyFloat - - -def setup_logger(run_id, log_dir='./logs'): - os.makedirs(log_dir, exist_ok=True) - 
log_fname = f'{log_dir}/{run_id}.log' - logger = logging.getLogger() # get root logger - file_handler = logging.FileHandler(log_fname, mode='a', delay=False) - file_handler.setFormatter( - logging.Formatter( - fmt='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - ) - ) - file_handler.setLevel(logging.INFO) - logger.addHandler( - file_handler - ) # all other loggers propagate to root; write to one log file from root - print(f'Log path: {log_fname}') - atexit.register(lambda: print(f'Log path: {log_fname}')) - - -def deep_get(obj, *keys, default): - default = default if default is not None else {} - rtn = obj - if not isinstance(rtn, dict): # COMMENT: changed from type(rtn) is not dict - return default - for k in keys: - rtn = rtn.get(k, default) - if not isinstance(rtn, dict): # COMMENT: changed from type(rtn) is not dict - return rtn - return rtn - - -def printj(obj, indent=2, logger=None): - fn = print if logger is None else logger - fn(json.dumps(obj, indent=indent)) - - -def extract_bracket_substrings(input_str): - substrings = [] - stack = [] - - for i, char in enumerate(input_str): - if char == '(': - stack.append(i) - elif char == ')': - if stack: - start_index = stack.pop() - substrings.append(input_str[start_index : i + 1]) - - return substrings - - -def extract_variable(input_str, var_prefix='x'): - split = input_str.split() - rtn = [] - for s in split: - _s = s.strip().strip('(').strip(')') - if _s.startswith(var_prefix): - rtn.append(_s) - return rtn - - -def round_sympy_expr(expr, precision=2): - new = expr - for a in preorder_traversal(expr): - if isinstance(a, SympyFloat): - new = new.subs(a, round(a, precision)) - return new - - -def powerset(iterable): - s = list(iterable) - return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1)) - - -def get_const_from_sympy(sym): - return [arg for arg in sym.args if arg not in sym.free_symbols][0] - - -def safe_exp(expr, exp, default=0.0): - if exp < 0: - 
return np.where(expr != 0, np.power(expr, exp), default) - return np.power(expr, exp) diff --git a/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py b/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py deleted file mode 100644 index 83cc44d7fbb7..000000000000 --- a/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py +++ /dev/null @@ -1,121 +0,0 @@ -PROMPT_QUERY = """\ -Given a dataset and a known true hypothesis that can be proven from it, construct a hard question \ -that tests someone's ability to find the true hypothesis using data analysis. \ -Make sure to not reveal the true hypothesis in the question. \ -Do not provide too many details. You may start your question in the following manner: \ -"What is the relationship between...", "Is there a relationship...", "How does...", "What might...". - -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "true_hypothesis": "%s" -} -``` - -Give your answer as a new JSON with the following format: -```json -{ - "question": "..." -} -```""" - -PROMPT_QUERY_VARIABLE = """\ -Given a dataset and a known true hypothesis that can be proven using that dataset, we want to construct questions to \ -test whether someone can find this true hypothesis given only the dataset. Generate a set of questions revealing \ -different amounts of information making sure to not reveal_in_question the true hypothesis. For each question, we will \ -provide an instruction of what information to hold back. You may start your question text in the following manner: \ -"What is the relationship between...", "Is there a relationship...", "How does...", "What might...". \ -Make sure that the question is not leading (i.e. it does not indicate what the true answer is). 
\ - -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "hypothesis": { - "text": "%s", - "target_col": "%s", - "target_col_derivation": "%s" - }, - "questions": [ - { - "reveal_in_question": [], - "hide_in_question": ["target concept", "concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept"], - "hide_in_question": ["concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept", "concepts that affect the target concept"], - "hide_in_question": ["specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept", "concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "hide_in_question": [], - "text": "..." - } - ] -}``` - -Give your answer as a new JSON with the following format: -```json -{ - "questions": [ - {"text": "..."}, - {"text": "..."}, - ... - ] -}```""" - - -PROMPT_QUERY_RELATIONSHIP = """\ -Given a dataset and a known true hypothesis that can be proven using that dataset, we want to construct questions to \ -test whether someone can find this true hypothesis given only the dataset. Generate a set of questions revealing \ -different amounts of information making sure to not reveal the true hypothesis. For each question, we will provide an \ -instruction of what information to hold back. You may start your question text in the following manner: "What is the \ -relationship between...", "Is there a relationship...", "How does...", "What might...". Make sure that the question is \ -not leading (i.e. it does not indicate what the true answer is). 
- -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "hypothesis": { - "text": "%s", - "target_col": "%s", - "target_col_derivation": "%s" - }, - "questions": [ - { - "reveal_in_question": [], - "hide_in_question": ["any information about the relationship between the interacting concepts"], - "text": "..." - }, - { - "reveal_in_question": ["nature of the relationship (e.g., positive/negative, increase/decrease, etc.)", "numerics of the relationship (e.g. quadratic relationship, change by x amount, etc.)"], - "hide_in_question": [], - "text": "..." - } - ] -}``` - -Give your answer as a new JSON with the following format: -```json -{ - "questions": [ - {"text": "..."}, - {"text": "..."}, - ... - ] -} -```""" From 903a00d9b4ce54679255b0694adf93c75265ae1e Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Fri, 18 Oct 2024 15:06:11 +0530 Subject: [PATCH 02/15] fix: update discoverybench evaluation --- .../discoverybench/eval_utils/__init__.py | 0 .../eval_utils/eval_w_subhypo_gen.py | 36 ++++++++----------- 2 files changed, 15 insertions(+), 21 deletions(-) create mode 100644 evaluation/discoverybench/eval_utils/__init__.py diff --git a/evaluation/discoverybench/eval_utils/__init__.py b/evaluation/discoverybench/eval_utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py index bb8e2d08139a..a80df8279cfb 100644 --- a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py +++ b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py @@ -3,8 +3,8 @@ from openai import OpenAI -from evaluation.discoverybench.eval_utils.lm_utils import run_chatgpt_query_multi_turn -from evaluation.discoverybench.eval_utils.openai_helpers import get_response +from .lm_utils import run_chatgpt_query_multi_turn +from .openai_helpers import get_response logging.basicConfig( 
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', @@ -49,7 +49,7 @@ def get_score_from_answer(type, answer): } print(f'var_eval: {eval_rec}') return eval_rec - except Exception: # COMMENT: added "Exception" + except Exception: # COMMENT: added Exception return {'p': -1.0, 'r': -1.0, 'f1': -1.0} elif type == 'rel': print(answer) @@ -229,19 +229,17 @@ def get_sub_hypotheses( ): client = OpenAI() extraction_prompt = """\ - Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract the \ - set of sub-hypotheses that are present in the hypothesis such that each sub-hypothesis covers a separate context, is \ - self-sufficient, and operates on a coherent set of 3 dimensions: Context, Variables, and Relations. \ + Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract three dimensions that define the hypothesis: Context, Variables, and Relations. \ Here are the definitions for these dimensions: - - Contexts: Boundary conditions that limit the scope of a sub-hypothesis. E.g., “for men over \ + - Contexts: Boundary conditions that limit the scope of a hypothesis. E.g., “for men over \ the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then extract the context from the dataset_descrption. - Variables: Known concepts that interact in a meaningful way under a given context to \ - produce the sub-hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable. + produce the hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable. - Relations: Interactions between a given set of variables under a given context to produce \ - the sub-hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \ + the hypothesis. 
E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \ or "None" if there is no interacting relationship. Make sure to only use the information present in the hypothesis and the workflow. Do not add any new information. \ - If no sub-hypotheses can be extracted, return an empty list. + For each dimension, be specific, and do not omit any important details. Here is the metadata for the task: ```json @@ -257,11 +255,10 @@ def get_sub_hypotheses( { "sub_hypo": [ { - "text": the sub-hypothesis in natural language, - "context": a short text description of the context of the sub-hypothesis, - "variables": a list of columns involved in the sub-hypothesis, - "relations": a short text description of the relationship between the variables of the sub-hypothesis, - "explanation": a short text explanation for the breakdown of the sub-hypothesis + "text": the hypothesis in natural language, + "context": a short text description of the context of the hypothesis, + "variables": a list of columns involved in the hypothesis, + "relations": a short text description of the relationship between the variables of the hypothesis }, ... 
] @@ -391,11 +388,11 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( use_column_metadata=True, ): # Input: Dataset Metadata, Query, Gold {Hg, Wg}, Predicted {Hp, Wp} - # Output: score + # Output: eval_rec json includes final_score # Procedure: # Dataset Metadata, Query, Gold {Hg, Wg}, Pred {Hg, Wg} - # Gold: [Hg1, Hg2] (pre-store) Hg1 is a NL form of subhypothesis + # Gold: [Hg1, Hg2] (compute on the fly) Hg1 is a NL form of subhypothesis # Predicted: [Hp1, Hp2] (compute on the fly) # Compute Intersection: [(Hg_i, Hp_j), …] # tuples of (gold,pred) that matched with context (do this w/o explicit extraction) @@ -409,6 +406,7 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( # r_v_list ← f1_v * score_r # accuracy_score = mean(r_v_list) # score = [ recall_context * mean over predicted context(context_score * var_score *rel_score )] + # recall_context = 1.0 # COMMENT: never used eval_rec = { 'query': query, @@ -487,10 +485,6 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( else: context_score = 0.0 - # question, answer, context_score = ask_dimension_question(query, gold_subh, gold_workflow, - # gen_subh, gen_workflow, dataset_meta, llm_used, - # dimension="context") - if context_score == 1.0: # match only when context_score = 1.0 gen_subh_to_gold_subh[p_id] = g_id gold_subh_covered.append(g_id) From 985eedf6210af2d933ff8bb24e2edf91cb11b50a Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Fri, 18 Oct 2024 16:18:50 +0530 Subject: [PATCH 03/15] feat: initialize runtime with libraries --- evaluation/discoverybench/run_infer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/discoverybench/run_infer.py index 6685e7c56c26..71bb94b5de46 100644 --- a/evaluation/discoverybench/run_infer.py +++ b/evaluation/discoverybench/run_infer.py @@ -37,6 +37,16 @@ DATA_FILES = {} +LIBRARIES = [ + 'pandas', + 'numpy', + 'scipy', + 'matplotlib', + 'seaborn', + 'scikit-learn', + 'statsmodels', +] + 
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, } @@ -129,6 +139,12 @@ def initialize_runtime(runtime: Runtime, csv_file: list[str]): '/workspace', ) + for lib in LIBRARIES: + action = CmdRunAction(command=f'pip install {lib}') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") From a97319bfa992642d66fcb1b67034de0f9463b6a8 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Fri, 18 Oct 2024 16:19:38 +0530 Subject: [PATCH 04/15] init: add README --- evaluation/discoverybench/README.md | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index e69de29bb2d1..967b6cfca8ca 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -0,0 +1,33 @@ +# 🛰️ DiscoveryBench with 🙌 OpenHands + +DiscoveryBench is designed to systematically assess current model capabilities in data-driven discovery tasks and provide a useful resource for improving them. Each DiscoveryBench task consists of a goal and dataset(s). Solving the task requires both statistical analysis and semantic reasoning. A faceted evaluation allows open-ended final answers to be rigorously evaluated. + + +## ⚙️ Setup Environment and LLM Configuration + +1. Please follow instructions mentioned [here](https://github.com/openlocus/OpenHands/blob/discoverybench-openhands-integration/evaluation/README.md#setup) to setup OpenHands development environment and LLMs locally + +2. Execute the bash script to start DiscoveryBench Evaluation + +``` +./evaluation/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] +``` +Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in `config.toml` + +3. 
Other configurations +``` +./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] +``` + +- `MODEL_CONFIG`: Name of the model you want to evaluate with +- `GIT_COMMIT`: This should be the git commit hash or release tag for OpenHands, e.g., HEAD or a specific tag like 0.6.2. +- `AGENT`: For the agent, it appears you're using CodeActAgent. Replace [AGENT] with CodeActAgent. +- `EVAL_LIMIT`: This should be the number of samples to evaluate, e.g., num_samples_eval. +- `NUM_WORKERS`: This would be the number of workers to parallelize the evaluation process. + +## ✨ Overview + +- A DiscoveryBench instance is a scientific discovery task in natural language. +- In each iteration, OpenHands' agent try to solve the problem provided to it using python. +- After the iteration is complete, we evaluate the agent result based on our gold hypothesis. +- The evaluation result, along with the agent chat history is logged to `output.jsonl` under `evaluation_outputs` From ec2721bc53a4fb29759957d4762c69c9ef81fb90 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Fri, 18 Oct 2024 19:13:54 +0530 Subject: [PATCH 05/15] docs: update README to add todo --- evaluation/discoverybench/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index 967b6cfca8ca..76810a5d44b5 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -1,5 +1,6 @@ # 🛰️ DiscoveryBench with 🙌 OpenHands +TODO: This is sample description, need to update it before upstream PR. DiscoveryBench is designed to systematically assess current model capabilities in data-driven discovery tasks and provide a useful resource for improving them. Each DiscoveryBench task consists of a goal and dataset(s). Solving the task requires both statistical analysis and semantic reasoning. A faceted evaluation allows open-ended final answers to be rigorously evaluated. 
From 2f3689c656e25ae3ca2962783237b78769459185 Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Thu, 24 Oct 2024 10:30:31 +0530 Subject: [PATCH 06/15] Create README.md --- evaluation/discoverybench/eval_utils/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 evaluation/discoverybench/eval_utils/README.md diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/discoverybench/eval_utils/README.md new file mode 100644 index 000000000000..8e9a82b1d6cb --- /dev/null +++ b/evaluation/discoverybench/eval_utils/README.md @@ -0,0 +1 @@ +TODO: Add a single line description for each file From 622edf227fa3ab83556aaeb904e92718107d5d60 Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Thu, 24 Oct 2024 10:33:43 +0530 Subject: [PATCH 07/15] docs: Update run_infer.py to add TODO for docstrings --- evaluation/discoverybench/run_infer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/discoverybench/run_infer.py index 71bb94b5de46..fb3f40891234 100644 --- a/evaluation/discoverybench/run_infer.py +++ b/evaluation/discoverybench/run_infer.py @@ -77,6 +77,7 @@ def get_config( return config +# TODO: add docstring def get_dv_query_for_real( datasets, question, domain_knowledge=None, workflow_tags=None ): @@ -204,6 +205,7 @@ def complete_runtime( return test_result +# TODO: add docstring def process_instance( instance: pd.Series, metadata: EvalMetadata, @@ -295,7 +297,7 @@ def process_instance( ) return output - +# TODO: add docstring def create_dataset(repo_location: str, split: str = 'test'): # walk through the repository for test split # as soon as a metadata_{}.json file is found, load From e62082addae006c6a0ebd5456618cac7aec49518 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Thu, 24 Oct 2024 16:38:09 +0530 Subject: [PATCH 08/15] docs: add function doc strings --- evaluation/discoverybench/run_infer.py | 54 ++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 
8 deletions(-) diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/discoverybench/run_infer.py index fb3f40891234..c41a9c902664 100644 --- a/evaluation/discoverybench/run_infer.py +++ b/evaluation/discoverybench/run_infer.py @@ -77,10 +77,25 @@ def get_config( return config -# TODO: add docstring def get_dv_query_for_real( datasets, question, domain_knowledge=None, workflow_tags=None ): + """ + Prepare a structured query for the agent to execute on the specified datasets. + + This function constructs a query by compiling metadata from the provided datasets, along with any relevant domain knowledge and workflow tags. + + Args: + datasets: List of datasets + question: Query to be answered + domain_knowledge: Domain knowledge if any + workflow_tags: Workflow tags if any + + Returns: + query_to_dv: Query to be run on the dataset + dataset_meta: Metadata of the dataset + """ + dataset_meta = '' for dataset_metadata in datasets: dataset_meta += 'Dataset name: ' + dataset_metadata['name'] @@ -205,12 +220,28 @@ def complete_runtime( return test_result -# TODO: add docstring def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, ) -> EvalOutput: + """ + Process and evaluate a single instance of the dataset. + + This function executes the OpenHands agent + for a specific instance of the dataset. It retrieves + the agent's results and evaluates them against the gold + hypothesis. 
+ + Args: + instance: A single row of the dataset + metadata: Metadata for the evaluation + reset_logger: Whether to reset the logger + + Returns: + output: EvalOutput object + """ + config = get_config(metadata) # use a session id for concurrent evaluation @@ -297,13 +328,20 @@ def process_instance( ) return output -# TODO: add docstring + def create_dataset(repo_location: str, split: str = 'test'): - # walk through the repository for test split - # as soon as a metadata_{}.json file is found, load - # it and extract domain knowledge, workflow tags, queries, datasets, gold_hypothesis, - # and gold_workflow - # add all these to a pandas dataframe + """ + Create a dataset from the discoverybench repository + by walking through the repository and extracting metadata + from the metadata_{}.json files + + Args: + repo_location: Location of the repository + split: Split of the dataset to use + + Returns: + df: DataFrame containing the dataset instances + """ data_dict = {} From 26a831f8d65ccf64b68056ec87aa1c5df367f8da Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Thu, 24 Oct 2024 16:39:05 +0530 Subject: [PATCH 09/15] docs: add one line eval utils descriptions in README --- evaluation/discoverybench/eval_utils/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/discoverybench/eval_utils/README.md index 8e9a82b1d6cb..2a08998bb430 100644 --- a/evaluation/discoverybench/eval_utils/README.md +++ b/evaluation/discoverybench/eval_utils/README.md @@ -1 +1,7 @@ -TODO: Add a single line description for each file +## 📝 Evaluation File Descriptions + +- **`eval_w_subhypo_gen.py`**: Implements the DiscoveryBench logic for evaluating agent-generated hypotheses. +- **`lm_utils.py`**: Provides utility functions necessary for the evaluation process. +- **`openai_helpers.py`**: Includes helper functions for OpenAI-related tasks. 
+- **`openai_semantic_gen_prompts.py`**: Contains prompts used for semantic generation. +- **`response_parser.py`**: Handles the parsing of agent-generated hypotheses. From cf1f3c12e0ec7fe7ab720780d6d9f348a3bc717a Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Fri, 25 Oct 2024 19:18:48 +0530 Subject: [PATCH 10/15] docs: Update README.md for more clarity --- evaluation/discoverybench/README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index 76810a5d44b5..a15f202d2df0 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -1,10 +1,10 @@ -# 🛰️ DiscoveryBench with 🙌 OpenHands +# DiscoveryBench with OpenHands TODO: This is sample description, need to update it before upstream PR. DiscoveryBench is designed to systematically assess current model capabilities in data-driven discovery tasks and provide a useful resource for improving them. Each DiscoveryBench task consists of a goal and dataset(s). Solving the task requires both statistical analysis and semantic reasoning. A faceted evaluation allows open-ended final answers to be rigorously evaluated. -## ⚙️ Setup Environment and LLM Configuration +## Setup Environment and LLM Configuration 1. Please follow instructions mentioned [here](https://github.com/openlocus/OpenHands/blob/discoverybench-openhands-integration/evaluation/README.md#setup) to setup OpenHands development environment and LLMs locally @@ -15,7 +15,10 @@ DiscoveryBench is designed to systematically assess current model capabilities i ``` Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in `config.toml` -3. 
Other configurations + +## Run Inference on DiscoveryBench Instances + + ``` ./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] ``` @@ -26,7 +29,7 @@ Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in ` - `EVAL_LIMIT`: This should be the number of samples to evaluate, e.g., num_samples_eval. - `NUM_WORKERS`: This would be the number of workers to parallelize the evaluation process. -## ✨ Overview +## Overview - A DiscoveryBench instance is a scientific discovery task in natural language. - In each iteration, OpenHands' agent try to solve the problem provided to it using python. From a9673c547eda53dd52ea083c38ebb9310fa77ce0 Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Fri, 25 Oct 2024 19:31:03 +0530 Subject: [PATCH 11/15] docs: Update README.md for more clarity on DiscoveryBench process --- evaluation/discoverybench/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index a15f202d2df0..2b1cf9a87519 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -18,6 +18,8 @@ Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in ` ## Run Inference on DiscoveryBench Instances +When the run_infer.sh script is started, it will automatically pull the latest DiscoveryBench & set up the agent environment. The OpenHands agent is invoked to process the task within this environment, producing a hypothesis. We then evaluate it against the “gold” hypothesis provided by DiscoveryBench. 
+ ``` ./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] From 81c82717b058de71f9486d069e7e32978cf23671 Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Fri, 25 Oct 2024 19:33:23 +0530 Subject: [PATCH 12/15] docs: Update utils README.md --- evaluation/discoverybench/eval_utils/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/discoverybench/eval_utils/README.md index 2a08998bb430..0a349139907a 100644 --- a/evaluation/discoverybench/eval_utils/README.md +++ b/evaluation/discoverybench/eval_utils/README.md @@ -1,4 +1,4 @@ -## 📝 Evaluation File Descriptions +## Evaluation Utils - **`eval_w_subhypo_gen.py`**: Implements the DiscoveryBench logic for evaluating agent-generated hypotheses. - **`lm_utils.py`**: Provides utility functions necessary for the evaluation process. From edc134f794de34000f24eaf491764086076f505d Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Fri, 25 Oct 2024 19:44:26 +0530 Subject: [PATCH 13/15] docs: Update discoverybench README.md to eval context --- evaluation/discoverybench/README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index 2b1cf9a87519..db3b90c4479a 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -1,7 +1,12 @@ # DiscoveryBench with OpenHands -TODO: This is sample description, need to update it before upstream PR. -DiscoveryBench is designed to systematically assess current model capabilities in data-driven discovery tasks and provide a useful resource for improving them. Each DiscoveryBench task consists of a goal and dataset(s). Solving the task requires both statistical analysis and semantic reasoning. A faceted evaluation allows open-ended final answers to be rigorously evaluated. 
+[DiscoveryBench](https://github.com/allenai/discoverybench/) [(Paper)](https://arxiv.org/abs/2407.01725v1) contains 264 tasks collected across 6 diverse domains, such as biology, economics, and sociology. It incorporates discovery workflows from published papers to approximate the real-world challenges faced by researchers. + +

+ + DiscoveryBench Background + +

## Setup Environment and LLM Configuration From 6337c523246f55098bb8c46907f0e76636be6a84 Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Fri, 25 Oct 2024 19:46:33 +0530 Subject: [PATCH 14/15] docs: Update formatting for discoverybench README.md --- evaluation/discoverybench/README.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index db3b90c4479a..6d8ed8eff027 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -23,7 +23,7 @@ Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in ` ## Run Inference on DiscoveryBench Instances -When the run_infer.sh script is started, it will automatically pull the latest DiscoveryBench & set up the agent environment. The OpenHands agent is invoked to process the task within this environment, producing a hypothesis. We then evaluate it against the “gold” hypothesis provided by DiscoveryBench. +When the `run_infer.sh` script is started, it will automatically pull the latest DiscoveryBench instances & set up the agent environment. The OpenHands agent is invoked to process the task within this environment, producing a hypothesis. We then evaluate it against the “gold” hypothesis provided by DiscoveryBench. The evaluation result, along with the agent chat history is logged to `output.jsonl` under `evaluation_outputs`. ``` @@ -36,9 +36,3 @@ When the run_infer.sh script is started, it will automatically pull the latest D - `EVAL_LIMIT`: This should be the number of samples to evaluate, e.g., num_samples_eval. - `NUM_WORKERS`: This would be the number of workers to parallelize the evaluation process. -## Overview - -- A DiscoveryBench instance is a scientific discovery task in natural language. -- In each iteration, OpenHands' agent try to solve the problem provided to it using python. 
-- After the iteration is complete, we evaluate the agent result based on our gold hypothesis.
-- The evaluation result, along with the agent chat history is logged to `output.jsonl` under `evaluation_outputs`

From fee00c3f1463114001ba5df90f57f00b03cd258d Mon Sep 17 00:00:00 2001
From: Abhijeetsingh Meena
Date: Fri, 25 Oct 2024 20:02:31 +0530
Subject: [PATCH 15/15] docs: Update README.md for clarity

---
 evaluation/discoverybench/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md
index 6d8ed8eff027..9b5d5df495c7 100644
--- a/evaluation/discoverybench/README.md
+++ b/evaluation/discoverybench/README.md
@@ -32,7 +32,7 @@ When the `run_infer.sh` script is started, it will automatically pull the latest
 - `MODEL_CONFIG`: Name of the model you want to evaluate with
 - `GIT_COMMIT`: This should be the git commit hash or release tag for OpenHands, e.g., HEAD or a specific tag like 0.6.2.
-- `AGENT`: For the agent, it appears you're using CodeActAgent. Replace [AGENT] with CodeActAgent.
-- `EVAL_LIMIT`: This should be the number of samples to evaluate, e.g., num_samples_eval.
-- `NUM_WORKERS`: This would be the number of workers to parallelize the evaluation process.
+- `AGENT`: Use CodeActAgent; it is currently the only supported agent.
+- `EVAL_LIMIT`: Number of samples to evaluate.
+- `NUM_WORKERS`: Number of workers to parallelize the evaluation process.