From 54949ea5800f8d676a886ccc35e65bd241426474 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Fri, 18 Oct 2024 14:28:07 +0530 Subject: [PATCH 01/15] chore: remove useless modules --- .../discoverybench/eval_utils/arguments.py | 81 ------------ .../discoverybench/eval_utils/helpers.py | 93 -------------- .../eval_utils/openai_query_gen_prompts.py | 121 ------------------ 3 files changed, 295 deletions(-) delete mode 100644 evaluation/discoverybench/eval_utils/arguments.py delete mode 100644 evaluation/discoverybench/eval_utils/helpers.py delete mode 100644 evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py diff --git a/evaluation/discoverybench/eval_utils/arguments.py b/evaluation/discoverybench/eval_utils/arguments.py deleted file mode 100644 index abf45ff7f7a0..000000000000 --- a/evaluation/discoverybench/eval_utils/arguments.py +++ /dev/null @@ -1,81 +0,0 @@ -import argparse - - -class Arguments(argparse.ArgumentParser): - def __init__(self, groups=None): - super().__init__(conflict_handler='resolve') - # Common flags - self.add_argument('--out_dir', type=str, default='outputs') - self.add_argument( - '--debug', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument( - '--verbose', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument('--seed', type=int, default=17) - self.add_argument('--run_id', type=str) - - if not isinstance( - groups, list - ): # COMMENT: changed from type check to isinstance - groups = [groups] - - for group in groups: - if group == 'eval': - self.add_argument('--in_dir', type=str) - self.add_argument( - '--save', action=argparse.BooleanOptionalAction, default=True - ) - elif group == 'generate': - self.add_argument('--n_instances', type=int, default=None) - self.add_argument( - '--save', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument( - '--inject_semantics', - action=argparse.BooleanOptionalAction, - default=False, - ) - 
self.add_argument('--topics_fpath', type=str) - self.add_argument( - '--openai_topic_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--n_topics', type=int, default=50) - self.add_argument( - '--openai_interp_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--max_generation_retries', type=int, default=3) - self.add_argument( - '--sample_unique_topics', - action=argparse.BooleanOptionalAction, - default=True, - ) - self.add_argument('--test_set_prop', type=float, default=0.4) - self.add_argument( - '--eval_gold', action=argparse.BooleanOptionalAction, default=True - ) - self.add_argument( - '--skip_on_error', - action=argparse.BooleanOptionalAction, - default=False, - ) - self.add_argument( - '--datasets', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument('--semantics_fpath', type=str) - self.add_argument('--datasets_fpath', type=str) - self.add_argument( - '--openai_semantics_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument( - '--openai_datasets_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument( - '--openai_query_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--n_rows', type=int, default=500) - self.add_argument('--semantic_depth', type=int, default=3) - self.add_argument('--leaf_prob', type=float, default=0.4) - self.add_argument( - '--benchmark', action=argparse.BooleanOptionalAction, default=False - ) diff --git a/evaluation/discoverybench/eval_utils/helpers.py b/evaluation/discoverybench/eval_utils/helpers.py deleted file mode 100644 index 4c7afa42b9e7..000000000000 --- a/evaluation/discoverybench/eval_utils/helpers.py +++ /dev/null @@ -1,93 +0,0 @@ -import atexit -import json -import logging -import os -from itertools import chain, combinations - -import numpy as np -from sympy import preorder_traversal -from sympy.core.numbers import Float as SympyFloat - - -def setup_logger(run_id, log_dir='./logs'): - os.makedirs(log_dir, exist_ok=True) - 
log_fname = f'{log_dir}/{run_id}.log' - logger = logging.getLogger() # get root logger - file_handler = logging.FileHandler(log_fname, mode='a', delay=False) - file_handler.setFormatter( - logging.Formatter( - fmt='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - ) - ) - file_handler.setLevel(logging.INFO) - logger.addHandler( - file_handler - ) # all other loggers propagate to root; write to one log file from root - print(f'Log path: {log_fname}') - atexit.register(lambda: print(f'Log path: {log_fname}')) - - -def deep_get(obj, *keys, default): - default = default if default is not None else {} - rtn = obj - if not isinstance(rtn, dict): # COMMENT: changed from type(rtn) is not dict - return default - for k in keys: - rtn = rtn.get(k, default) - if not isinstance(rtn, dict): # COMMENT: changed from type(rtn) is not dict - return rtn - return rtn - - -def printj(obj, indent=2, logger=None): - fn = print if logger is None else logger - fn(json.dumps(obj, indent=indent)) - - -def extract_bracket_substrings(input_str): - substrings = [] - stack = [] - - for i, char in enumerate(input_str): - if char == '(': - stack.append(i) - elif char == ')': - if stack: - start_index = stack.pop() - substrings.append(input_str[start_index : i + 1]) - - return substrings - - -def extract_variable(input_str, var_prefix='x'): - split = input_str.split() - rtn = [] - for s in split: - _s = s.strip().strip('(').strip(')') - if _s.startswith(var_prefix): - rtn.append(_s) - return rtn - - -def round_sympy_expr(expr, precision=2): - new = expr - for a in preorder_traversal(expr): - if isinstance(a, SympyFloat): - new = new.subs(a, round(a, precision)) - return new - - -def powerset(iterable): - s = list(iterable) - return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1)) - - -def get_const_from_sympy(sym): - return [arg for arg in sym.args if arg not in sym.free_symbols][0] - - -def safe_exp(expr, exp, default=0.0): - if exp < 0: - 
return np.where(expr != 0, np.power(expr, exp), default) - return np.power(expr, exp) diff --git a/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py b/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py deleted file mode 100644 index 83cc44d7fbb7..000000000000 --- a/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py +++ /dev/null @@ -1,121 +0,0 @@ -PROMPT_QUERY = """\ -Given a dataset and a known true hypothesis that can be proven from it, construct a hard question \ -that tests someone's ability to find the true hypothesis using data analysis. \ -Make sure to not reveal the true hypothesis in the question. \ -Do not provide too many details. You may start your question in the following manner: \ -"What is the relationship between...", "Is there a relationship...", "How does...", "What might...". - -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "true_hypothesis": "%s" -} -``` - -Give your answer as a new JSON with the following format: -```json -{ - "question": "..." -} -```""" - -PROMPT_QUERY_VARIABLE = """\ -Given a dataset and a known true hypothesis that can be proven using that dataset, we want to construct questions to \ -test whether someone can find this true hypothesis given only the dataset. Generate a set of questions revealing \ -different amounts of information making sure to not reveal_in_question the true hypothesis. For each question, we will \ -provide an instruction of what information to hold back. You may start your question text in the following manner: \ -"What is the relationship between...", "Is there a relationship...", "How does...", "What might...". \ -Make sure that the question is not leading (i.e. it does not indicate what the true answer is). 
\ - -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "hypothesis": { - "text": "%s", - "target_col": "%s", - "target_col_derivation": "%s" - }, - "questions": [ - { - "reveal_in_question": [], - "hide_in_question": ["target concept", "concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept"], - "hide_in_question": ["concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept", "concepts that affect the target concept"], - "hide_in_question": ["specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept", "concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "hide_in_question": [], - "text": "..." - } - ] -}``` - -Give your answer as a new JSON with the following format: -```json -{ - "questions": [ - {"text": "..."}, - {"text": "..."}, - ... - ] -}```""" - - -PROMPT_QUERY_RELATIONSHIP = """\ -Given a dataset and a known true hypothesis that can be proven using that dataset, we want to construct questions to \ -test whether someone can find this true hypothesis given only the dataset. Generate a set of questions revealing \ -different amounts of information making sure to not reveal the true hypothesis. For each question, we will provide an \ -instruction of what information to hold back. You may start your question text in the following manner: "What is the \ -relationship between...", "Is there a relationship...", "How does...", "What might...". Make sure that the question is \ -not leading (i.e. it does not indicate what the true answer is). 
- -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "hypothesis": { - "text": "%s", - "target_col": "%s", - "target_col_derivation": "%s" - }, - "questions": [ - { - "reveal_in_question": [], - "hide_in_question": ["any information about the relationship between the interacting concepts"], - "text": "..." - }, - { - "reveal_in_question": ["nature of the relationship (e.g., positive/negative, increase/decrease, etc.)", "numerics of the relationship (e.g. quadratic relationship, change by x amount, etc.)"], - "hide_in_question": [], - "text": "..." - } - ] -}``` - -Give your answer as a new JSON with the following format: -```json -{ - "questions": [ - {"text": "..."}, - {"text": "..."}, - ... - ] -} -```""" From 903a00d9b4ce54679255b0694adf93c75265ae1e Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Fri, 18 Oct 2024 15:06:11 +0530 Subject: [PATCH 02/15] fix: update discoverybench evaluation --- .../discoverybench/eval_utils/__init__.py | 0 .../eval_utils/eval_w_subhypo_gen.py | 36 ++++++++----------- 2 files changed, 15 insertions(+), 21 deletions(-) create mode 100644 evaluation/discoverybench/eval_utils/__init__.py diff --git a/evaluation/discoverybench/eval_utils/__init__.py b/evaluation/discoverybench/eval_utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py index bb8e2d08139a..a80df8279cfb 100644 --- a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py +++ b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py @@ -3,8 +3,8 @@ from openai import OpenAI -from evaluation.discoverybench.eval_utils.lm_utils import run_chatgpt_query_multi_turn -from evaluation.discoverybench.eval_utils.openai_helpers import get_response +from .lm_utils import run_chatgpt_query_multi_turn +from .openai_helpers import get_response logging.basicConfig( 
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', @@ -49,7 +49,7 @@ def get_score_from_answer(type, answer): } print(f'var_eval: {eval_rec}') return eval_rec - except Exception: # COMMENT: added "Exception" + except Exception: # COMMENT: added Exception return {'p': -1.0, 'r': -1.0, 'f1': -1.0} elif type == 'rel': print(answer) @@ -229,19 +229,17 @@ def get_sub_hypotheses( ): client = OpenAI() extraction_prompt = """\ - Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract the \ - set of sub-hypotheses that are present in the hypothesis such that each sub-hypothesis covers a separate context, is \ - self-sufficient, and operates on a coherent set of 3 dimensions: Context, Variables, and Relations. \ + Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract three dimensions that define the hypothesis: Context, Variables, and Relations. \ Here are the definitions for these dimensions: - - Contexts: Boundary conditions that limit the scope of a sub-hypothesis. E.g., “for men over \ + - Contexts: Boundary conditions that limit the scope of a hypothesis. E.g., “for men over \ the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then extract the context from the dataset_descrption. - Variables: Known concepts that interact in a meaningful way under a given context to \ - produce the sub-hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable. + produce the hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable. - Relations: Interactions between a given set of variables under a given context to produce \ - the sub-hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \ + the hypothesis. 
E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \ or "None" if there is no interacting relationship. Make sure to only use the information present in the hypothesis and the workflow. Do not add any new information. \ - If no sub-hypotheses can be extracted, return an empty list. + For each dimension, be specific, and do not omit any important details. Here is the metadata for the task: ```json @@ -257,11 +255,10 @@ def get_sub_hypotheses( { "sub_hypo": [ { - "text": the sub-hypothesis in natural language, - "context": a short text description of the context of the sub-hypothesis, - "variables": a list of columns involved in the sub-hypothesis, - "relations": a short text description of the relationship between the variables of the sub-hypothesis, - "explanation": a short text explanation for the breakdown of the sub-hypothesis + "text": the hypothesis in natural language, + "context": a short text description of the context of the hypothesis, + "variables": a list of columns involved in the hypothesis, + "relations": a short text description of the relationship between the variables of the hypothesis }, ... 
] @@ -391,11 +388,11 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( use_column_metadata=True, ): # Input: Dataset Metadata, Query, Gold {Hg, Wg}, Predicted {Hp, Wp} - # Output: score + # Output: eval_rec json includes final_score # Procedure: # Dataset Metadata, Query, Gold {Hg, Wg}, Pred {Hg, Wg} - # Gold: [Hg1, Hg2] (pre-store) Hg1 is a NL form of subhypothesis + # Gold: [Hg1, Hg2] (compute on the fly) Hg1 is a NL form of subhypothesis # Predicted: [Hp1, Hp2] (compute on the fly) # Compute Intersection: [(Hg_i, Hp_j), …] # tuples of (gold,pred) that matched with context (do this w/o explicit extraction) @@ -409,6 +406,7 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( # r_v_list ← f1_v * score_r # accuracy_score = mean(r_v_list) # score = [ recall_context * mean over predicted context(context_score * var_score *rel_score )] + # recall_context = 1.0 # COMMENT: never used eval_rec = { 'query': query, @@ -487,10 +485,6 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( else: context_score = 0.0 - # question, answer, context_score = ask_dimension_question(query, gold_subh, gold_workflow, - # gen_subh, gen_workflow, dataset_meta, llm_used, - # dimension="context") - if context_score == 1.0: # match only when context_score = 1.0 gen_subh_to_gold_subh[p_id] = g_id gold_subh_covered.append(g_id) From 985eedf6210af2d933ff8bb24e2edf91cb11b50a Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Fri, 18 Oct 2024 16:18:50 +0530 Subject: [PATCH 03/15] feat: initialize runtime with libraries --- evaluation/discoverybench/run_infer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/discoverybench/run_infer.py index 6685e7c56c26..71bb94b5de46 100644 --- a/evaluation/discoverybench/run_infer.py +++ b/evaluation/discoverybench/run_infer.py @@ -37,6 +37,16 @@ DATA_FILES = {} +LIBRARIES = [ + 'pandas', + 'numpy', + 'scipy', + 'matplotlib', + 'seaborn', + 'scikit-learn', + 'statsmodels', +] + 
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, } @@ -129,6 +139,12 @@ def initialize_runtime(runtime: Runtime, csv_file: list[str]): '/workspace', ) + for lib in LIBRARIES: + action = CmdRunAction(command=f'pip install {lib}') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") From a97319bfa992642d66fcb1b67034de0f9463b6a8 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Fri, 18 Oct 2024 16:19:38 +0530 Subject: [PATCH 04/15] init: add README --- evaluation/discoverybench/README.md | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index e69de29bb2d1..967b6cfca8ca 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -0,0 +1,33 @@ +# 🛰️ DiscoveryBench with 🙌 OpenHands + +DiscoveryBench is designed to systematically assess current model capabilities in data-driven discovery tasks and provide a useful resource for improving them. Each DiscoveryBench task consists of a goal and dataset(s). Solving the task requires both statistical analysis and semantic reasoning. A faceted evaluation allows open-ended final answers to be rigorously evaluated. + + +## ⚙️ Setup Environment and LLM Configuration + +1. Please follow instructions mentioned [here](https://github.com/openlocus/OpenHands/blob/discoverybench-openhands-integration/evaluation/README.md#setup) to setup OpenHands development environment and LLMs locally + +2. Execute the bash script to start DiscoveryBench Evaluation + +``` +./evaluation/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] +``` +Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in `config.toml` + +3. 
Other configurations +``` +./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] +``` + +- `MODEL_CONFIG`: Name of the model you want to evaluate with +- `GIT_COMMIT`: This should be the git commit hash or release tag for OpenHands, e.g., HEAD or a specific tag like 0.6.2. +- `AGENT`: For the agent, it appears you're using CodeActAgent. Replace [AGENT] with CodeActAgent. +- `EVAL_LIMIT`: This should be the number of samples to evaluate, e.g., num_samples_eval. +- `NUM_WORKERS`: This would be the number of workers to parallelize the evaluation process. + +## ✨ Overview + +- A DiscoveryBench instance is a scientific discovery task in natural language. +- In each iteration, OpenHands' agent try to solve the problem provided to it using python. +- After the iteration is complete, we evaluate the agent result based on our gold hypothesis. +- The evaluation result, along with the agent chat history is logged to `output.jsonl` under `evaluation_outputs` From ec2721bc53a4fb29759957d4762c69c9ef81fb90 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Fri, 18 Oct 2024 19:13:54 +0530 Subject: [PATCH 05/15] docs: update README to add todo --- evaluation/discoverybench/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index 967b6cfca8ca..76810a5d44b5 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -1,5 +1,6 @@ # 🛰️ DiscoveryBench with 🙌 OpenHands +TODO: This is sample description, need to update it before upstream PR. DiscoveryBench is designed to systematically assess current model capabilities in data-driven discovery tasks and provide a useful resource for improving them. Each DiscoveryBench task consists of a goal and dataset(s). Solving the task requires both statistical analysis and semantic reasoning. A faceted evaluation allows open-ended final answers to be rigorously evaluated. 
From 2f3689c656e25ae3ca2962783237b78769459185 Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Thu, 24 Oct 2024 10:30:31 +0530 Subject: [PATCH 06/15] Create README.md --- evaluation/discoverybench/eval_utils/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 evaluation/discoverybench/eval_utils/README.md diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/discoverybench/eval_utils/README.md new file mode 100644 index 000000000000..8e9a82b1d6cb --- /dev/null +++ b/evaluation/discoverybench/eval_utils/README.md @@ -0,0 +1 @@ +TODO: Add a single line description for each file From 622edf227fa3ab83556aaeb904e92718107d5d60 Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Thu, 24 Oct 2024 10:33:43 +0530 Subject: [PATCH 07/15] docs: Update run_infer.py to add TODO for docstrings --- evaluation/discoverybench/run_infer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/discoverybench/run_infer.py index 71bb94b5de46..fb3f40891234 100644 --- a/evaluation/discoverybench/run_infer.py +++ b/evaluation/discoverybench/run_infer.py @@ -77,6 +77,7 @@ def get_config( return config +# TODO: add docstring def get_dv_query_for_real( datasets, question, domain_knowledge=None, workflow_tags=None ): @@ -204,6 +205,7 @@ def complete_runtime( return test_result +# TODO: add docstring def process_instance( instance: pd.Series, metadata: EvalMetadata, @@ -295,7 +297,7 @@ def process_instance( ) return output - +# TODO: add docstring def create_dataset(repo_location: str, split: str = 'test'): # walk through the repository for test split # as soon as a metadata_{}.json file is found, load From e62082addae006c6a0ebd5456618cac7aec49518 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Thu, 24 Oct 2024 16:38:09 +0530 Subject: [PATCH 08/15] docs: add function doc strings --- evaluation/discoverybench/run_infer.py | 54 ++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 
8 deletions(-) diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/discoverybench/run_infer.py index fb3f40891234..c41a9c902664 100644 --- a/evaluation/discoverybench/run_infer.py +++ b/evaluation/discoverybench/run_infer.py @@ -77,10 +77,25 @@ def get_config( return config -# TODO: add docstring def get_dv_query_for_real( datasets, question, domain_knowledge=None, workflow_tags=None ): + """ + Prepare a structured query for the agent to execute on the specified datasets. + + This function constructs a query by compiling metadata from the provided datasets, along with any relevant domain knowledge and workflow tags. + + Args: + datasets: List of datasets + question: Query to be answered + domain_knowledge: Domain knowledge if any + workflow_tags: Workflow tags if any + + Returns: + query_to_dv: Query to be run on the dataset + dataset_meta: Metadata of the dataset + """ + dataset_meta = '' for dataset_metadata in datasets: dataset_meta += 'Dataset name: ' + dataset_metadata['name'] @@ -205,12 +220,28 @@ def complete_runtime( return test_result -# TODO: add docstring def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, ) -> EvalOutput: + """ + Process and evaluate a single instance of the dataset. + + This function executes the OpenHands agent + for a specific instance of the dataset. It retrieves + the agent's results and evaluates them against the gold + hypothesis. 
+ + Args: + instance: A single row of the dataset + metadata: Metadata for the evaluation + reset_logger: Whether to reset the logger + + Returns: + output: EvalOutput object + """ + config = get_config(metadata) # use a session id for concurrent evaluation @@ -297,13 +328,20 @@ def process_instance( ) return output -# TODO: add docstring + def create_dataset(repo_location: str, split: str = 'test'): - # walk through the repository for test split - # as soon as a metadata_{}.json file is found, load - # it and extract domain knowledge, workflow tags, queries, datasets, gold_hypothesis, - # and gold_workflow - # add all these to a pandas dataframe + """ + Create a dataset from the discoverybench repository + by walking through the repository and extracting metadata + from the metadata_{}.json files + + Args: + repo_location: Location of the repository + split: Split of the dataset to use + + Returns: + df: DataFrame containing the dataset instances + """ data_dict = {} From 26a831f8d65ccf64b68056ec87aa1c5df367f8da Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Thu, 24 Oct 2024 16:39:05 +0530 Subject: [PATCH 09/15] docs: add one line eval utils descriptions in README --- evaluation/discoverybench/eval_utils/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/discoverybench/eval_utils/README.md index 8e9a82b1d6cb..2a08998bb430 100644 --- a/evaluation/discoverybench/eval_utils/README.md +++ b/evaluation/discoverybench/eval_utils/README.md @@ -1 +1,7 @@ -TODO: Add a single line description for each file +## 📝 Evaluation File Descriptions + +- **`eval_w_subhypo_gen.py`**: Implements the DiscoveryBench logic for evaluating agent-generated hypotheses. +- **`lm_utils.py`**: Provides utility functions necessary for the evaluation process. +- **`openai_helpers.py`**: Includes helper functions for OpenAI-related tasks. 
+- **`openai_semantic_gen_prompts.py`**: Contains prompts used for semantic generation. +- **`response_parser.py`**: Handles the parsing of agent-generated hypotheses. From cf1f3c12e0ec7fe7ab720780d6d9f348a3bc717a Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Fri, 25 Oct 2024 19:18:48 +0530 Subject: [PATCH 10/15] docs: Update README.md for more clarity --- evaluation/discoverybench/README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index 76810a5d44b5..a15f202d2df0 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -1,10 +1,10 @@ -# 🛰️ DiscoveryBench with 🙌 OpenHands +# DiscoveryBench with OpenHands TODO: This is sample description, need to update it before upstream PR. DiscoveryBench is designed to systematically assess current model capabilities in data-driven discovery tasks and provide a useful resource for improving them. Each DiscoveryBench task consists of a goal and dataset(s). Solving the task requires both statistical analysis and semantic reasoning. A faceted evaluation allows open-ended final answers to be rigorously evaluated. -## ⚙️ Setup Environment and LLM Configuration +## Setup Environment and LLM Configuration 1. Please follow instructions mentioned [here](https://github.com/openlocus/OpenHands/blob/discoverybench-openhands-integration/evaluation/README.md#setup) to setup OpenHands development environment and LLMs locally @@ -15,7 +15,10 @@ DiscoveryBench is designed to systematically assess current model capabilities i ``` Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in `config.toml` -3. 
Other configurations + +## Run Inference on DiscoveryBench Instances + + ``` ./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] ``` @@ -26,7 +29,7 @@ Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in ` - `EVAL_LIMIT`: This should be the number of samples to evaluate, e.g., num_samples_eval. - `NUM_WORKERS`: This would be the number of workers to parallelize the evaluation process. -## ✨ Overview +## Overview - A DiscoveryBench instance is a scientific discovery task in natural language. - In each iteration, OpenHands' agent try to solve the problem provided to it using python. From a9673c547eda53dd52ea083c38ebb9310fa77ce0 Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Fri, 25 Oct 2024 19:31:03 +0530 Subject: [PATCH 11/15] docs: Update README.md for more clarity on DiscoveryBench process --- evaluation/discoverybench/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index a15f202d2df0..2b1cf9a87519 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -18,6 +18,8 @@ Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in ` ## Run Inference on DiscoveryBench Instances +When the run_infer.sh script is started, it will automatically pull the latest DiscoveryBench & set up the agent environment. The OpenHands agent is invoked to process the task within this environment, producing a hypothesis. We then evaluate it against the “gold” hypothesis provided by DiscoveryBench. 
+ ``` ./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] From 81c82717b058de71f9486d069e7e32978cf23671 Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Fri, 25 Oct 2024 19:33:23 +0530 Subject: [PATCH 12/15] docs: Update utils README.md --- evaluation/discoverybench/eval_utils/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/discoverybench/eval_utils/README.md index 2a08998bb430..0a349139907a 100644 --- a/evaluation/discoverybench/eval_utils/README.md +++ b/evaluation/discoverybench/eval_utils/README.md @@ -1,4 +1,4 @@ -## 📝 Evaluation File Descriptions +## Evaluation Utils - **`eval_w_subhypo_gen.py`**: Implements the DiscoveryBench logic for evaluating agent-generated hypotheses. - **`lm_utils.py`**: Provides utility functions necessary for the evaluation process. From edc134f794de34000f24eaf491764086076f505d Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Fri, 25 Oct 2024 19:44:26 +0530 Subject: [PATCH 13/15] docs: Update discoverybench README.md to eval context --- evaluation/discoverybench/README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index 2b1cf9a87519..db3b90c4479a 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -1,7 +1,12 @@ # DiscoveryBench with OpenHands -TODO: This is sample description, need to update it before upstream PR. -DiscoveryBench is designed to systematically assess current model capabilities in data-driven discovery tasks and provide a useful resource for improving them. Each DiscoveryBench task consists of a goal and dataset(s). Solving the task requires both statistical analysis and semantic reasoning. A faceted evaluation allows open-ended final answers to be rigorously evaluated. 
+[DiscoveryBench](https://github.com/allenai/discoverybench/) [(Paper)](https://arxiv.org/abs/2407.01725v1) contains 264 tasks collected across 6 diverse domains, such as biology, economics, and sociology. It incorporates discovery workflows from published papers to approximate the real-world challenges faced by researchers. + +

+ + DiscoveryBench Background + +

## Setup Environment and LLM Configuration From 6337c523246f55098bb8c46907f0e76636be6a84 Mon Sep 17 00:00:00 2001 From: Harshit Surana Date: Fri, 25 Oct 2024 19:46:33 +0530 Subject: [PATCH 14/15] docs: Update formatting for discoverybench README.md --- evaluation/discoverybench/README.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index db3b90c4479a..6d8ed8eff027 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -23,7 +23,7 @@ Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in ` ## Run Inference on DiscoveryBench Instances -When the run_infer.sh script is started, it will automatically pull the latest DiscoveryBench & set up the agent environment. The OpenHands agent is invoked to process the task within this environment, producing a hypothesis. We then evaluate it against the “gold” hypothesis provided by DiscoveryBench. +When the `run_infer.sh` script is started, it will automatically pull the latest DiscoveryBench instances & set up the agent environment. The OpenHands agent is invoked to process the task within this environment, producing a hypothesis. We then evaluate it against the “gold” hypothesis provided by DiscoveryBench. The evaluation result, along with the agent chat history is logged to `output.jsonl` under `evaluation_outputs`. ``` @@ -36,9 +36,3 @@ When the run_infer.sh script is started, it will automatically pull the latest D - `EVAL_LIMIT`: This should be the number of samples to evaluate, e.g., num_samples_eval. - `NUM_WORKERS`: This would be the number of workers to parallelize the evaluation process. -## Overview - -- A DiscoveryBench instance is a scientific discovery task in natural language. -- In each iteration, OpenHands' agent try to solve the problem provided to it using python. 
-- After the iteration is complete, we evaluate the agent result based on our gold hypothesis.
-- The evaluation result, along with the agent chat history is logged to `output.jsonl` under `evaluation_outputs`

From fee00c3f1463114001ba5df90f57f00b03cd258d Mon Sep 17 00:00:00 2001
From: Abhijeetsingh Meena
Date: Fri, 25 Oct 2024 20:02:31 +0530
Subject: [PATCH 15/15] docs: Update README.md for clarity

---
 evaluation/discoverybench/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md
index 6d8ed8eff027..9b5d5df495c7 100644
--- a/evaluation/discoverybench/README.md
+++ b/evaluation/discoverybench/README.md
@@ -32,7 +32,7 @@ When the `run_infer.sh` script is started, it will automatically pull the latest
 - `MODEL_CONFIG`: Name of the model you want to evaluate with
 - `GIT_COMMIT`: This should be the git commit hash or release tag for OpenHands, e.g., HEAD or a specific tag like 0.6.2.
-- `AGENT`: For the agent, it appears you're using CodeActAgent. Replace [AGENT] with CodeActAgent.
-- `EVAL_LIMIT`: This should be the number of samples to evaluate, e.g., num_samples_eval.
-- `NUM_WORKERS`: This would be the number of workers to parallelize the evaluation process.
+- `AGENT`: Use CodeActAgent; it is currently the only supported agent.
+- `EVAL_LIMIT`: Number of samples to evaluate.
+- `NUM_WORKERS`: Number of workers to parallelize the evaluation process.