diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md index e69de29bb2d1..9b5d5df495c7 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/discoverybench/README.md @@ -0,0 +1,38 @@ +# DiscoveryBench with OpenHands + +[DiscoveryBench](https://github.com/allenai/discoverybench/) [(Paper)](https://arxiv.org/abs/2407.01725v1) contains 264 tasks collected across 6 diverse domains, such as biology, economics, and sociology. It incorporates discovery workflows from published papers to approximate the real-world challenges faced by researchers. + +

+ + DiscoveryBench Background + +

+ + +## Setup Environment and LLM Configuration + +1. Please follow instructions mentioned [here](https://github.com/openlocus/OpenHands/blob/discoverybench-openhands-integration/evaluation/README.md#setup) to set up the OpenHands development environment and LLMs locally + +2. Execute the bash script to start DiscoveryBench Evaluation + +``` +./evaluation/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] +``` +Replace `[YOUR MODEL CONFIG]` with any model that you have set up in `config.toml` + + +## Run Inference on DiscoveryBench Instances + +When the `run_infer.sh` script is started, it will automatically pull the latest DiscoveryBench instances & set up the agent environment. The OpenHands agent is invoked to process the task within this environment, producing a hypothesis. We then evaluate it against the “gold” hypothesis provided by DiscoveryBench. The evaluation result, along with the agent chat history, is logged to `output.jsonl` under `evaluation_outputs`. + + +``` +./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] +``` + +- `MODEL_CONFIG`: Name of the model you want to evaluate with +- `GIT_COMMIT`: This should be the git commit hash or release tag for OpenHands, e.g., HEAD or a specific tag like 0.6.2. +- `AGENT`: Use CodeActAgent; right now it is the only supported agent. +- `EVAL_LIMIT`: Number of samples to evaluate. +- `NUM_WORKERS`: Number of workers to parallelize the evaluation process. + diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/discoverybench/eval_utils/README.md new file mode 100644 index 000000000000..0a349139907a --- /dev/null +++ b/evaluation/discoverybench/eval_utils/README.md @@ -0,0 +1,7 @@ +## Evaluation Utils + +- **`eval_w_subhypo_gen.py`**: Implements the DiscoveryBench logic for evaluating agent-generated hypotheses. +- **`lm_utils.py`**: Provides utility functions necessary for the evaluation process. 
+- **`openai_helpers.py`**: Includes helper functions for OpenAI-related tasks. +- **`openai_semantic_gen_prompts.py`**: Contains prompts used for semantic generation. +- **`response_parser.py`**: Handles the parsing of agent-generated hypotheses. diff --git a/evaluation/discoverybench/eval_utils/__init__.py b/evaluation/discoverybench/eval_utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/discoverybench/eval_utils/arguments.py b/evaluation/discoverybench/eval_utils/arguments.py deleted file mode 100644 index abf45ff7f7a0..000000000000 --- a/evaluation/discoverybench/eval_utils/arguments.py +++ /dev/null @@ -1,81 +0,0 @@ -import argparse - - -class Arguments(argparse.ArgumentParser): - def __init__(self, groups=None): - super().__init__(conflict_handler='resolve') - # Common flags - self.add_argument('--out_dir', type=str, default='outputs') - self.add_argument( - '--debug', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument( - '--verbose', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument('--seed', type=int, default=17) - self.add_argument('--run_id', type=str) - - if not isinstance( - groups, list - ): # COMMENT: changed from type check to isinstance - groups = [groups] - - for group in groups: - if group == 'eval': - self.add_argument('--in_dir', type=str) - self.add_argument( - '--save', action=argparse.BooleanOptionalAction, default=True - ) - elif group == 'generate': - self.add_argument('--n_instances', type=int, default=None) - self.add_argument( - '--save', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument( - '--inject_semantics', - action=argparse.BooleanOptionalAction, - default=False, - ) - self.add_argument('--topics_fpath', type=str) - self.add_argument( - '--openai_topic_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--n_topics', type=int, default=50) - self.add_argument( - '--openai_interp_model', 
type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--max_generation_retries', type=int, default=3) - self.add_argument( - '--sample_unique_topics', - action=argparse.BooleanOptionalAction, - default=True, - ) - self.add_argument('--test_set_prop', type=float, default=0.4) - self.add_argument( - '--eval_gold', action=argparse.BooleanOptionalAction, default=True - ) - self.add_argument( - '--skip_on_error', - action=argparse.BooleanOptionalAction, - default=False, - ) - self.add_argument( - '--datasets', action=argparse.BooleanOptionalAction, default=False - ) - self.add_argument('--semantics_fpath', type=str) - self.add_argument('--datasets_fpath', type=str) - self.add_argument( - '--openai_semantics_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument( - '--openai_datasets_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument( - '--openai_query_model', type=str, default='gpt-3.5-turbo' - ) - self.add_argument('--n_rows', type=int, default=500) - self.add_argument('--semantic_depth', type=int, default=3) - self.add_argument('--leaf_prob', type=float, default=0.4) - self.add_argument( - '--benchmark', action=argparse.BooleanOptionalAction, default=False - ) diff --git a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py index bb8e2d08139a..a80df8279cfb 100644 --- a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py +++ b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py @@ -3,8 +3,8 @@ from openai import OpenAI -from evaluation.discoverybench.eval_utils.lm_utils import run_chatgpt_query_multi_turn -from evaluation.discoverybench.eval_utils.openai_helpers import get_response +from .lm_utils import run_chatgpt_query_multi_turn +from .openai_helpers import get_response logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', @@ -49,7 +49,7 @@ def get_score_from_answer(type, answer): } print(f'var_eval: {eval_rec}') return 
eval_rec - except Exception: # COMMENT: added "Exception" + except Exception: # COMMENT: added Exception return {'p': -1.0, 'r': -1.0, 'f1': -1.0} elif type == 'rel': print(answer) @@ -229,19 +229,17 @@ def get_sub_hypotheses( ): client = OpenAI() extraction_prompt = """\ - Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract the \ - set of sub-hypotheses that are present in the hypothesis such that each sub-hypothesis covers a separate context, is \ - self-sufficient, and operates on a coherent set of 3 dimensions: Context, Variables, and Relations. \ + Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract three dimensions that define the hypothesis: Context, Variables, and Relations. \ Here are the definitions for these dimensions: - - Contexts: Boundary conditions that limit the scope of a sub-hypothesis. E.g., “for men over \ + - Contexts: Boundary conditions that limit the scope of a hypothesis. E.g., “for men over \ the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then extract the context from the dataset_descrption. - Variables: Known concepts that interact in a meaningful way under a given context to \ - produce the sub-hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable. + produce the hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable. - Relations: Interactions between a given set of variables under a given context to produce \ - the sub-hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \ + the hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \ or "None" if there is no interacting relationship. Make sure to only use the information present in the hypothesis and the workflow. Do not add any new information. 
\ - If no sub-hypotheses can be extracted, return an empty list. + For each dimension, be specific, and do not omit any important details. Here is the metadata for the task: ```json @@ -257,11 +255,10 @@ def get_sub_hypotheses( { "sub_hypo": [ { - "text": the sub-hypothesis in natural language, - "context": a short text description of the context of the sub-hypothesis, - "variables": a list of columns involved in the sub-hypothesis, - "relations": a short text description of the relationship between the variables of the sub-hypothesis, - "explanation": a short text explanation for the breakdown of the sub-hypothesis + "text": the hypothesis in natural language, + "context": a short text description of the context of the hypothesis, + "variables": a list of columns involved in the hypothesis, + "relations": a short text description of the relationship between the variables of the hypothesis }, ... ] @@ -391,11 +388,11 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( use_column_metadata=True, ): # Input: Dataset Metadata, Query, Gold {Hg, Wg}, Predicted {Hp, Wp} - # Output: score + # Output: eval_rec json includes final_score # Procedure: # Dataset Metadata, Query, Gold {Hg, Wg}, Pred {Hg, Wg} - # Gold: [Hg1, Hg2] (pre-store) Hg1 is a NL form of subhypothesis + # Gold: [Hg1, Hg2] (compute on the fly) Hg1 is a NL form of subhypothesis # Predicted: [Hp1, Hp2] (compute on the fly) # Compute Intersection: [(Hg_i, Hp_j), …] # tuples of (gold,pred) that matched with context (do this w/o explicit extraction) @@ -409,6 +406,7 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( # r_v_list ← f1_v * score_r # accuracy_score = mean(r_v_list) # score = [ recall_context * mean over predicted context(context_score * var_score *rel_score )] + # recall_context = 1.0 # COMMENT: never used eval_rec = { 'query': query, @@ -487,10 +485,6 @@ def run_eval_gold_vs_gen_NL_hypo_workflow( else: context_score = 0.0 - # question, answer, context_score = ask_dimension_question(query, gold_subh, 
gold_workflow, - # gen_subh, gen_workflow, dataset_meta, llm_used, - # dimension="context") - if context_score == 1.0: # match only when context_score = 1.0 gen_subh_to_gold_subh[p_id] = g_id gold_subh_covered.append(g_id) diff --git a/evaluation/discoverybench/eval_utils/helpers.py b/evaluation/discoverybench/eval_utils/helpers.py deleted file mode 100644 index 4c7afa42b9e7..000000000000 --- a/evaluation/discoverybench/eval_utils/helpers.py +++ /dev/null @@ -1,93 +0,0 @@ -import atexit -import json -import logging -import os -from itertools import chain, combinations - -import numpy as np -from sympy import preorder_traversal -from sympy.core.numbers import Float as SympyFloat - - -def setup_logger(run_id, log_dir='./logs'): - os.makedirs(log_dir, exist_ok=True) - log_fname = f'{log_dir}/{run_id}.log' - logger = logging.getLogger() # get root logger - file_handler = logging.FileHandler(log_fname, mode='a', delay=False) - file_handler.setFormatter( - logging.Formatter( - fmt='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - ) - ) - file_handler.setLevel(logging.INFO) - logger.addHandler( - file_handler - ) # all other loggers propagate to root; write to one log file from root - print(f'Log path: {log_fname}') - atexit.register(lambda: print(f'Log path: {log_fname}')) - - -def deep_get(obj, *keys, default): - default = default if default is not None else {} - rtn = obj - if not isinstance(rtn, dict): # COMMENT: changed from type(rtn) is not dict - return default - for k in keys: - rtn = rtn.get(k, default) - if not isinstance(rtn, dict): # COMMENT: changed from type(rtn) is not dict - return rtn - return rtn - - -def printj(obj, indent=2, logger=None): - fn = print if logger is None else logger - fn(json.dumps(obj, indent=indent)) - - -def extract_bracket_substrings(input_str): - substrings = [] - stack = [] - - for i, char in enumerate(input_str): - if char == '(': - stack.append(i) - elif char == ')': - if stack: - 
start_index = stack.pop() - substrings.append(input_str[start_index : i + 1]) - - return substrings - - -def extract_variable(input_str, var_prefix='x'): - split = input_str.split() - rtn = [] - for s in split: - _s = s.strip().strip('(').strip(')') - if _s.startswith(var_prefix): - rtn.append(_s) - return rtn - - -def round_sympy_expr(expr, precision=2): - new = expr - for a in preorder_traversal(expr): - if isinstance(a, SympyFloat): - new = new.subs(a, round(a, precision)) - return new - - -def powerset(iterable): - s = list(iterable) - return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1)) - - -def get_const_from_sympy(sym): - return [arg for arg in sym.args if arg not in sym.free_symbols][0] - - -def safe_exp(expr, exp, default=0.0): - if exp < 0: - return np.where(expr != 0, np.power(expr, exp), default) - return np.power(expr, exp) diff --git a/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py b/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py deleted file mode 100644 index 83cc44d7fbb7..000000000000 --- a/evaluation/discoverybench/eval_utils/openai_query_gen_prompts.py +++ /dev/null @@ -1,121 +0,0 @@ -PROMPT_QUERY = """\ -Given a dataset and a known true hypothesis that can be proven from it, construct a hard question \ -that tests someone's ability to find the true hypothesis using data analysis. \ -Make sure to not reveal the true hypothesis in the question. \ -Do not provide too many details. You may start your question in the following manner: \ -"What is the relationship between...", "Is there a relationship...", "How does...", "What might...". - -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "true_hypothesis": "%s" -} -``` - -Give your answer as a new JSON with the following format: -```json -{ - "question": "..." 
-} -```""" - -PROMPT_QUERY_VARIABLE = """\ -Given a dataset and a known true hypothesis that can be proven using that dataset, we want to construct questions to \ -test whether someone can find this true hypothesis given only the dataset. Generate a set of questions revealing \ -different amounts of information making sure to not reveal_in_question the true hypothesis. For each question, we will \ -provide an instruction of what information to hold back. You may start your question text in the following manner: \ -"What is the relationship between...", "Is there a relationship...", "How does...", "What might...". \ -Make sure that the question is not leading (i.e. it does not indicate what the true answer is). \ - -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "hypothesis": { - "text": "%s", - "target_col": "%s", - "target_col_derivation": "%s" - }, - "questions": [ - { - "reveal_in_question": [], - "hide_in_question": ["target concept", "concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept"], - "hide_in_question": ["concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept", "concepts that affect the target concept"], - "hide_in_question": ["specific sub-group(s), if any, the relationship is applicable to"], - "text": "..." - }, - { - "reveal_in_question": ["target concept", "concepts that affect the target concept", "specific sub-group(s), if any, the relationship is applicable to"], - "hide_in_question": [], - "text": "..." - } - ] -}``` - -Give your answer as a new JSON with the following format: -```json -{ - "questions": [ - {"text": "..."}, - {"text": "..."}, - ... 
- ] -}```""" - - -PROMPT_QUERY_RELATIONSHIP = """\ -Given a dataset and a known true hypothesis that can be proven using that dataset, we want to construct questions to \ -test whether someone can find this true hypothesis given only the dataset. Generate a set of questions revealing \ -different amounts of information making sure to not reveal the true hypothesis. For each question, we will provide an \ -instruction of what information to hold back. You may start your question text in the following manner: "What is the \ -relationship between...", "Is there a relationship...", "How does...", "What might...". Make sure that the question is \ -not leading (i.e. it does not indicate what the true answer is). - -Dataset and hypothesis: -```json -{ - "domain": "%s", - "description": "%s", - "columns": %s, - "hypothesis": { - "text": "%s", - "target_col": "%s", - "target_col_derivation": "%s" - }, - "questions": [ - { - "reveal_in_question": [], - "hide_in_question": ["any information about the relationship between the interacting concepts"], - "text": "..." - }, - { - "reveal_in_question": ["nature of the relationship (e.g., positive/negative, increase/decrease, etc.)", "numerics of the relationship (e.g. quadratic relationship, change by x amount, etc.)"], - "hide_in_question": [], - "text": "..." - } - ] -}``` - -Give your answer as a new JSON with the following format: -```json -{ - "questions": [ - {"text": "..."}, - {"text": "..."}, - ... 
- ] -} -```""" diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/discoverybench/run_infer.py index 6685e7c56c26..c41a9c902664 100644 --- a/evaluation/discoverybench/run_infer.py +++ b/evaluation/discoverybench/run_infer.py @@ -37,6 +37,16 @@ DATA_FILES = {} +LIBRARIES = [ + 'pandas', + 'numpy', + 'scipy', + 'matplotlib', + 'seaborn', + 'scikit-learn', + 'statsmodels', +] + AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, } @@ -70,6 +80,22 @@ def get_config( def get_dv_query_for_real( datasets, question, domain_knowledge=None, workflow_tags=None ): + """ + Prepare a structured query for the agent to execute on the specified datasets. + + This function constructs a query by compiling metadata from the provided datasets, along with any relevant domain knowledge and workflow tags. + + Args: + datasets: List of datasets + question: Query to be answered + domain_knowledge: Domain knowledge if any + workflow_tags: Workflow tags if any + + Returns: + query_to_dv: Query to be run on the dataset + dataset_meta: Metadata of the dataset + """ + dataset_meta = '' for dataset_metadata in datasets: dataset_meta += 'Dataset name: ' + dataset_metadata['name'] @@ -129,6 +155,12 @@ def initialize_runtime(runtime: Runtime, csv_file: list[str]): '/workspace', ) + for lib in LIBRARIES: + action = CmdRunAction(command=f'pip install {lib}') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") @@ -193,6 +225,23 @@ def process_instance( metadata: EvalMetadata, reset_logger: bool = True, ) -> EvalOutput: + """ + Process and evaluate a single instance of the dataset. + + This function executes the OpenHands agent + for a specific instance of the dataset. It retrieves + the agent's results and evaluates them against the gold + hypothesis. 
+ + Args: + instance: A single row of the dataset + metadata: Metadata for the evaluation + reset_logger: Whether to reset the logger + + Returns: + output: EvalOutput object + """ + config = get_config(metadata) # use a session id for concurrent evaluation @@ -281,11 +330,18 @@ def process_instance( def create_dataset(repo_location: str, split: str = 'test'): - # walk through the repository for test split - # as soon as a metadata_{}.json file is found, load - # it and extract domain knowledge, workflow tags, queries, datasets, gold_hypothesis, - # and gold_workflow - # add all these to a pandas dataframe + """ + Create a dataset from the discoverybench repository + by walking through the repository and extracting metadata + from the metadata_{}.json files + + Args: + repo_location: Location of the repository + split: Split of the dataset to use + + Returns: + df: DataFrame containing the dataset instances + """ data_dict = {}