diff --git a/brainsmith/primitives/transforms/extract_shell_integration_metadata.py b/brainsmith/primitives/transforms/extract_shell_integration_metadata.py
index 12d1a611..b0c81538 100644
--- a/brainsmith/primitives/transforms/extract_shell_integration_metadata.py
+++ b/brainsmith/primitives/transforms/extract_shell_integration_metadata.py
@@ -4,9 +4,12 @@
 """Shell integration metadata extraction transform."""
 
 import json
+import os
+import shutil
+import numpy as np
 from qonnx.transformation.base import Transformation
 import qonnx.custom_op.registry as registry
-
+from finn.util.mlo_sim import dat_file_to_numpy_array
 
 class ExtractShellIntegrationMetadata(Transformation):
     """Walks the ONNX graph and extracts all relevant metadata for shell integration
@@ -19,26 +22,72 @@ def __init__(self, metadata_file: str):
 
     def apply(self, model):
         graph = model.graph
+        # Destination dir for the extracted artifacts
+        dirname = os.path.dirname(self.metadata_file)
+
+        # Search for FINNLoop ops (does not currently support nested FINNLoops)
+        finn_loops = {}
+        mlo = False
+        for node in model.graph.node:
+            if node.op_type == "FINNLoop":
+                finnloop_op = registry.getCustomOp(node)
+                finnloop_body = finnloop_op.get_nodeattr("body")
+
+                mvau_hbm_weights = {}
+                extern_idx = 0
+                for idx, lb_inp in enumerate(finnloop_body.graph.input):
+                    downstream = finnloop_body.find_consumer(lb_inp.name)
+                    if downstream.op_type.startswith("MVAU"):
+                        mlo = True
+                        mvau_hbm_weights[idx] = {}
+                        mvau_hbm_weights[idx]["name"] = lb_inp.name
+                        datfile = (
+                            f"{finnloop_op.get_nodeattr('code_gen_dir_ipgen')}/memblock_MVAU_rtl_id_{idx}.dat"
+                        )
+
+                        # Save the weights as a numpy file in the destination dir
+                        np_dat = dat_file_to_numpy_array(datfile)
+                        mvau_hbm_weights[idx]["weight_npy"] = f"memblock_MVAU_rtl_id_{idx}.npy"
+                        np.save(f"{dirname}/{mvau_hbm_weights[idx]['weight_npy']}", np_dat)
+
+                        # Record the external (HBM) weight stream interface metadata
+                        mvau_hbm_weights[idx]["extern_idx"] = extern_idx
+                        mvau_hbm_weights[idx]["extern_name"] = f"m_axi_MVAU_id_{idx}"
+                        mlo_mvau = registry.getCustomOp(downstream)
+                        mvau_hbm_weights[idx]["PE"] = mlo_mvau.get_nodeattr("PE")
+                        mvau_hbm_weights[idx]["SIMD"] = mlo_mvau.get_nodeattr("SIMD")
+                        mvau_hbm_weights[idx]["MH"] = mlo_mvau.get_nodeattr("MH")
+                        mvau_hbm_weights[idx]["MW"] = mlo_mvau.get_nodeattr("MW")
+                        mvau_hbm_weights[idx]["weightDataType"] = mlo_mvau.get_nodeattr("weightDataType")
+                        extern_idx += 1
+                finn_loops[node.name] = mvau_hbm_weights
+        self.md["mlo"] = mlo
+        self.md["finn_loops"] = finn_loops
+
+
+        # Extract instream widths
         instreams = {}
         for input_tensor in graph.input:
             consumer = model.find_consumer(input_tensor.name)
             inst = registry.getCustomOp(consumer)
-            instreams[input_tensor.name] = {
-                'width': inst.get_instream_width(),
-                'shape': inst.get_normal_input_shape()
-            }
-        self.md['instreams'] = instreams
+            instream = {}
+            instream['width'] = inst.get_instream_width()
+            instream['shape'] = inst.get_normal_input_shape()
+            instream['datatype'] = inst.get_input_datatype().name
+            instreams[input_tensor.name] = instream
+        self.md['instreams'] = instreams
 
         outstreams = {}
         for output_tensor in graph.output:
             producer = model.find_producer(output_tensor.name)
             inst = registry.getCustomOp(producer)
-            outstreams[output_tensor.name] = {
-                'width': inst.get_outstream_width(),
-                'shape': inst.get_normal_output_shape()
-            }
-        self.md['outstreams'] = outstreams
-
+            outstream = {}
+            outstream['width'] = inst.get_outstream_width()
+            outstream['shape'] = inst.get_normal_output_shape()
+            outstream['datatype'] = inst.get_output_datatype().name
+            outstreams[output_tensor.name] = outstream
+        self.md['outstreams'] = outstreams
+
         static_matmuls = {}
         for node in graph.node:
             if (node.op_type == "MVAU_rtl"):
@@ -54,4 +103,4 @@ def apply(self, model):
 
         with open(self.metadata_file, "w") as fp:
             json.dump(self.md, fp, indent=4)
-        return (model, False)
+        return(model, False)
diff --git a/brainsmith/steps/bert_custom_steps.py b/brainsmith/steps/bert_custom_steps.py
index 436ee055..c06f0b27 100644
--- a/brainsmith/steps/bert_custom_steps.py
+++ b/brainsmith/steps/bert_custom_steps.py
@@ -89,7 +89,8 @@ def bert_streamlining_step(model: Any, cfg: Any) -> Any:
         MoveScalarMulPastMatMul(),
         MoveScalarLinearPastInvariants(),
         AbsorbMulIntoMultiThreshold(),
-        AbsorbAddIntoMultiThreshold()
+        AbsorbAddIntoMultiThreshold(),
+        RoundAndClipThresholds()
     ]:
         model = model.transform(transform)
 
diff --git a/docker/fetch-repos.sh b/docker/fetch-repos.sh
index 1069ffcb..734be698 100755
--- a/docker/fetch-repos.sh
+++ b/docker/fetch-repos.sh
@@ -77,9 +77,21 @@ fi
 
 # Define our Git dependencies - URLs and revisions
 declare -A GIT_DEPS=(
+<<<<<<< HEAD
+    ["brevitas"]="https://github.com/Xilinx/brevitas.git@95edaa0bdc8e639e39b1164466278c59df4877be"
+    ["qonnx"]="https://github.com/fastmachinelearning/qonnx.git@custom/brainsmith"
+    ["finn"]="https://github.com/tafk7/finn.git@custom/transformer"
+    ["onnxscript"]="https://github.com/jsmonson/onnxscript.git@62c7110aba46554432ce8e82ba2d8a086bd6227c"
+||||||| c35477d
+    ["brevitas"]="https://github.com/Xilinx/brevitas.git@95edaa0bdc8e639e39b1164466278c59df4877be"
+    ["qonnx"]="https://github.com/fastmachinelearning/qonnx.git@f2c4ccd3e71795c9f116ee5a0c87a7dfd590c6d0"
+    ["finn"]="https://github.com/tafk7/finn.git@custom/transformer"
+    ["onnxscript"]="https://github.com/jsmonson/onnxscript.git@62c7110aba46554432ce8e82ba2d8a086bd6227c"
+=======
     ["brevitas"]="https://github.com/Xilinx/brevitas.git@c10ef8764967e9cacc60347ce185be14e4ad97c4"
     ["qonnx"]="https://github.com/fastmachinelearning/qonnx.git@f2c4ccd3e71795c9f116ee5a0c87a7dfd590c6d0"
     ["finn"]="https://github.com/tafk7/finn.git@feature/logging-integration-transformer"
+>>>>>>> develop
     ["finn-experimental"]="https://github.com/Xilinx/finn-experimental.git@0724be21111a21f0d81a072fccc1c446e053f851"
     ["dataset-loading"]="https://github.com/fbcotter/dataset_loading.git@0.0.4"
 )
diff --git a/docs/multilayer_offload.md b/docs/multilayer_offload.md
new file mode 100644
index 00000000..969c153c
--- /dev/null
+++ b/docs/multilayer_offload.md
@@ -0,0 +1,605 @@
+# Multilayer Offload (MLO)
+
+Multilayer Offload (MLO) is a feature recently added to FINN that enables much larger neural networks to be accelerated by implementing a single repeating slice of the model (such as one transformer encoder layer) in hardware and cycling the per-layer weights through external memory (DRAM/HBM). This makes it possible to build accelerators for models that would otherwise be too large to map onto the FPGA.
+
+## Overview
+
+Large deep learning models such as transformers and SLMs (and LLMs, for that matter) have millions or billions of parameters processed by a stack of identical repeating layers. One option would be to map these layers to multiple FPGAs, but the sheer number of layers (e.g. 32 in Phi-4-mini) makes it impractical to spread the design across that many devices. MLO overcomes this limitation by:
+
+1. **Implementing a single repeating layer** (e.g., one transformer encoder) in hardware
+2. **Storing weights off-chip** in high-bandwidth memory (HBM/DRAM)
+3. 
**Streaming weights** into the accelerator as needed for each layer +4. **Reusing the same hardware** to process multiple layers sequentially + +This approach trades some throughput for the ability to handle much larger models, making it ideal for larger transformer models such as SLMs, vision transformers, and other deep architectures. + +## How It Works + +### Loop Body Hierarchy + +MLO works by identifying a repeating structure in the neural network and implementing only that structure in hardware. **Currently, loop body discovery is not automated** - users must manually identify one iteration of the repeating pattern and specify it using the `loop_body_hierarchy` parameter: + +```yaml +finn_config: + loop_body_hierarchy: [['encoder', 'encoder.layer.0']] +``` + +**Manual Loop Body Identification:** +The `loop_body_hierarchy` configuration must match the hierarchical naming structure in your ONNX model, which corresponds to the `pkg.torch.onnx.name_scopes` field used during model export. The loop rolling transformation uses these name scopes to determine which levels of hierarchy to include in the loop body. + +> **⚠️ Important:** You must use `dynamo=True` when exporting your PyTorch model to ONNX. Exporting with `dynamo=True` generates the metadata (name scopes) that MLO requires to identify repeating structures. Without this flag, the ONNX model will lack the hierarchical metadata needed for loop body discovery, and the MLO transformation will fail to locate the repeating patterns. + +**Technical Implementation:** +The node extraction mechanism is implemented in FINN's loop rolling transformations: + +- **Step Location**: `deps/finn/src/finn/builder/build_dataflow_steps.py` +- **Extraction Process**: `deps/finn/src/finn/transformation/fpgadataflow/loop_rolling.py` (LoopExtraction class) +- **Hierarchy Matching**: `deps/finn/src/finn/util/onnxscript_helpers.py` (PytorchHierarchyNode class) + +The extraction works by: +1. Creating a hierarchy parser from PyTorch metadata (`pkg.torch.onnx.name_scopes`) +2. Adding each ONNX node to the parser based on its hierarchy path +3. Using prefix matching to find all nodes under the specified hierarchy paths +4. Extracting matching nodes to create loop templates and removing originals from the main graph + +This process requires the PyTorch exporter metadata generated by `dynamo=True`, which contains the module instance hierarchies that map ONNX nodes back to their originating PyTorch modules. + +This configuration tells Brainsmith: +- Look for a repeating pattern called 'encoder' (top-level hierarchy) +- The repeating unit is 'encoder.layer.0' (one complete encoder layer) +- All encoder layers (layer.0, layer.1, layer.2, etc.) 
will be processed using the same hardware
+- The name scopes must exactly match the ONNX node names for proper identification
+
+#### Multiple Hierarchy Groups
+
+For models with multiple independent repeating structures, you can specify multiple hierarchy groups in the `loop_body_hierarchy` configuration:
+
+```yaml
+finn_config:
+  loop_body_hierarchy: [
+    ['encoder', 'encoder.layer.0'],
+    ['encoder', 'encoder.layer.1']
+  ]
+```
+
+This advanced configuration enables the following:
+- **Multiple Loop Iterations in a Single Body** - Include nodes from consecutive layers (e.g., layer.0 and layer.1) to unroll multiple iterations into the hardware implementation
+- **Fine-tuning Node Selection** - Adjust which nodes are included in the loop body when metadata is lost or inexact during ONNX export
+
+**Multiple Group Behavior:**
+- The loop body will include **all** of the nodes belonging to each listed hierarchy region.
+
+#### Hierarchy Level Specification
+
+The `loop_body_hierarchy` can specify multiple levels of hierarchy to precisely control what gets included in the loop body:
+
+**Two-level hierarchy (simple case):**
+```yaml
+loop_body_hierarchy: [['encoder', 'encoder.layer.0']]
+```
+- Includes all nodes under `encoder.layer.0.*`
+- Good for simple transformer architectures
+
+**Three-level hierarchy (precise control):**
+```yaml
+loop_body_hierarchy: [
+  ['bert', 'bert.encoder', 'bert.encoder.layer.0']
+]
+```
+- Specifies the full path: model → encoder stack → specific layer
+- Provides more precise control over node selection
+- Useful for complex models with nested structures
+
+The FINN loop rolling step will find all ONNX nodes whose names start with the final hierarchy level (e.g., `bert.encoder.layer.0`) and extract them as the loop body.
+
+### Loop Rolling Process
+
+The loop rolling transformation (`step_loop_rolling` in FINN) performs these key operations:
+
+1. **Parses the `loop_body_hierarchy`** to identify which nodes belong to the repeating structure
+2. **Extracts nodes by name scope matching** - finds all ONNX nodes whose names match the specified hierarchy pattern (e.g., nodes starting with 'bert.encoder.layer.0')
+3. **Generates loop iteration logic** - creates control structures to iterate through all layers using the same hardware
+4. **Sets up weight streaming infrastructure** - configures memory interfaces to stream different weights for each iteration
+5. **Updates folding configuration** - modifies parallelization parameters to account for the loop structure
+
+#### Loop Body Extraction Details
+
+The specific extraction logic is implemented in the FINN library (`finn.builder.build_dataflow_steps.step_loop_rolling`). 
While the exact source code lines are not visible in this repository, the process performs these operations based on observable behavior: + +**Node Selection Process:** +```python +# Conceptual extraction logic (actual implementation in FINN) +def extract_loop_body_nodes(model, loop_body_hierarchy): + """Extract nodes matching the loop body hierarchy pattern.""" + extracted_nodes = [] + + # Get the target pattern from hierarchy (e.g., 'bert.encoder.layer.0') + target_pattern = loop_body_hierarchy[0][-1] # Final level + + # Find all nodes whose names start with the target pattern + for node in model.graph.node: + if node.name.startswith(target_pattern): + extracted_nodes.append(node) + + return extracted_nodes + +``` + +The metadata fields exported by PyTorch Dynamo are not always reliable and in some cases can be removed by optimization passes. When encountered, these issues are reported to the onnxscript team and are often resolved. However, we have tried to make the Loop Body Extraction process as robust as possible in the presence of missing metadata. + +In some cases, the Loop Body Extraction process can identify nodes with missing metadata fields. For example, if a node is missing its metadata field, Loop Extract attempts to infer the missing information for that node by checking the metadata of its input and output nodes. + + +## Configuration + +### Basic MLO Setup + +To enable MLO in your blueprint, add the `loop_body_hierarchy` configuration: + +```yaml +name: "BERT with MLO" +description: "BERT model with Multilayer Offload" + +finn_config: + loop_body_hierarchy: [['encoder', 'encoder.layer.0']] + split_large_fifos: true + fifosim_n_inferences: 2 # Speed up FIFO simulation + +design_space: + steps: + - "qonnx_to_finn" + - "bert_streamlining" + - "infer_kernels" + - "create_dataflow_partition" + - "specialize_layers" + - "loop_rolling" # This step implements MLO + - "target_fps_parallelization" + - "apply_folding_config" + # ... rest of pipeline +``` + +The easiest way to identify the proper loop body hierarchy is to open the model in Netron and check the values of the node metadata that you'd like to include in the loop body. + + +### BERT MLO Example + +For BERT models, a typical MLO configuration looks like: + +```yaml +# bert_mlo_demo.yaml +name: "BERT Demo" +description: "Hugging face BERT model with MLO" + +extends: "../../brainsmith/blueprints/bert.yaml" + +finn_config: + loop_body_hierarchy: [['encoder', 'encoder.layer.0']] + split_large_fifos: true + fifosim_n_inferences: 2 + verify_steps: ['folded_hls_cppsim', 'stitched_ip_rtlsim'] + +design_space: + steps: + - at_start: + insert: + - "bert_cleanup" + - "remove_head" + - "remove_tail" + - "generate_reference_io" + - at_end: + insert: "shell_metadata_handover" +``` + + +## Example: BERT MLO Demo + +The `examples/bert/bert_mlo_demo.sh` demonstrates a complete MLO workflow: + +```bash +#!/bin/bash +# BERT MLO Demo + +# Generate folding configuration +python gen_folding_config.py \ + --simd 4 \ + --pe 4 \ + --num_layers 2 \ + -t 1 \ + -o ./configs/bert_mlo_demo.json + +# Run BERT demo with MLO +python bert_demo.py \ + -o bert_mlo_demo \ + -n 4 \ # 4 attention heads + -l 2 \ # 2 layers total + -z 64 \ # Hidden size 64 + -i 256 \ # Intermediate size 256 + -b 8 \ # 8-bit quantization + -q 32 \ # Sequence length 32 + --blueprint ./bert_mlo_demo.yaml +``` + +This creates a BERT model with 2 encoder layers where only the first layer is implemented in hardware, and the second layer reuses the same hardware with different weights. 
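+
+Before launching a build like this, it can be worth confirming that the exported model actually carries the name-scope metadata that loop rolling relies on, rather than checking only in Netron. The following sketch is an assumption about where the exporter stores the scopes (per node, under the key `pkg.torch.onnx.name_scopes`, either as a node attribute or in `metadata_props`), and the model path is a placeholder:
+
+```python
+import onnx
+
+def dump_name_scopes(model_path):
+    """Print any pkg.torch.onnx.name_scopes metadata attached to graph nodes."""
+    model = onnx.load(model_path)
+    for node in model.graph.node:
+        scopes = None
+        # Some flows store the scopes as a node attribute (see the manual
+        # metadata example later in this document) ...
+        for attr in node.attribute:
+            if attr.name == "pkg.torch.onnx.name_scopes":
+                scopes = attr.s.decode("utf-8")
+        # ... while other exporter versions may use per-node metadata_props.
+        for prop in getattr(node, "metadata_props", []):
+            if prop.key == "pkg.torch.onnx.name_scopes":
+                scopes = prop.value
+        if scopes is not None:
+            print(f"{node.name}: {scopes}")
+
+dump_name_scopes("model_with_metadata.onnx")  # placeholder path
+```
+
+If nothing is printed, the export did not preserve the hierarchy information and the manual-metadata or custom loop rolling routes described below are needed.
+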
+ +**CRITICAL: ONNX Export Requirements** +```python +# When exporting your model to ONNX, you MUST use dynamo=True +# This generates the metadata (name scopes) that MLO requires for loop body discovery +import brevitas.onnx as bo + +bo.export_qonnx( + model, + inputs, + output_path, + dynamo=True, # Generates name scope metadata for MLO + input_names=['input_ids'], + opset_version=18, + do_constant_folding=True +) +``` + +**Alternative: Custom Loop Rolling for Non-Dynamo Export** + +If you cannot use `dynamo=True` (due to compatibility issues, model complexity, or other constraints), you can either add the metadata manually or you can implement a custom loop rolling step. + +**Adding Metadata Manually** + +If your ONNX model was exported without `dynamo=True` or the metadata was lost during optimization, you can manually add the required `pkg.torch.onnx.name_scopes` metadata to enable MLO. This approach requires modifying the ONNX model's metadata properties directly. + +**Step 1: Understanding the Metadata Structure** + +The `pkg.torch.onnx.name_scopes` metadata field contains hierarchical naming information that maps each ONNX node back to its originating PyTorch module. The metadata is stored as a list of strings representing the hierarchy path from the root module to the specific operation. + +For example, in a BERT model: +```python +# Layer 0 attention query node +['bert', 'bert.encoder', 'bert.encoder.layer.0', 'bert.encoder.layer.0.attention.self.query'] + +# Layer 0 attention key node +['bert', 'bert.encoder', 'bert.encoder.layer.0', 'bert.encoder.layer.0.attention.self.key'] + +# Layer 1 attention query node +['bert', 'bert.encoder', 'bert.encoder.layer.1', 'bert.encoder.layer.1.attention.self.query'] +``` + +**Step 2: Identify Your Model's Hierarchy** + +First, determine the hierarchical structure of your model: + +```python +import torch + +# Example: Print your PyTorch model structure +model = YourModel() +for name, module in model.named_modules(): + print(name) + +# Output might look like: +# encoder +# encoder.layer.0 +# encoder.layer.0.attention +# encoder.layer.0.attention.self +# encoder.layer.1.attention +# encoder.layer.1.attention.self +``` + +**Step 3: Add Metadata to ONNX Nodes** + +Use the following script to add metadata to your ONNX model: + +```python +import onnx +from onnx import helper + +def add_name_scope_metadata(model_path, output_path, node_hierarchy_map): + """ + Add pkg.torch.onnx.name_scopes metadata to ONNX nodes. 
+ + Args: + model_path: Path to input ONNX model + output_path: Path to save modified ONNX model + node_hierarchy_map: Dict mapping node names to hierarchy paths (as list of strings) + e.g., {'MatMul_0': ['encoder', 'encoder.layer.0', 'encoder.layer.0.attention']} + """ + model = onnx.load(model_path) + + for node in model.graph.node: + if node.name in node_hierarchy_map: + hierarchy_list = node_hierarchy_map[node.name] + # Convert list to the string format expected by ONNX metadata + # Format: serialized list of strings + hierarchy_str = str(hierarchy_list) + + # Add or update the metadata attribute + metadata_found = False + for attr in node.attribute: + if attr.name == "pkg.torch.onnx.name_scopes": + attr.s = hierarchy_str.encode('utf-8') + metadata_found = True + break + + if not metadata_found: + # Create new metadata attribute + metadata_attr = helper.make_attribute( + "pkg.torch.onnx.name_scopes", + hierarchy_str + ) + node.attribute.append(metadata_attr) + + onnx.save(model, output_path) + print(f"Model with metadata saved to {output_path}") + +# Example usage for a BERT model +node_hierarchy_map = { + # Attention layer nodes + 'MatMul_0': ['bert', 'bert.encoder', 'bert.encoder.layer.0', 'bert.encoder.layer.0.attention.self.query'], + 'MatMul_1': ['bert', 'bert.encoder', 'bert.encoder.layer.0', 'bert.encoder.layer.0.attention.self.key'], + 'MatMul_2': ['bert', 'bert.encoder', 'bert.encoder.layer.0', 'bert.encoder.layer.0.attention.self.value'], + 'MatMul_3': ['bert', 'bert.encoder', 'bert.encoder.layer.0', 'bert.encoder.layer.0.attention.output.dense'], + + # Intermediate layer nodes + 'MatMul_4': ['bert', 'bert.encoder', 'bert.encoder.layer.0', 'bert.encoder.layer.0.intermediate.dense'], + 'MatMul_5': ['bert', 'bert.encoder', 'bert.encoder.layer.0', 'bert.encoder.layer.0.output.dense'], + + # LayerNorm nodes + 'LayerNormalization_0': ['bert', 'bert.encoder', 'bert.encoder.layer.0', 'bert.encoder.layer.0.attention.output.LayerNorm'], + 'LayerNormalization_1': ['bert', 'bert.encoder', 'bert.encoder.layer.0', 'bert.encoder.layer.0.output.LayerNorm'], + + # You only need to add metadata for the nodes used in the loop body template +} + +add_name_scope_metadata( + 'model_without_metadata.onnx', + 'model_with_metadata.onnx', + node_hierarchy_map +) +``` + +**Step 4: Verify Metadata with Netron** + +After adding metadata, open the modified model in Netron and inspect node properties to verify the `pkg.torch.onnx.name_scopes` field appears correctly. + +**Step 5: Use in MLO Configuration** + +Once metadata is added, configure your blueprint with the appropriate `loop_body_hierarchy`: + +```yaml +finn_config: + loop_body_hierarchy: [['encoder', 'encoder.layer.0']] # Must match your hierarchy paths +``` + +**Important Notes:** +- Metadata must accurately reflect the repeating structure of your model +- All nodes within a layer should have consistent hierarchy prefixes +- Test with a small model (2-3 layers) before applying to larger models +- Incorrect metadata will cause loop body extraction to fail or extract wrong nodes + + +**Custom Loop Rolling Step** + +If you cannot export via PyTorch Dynamo, you can write your own *Loop Extraction* transform and then leverage the existing *Loop Rolling* transform to create the FINNLoop ONNX node. At present, you'll need to copy the *Loop Rolling* step in FINN and replace the *Loop Extraction* functionality. In the future, we plan to update the Loop Rolling step to accept a custom *Loop Extraction* function. 
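+
+Before diving into the details, note that the custom route ultimately needs an ONNX file containing one copy of the repeating layer (the loop body template used below). If your PyTorch model is still at hand, one lightweight way to obtain that file is to export a single repeating layer on its own. The following is a minimal sketch, not Brainsmith API: the `nn.Sequential` stand-in, tensor names, and shapes are placeholders to adapt to your own layer (e.g. `model.encoder.layer[0]`), and a quantized Brevitas model would go through the same exporter you used for the full model.
+
+```python
+import torch
+import torch.nn as nn
+
+# Stand-in for one repeating layer of your network; in practice you would export
+# e.g. model.encoder.layer[0] (hypothetical attribute path) instead.
+single_layer = nn.Sequential(nn.Linear(64, 256), nn.ReLU(), nn.Linear(256, 64))
+dummy_hidden = torch.randn(1, 32, 64)  # (batch, seq_len, hidden_size) as in the demo config
+
+# Export one iteration of the loop body; the resulting file can be loaded
+# with LoopBodyTemplate as shown in the examples below.
+torch.onnx.export(
+    single_layer,
+    (dummy_hidden,),
+    "loop_body_template.onnx",
+    input_names=["hidden_states"],
+    output_names=["layer_output"],
+    opset_version=18,
+)
+```
+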
The standard Loop Rolling build step consists of two transformations: *Loop Body Extraction* and *Loop Rolling*. *Loop Body Extraction* returns a *LoopBodyTemplate* object, which the *LoopRolling* transformation uses as a pattern to identify individual instances of the loop body. The *LoopBodyTemplate* object is created from an ONNX file that contains one copy of the loop body you'd like to roll.
+
+If you have a graph of the loop body, or can easily create one, you can simply create a custom Loop Rolling step in Brainsmith that creates the LoopBodyTemplate object from the ONNX file and passes it to the LoopRolling transformation, as shown in the example code below.
+
+**Example: Custom Loop Rolling Step with Pre-built Loop Body Template**
+
+```python
+from brainsmith.core.plugins import step
+from finn.transformation.fpgadataflow.loop_rolling import LoopBodyTemplate, LoopRolling
+
+@step(name="custom_loop_rolling_with_template")
+def custom_loop_rolling_with_template(model, cfg):
+    """
+    Custom loop rolling step that uses a pre-created loop body ONNX file.
+
+    Use this approach when you have manually created or extracted the loop body
+    graph and saved it to an ONNX file.
+    """
+    # Load the loop body template from a pre-created ONNX file
+    # This file should contain one complete iteration of your loop body
+    loop_body_template_path = "path/to/your/loop_body_template.onnx"
+    loop_body_template = LoopBodyTemplate(loop_body_template_path)
+
+    # Apply the loop rolling transformation using your custom template
+    model = model.transform(LoopRolling(loop_body_template))
+
+    return model
+```
+
+In this approach, you need to manually create `loop_body_template.onnx` containing one instance of your repeating layer structure. You can create this file by:
+1. Extracting a subgraph from your full model using ONNX tools
+2. Building it programmatically using ONNX IR or onnxscript
+3. Exporting a single-layer model from PyTorch
+
+Otherwise, you can create a custom LoopBodyExtraction transform. One approach is to build a Python list of the ONNX nodes within the model that fully comprise one iteration of the loop body. You can then use that list to create a SubGraphView object, which can in turn be saved to an ONNX file and used to create the LoopBodyTemplate, as shown in the example code below.
+
+**Example: Custom Loop Extraction and Rolling**
+
+```python
+from brainsmith.core.plugins import step
+from finn.transformation.fpgadataflow.loop_rolling import LoopBodyTemplate, LoopRolling
+from finn.util import onnxscript_helpers as osh
+import onnxscript
+from onnxscript import ir
+import onnx
+
+class CustomLoopExtraction:
+    """
+    Custom loop body extraction that identifies loop body nodes
+    without relying on PyTorch metadata.
+    """
+
+    def __init__(self, loop_body_hierarchy):
+        self.loop_body_hierarchy = loop_body_hierarchy
+        self.loop_body_template = None
+
+    def extract_loop_body_nodes(self, graph, target_pattern):
+        """
+        Identify nodes that belong to the loop body.
+
+        This is where you implement your custom logic to find the nodes.
+        You can use pattern matching, graph analysis, or any other method. 
+ """ + extracted_nodes = [] + + # Strategy 1: Simple name prefix matching + for node in graph._nodes: + if node.name.startswith(target_pattern): + extracted_nodes.append(node) + + # Strategy 2: If prefix matching fails, try pattern in node name + if not extracted_nodes: + layer_id = target_pattern.split('.')[-1] + for node in graph._nodes: + if f".{layer_id}." in node.name or f"_{layer_id}_" in node.name: + extracted_nodes.append(node) + + return extracted_nodes + + def apply(self, model): + """Extract loop body and create template file.""" + # Deserialize the model to ONNX IR + model_ir = onnxscript.ir.serde.deserialize_model(model.model) + graph = model_ir.graph + + # Get the target pattern from hierarchy + target_pattern = self.loop_body_hierarchy[0][-1] + + # Extract nodes belonging to the loop body + nodes = self.extract_loop_body_nodes(graph, target_pattern) + + if not nodes: + raise ValueError(f"No nodes found matching pattern: {target_pattern}") + + print(f"Extracted {len(nodes)} nodes for loop body") + + # Create a SubGraphView containing only the loop body nodes + loop_body_graph_view = osh.SubGraphView(graph, "loop-body", nodes) + + # Create an ONNX model from the subgraph + loop_body_model = onnxscript.ir.Model( + loop_body_graph_view, + ir_version=model.model.ir_version + ) + + # Serialize and save the loop body template + proto = onnxscript.ir.serde.serialize_model(loop_body_model) + template_path = "loop-body-template.onnx" + onnx.save(proto, template_path) + + print(f"Loop body template saved to: {template_path}") + + # Create the LoopBodyTemplate object + self.loop_body_template = LoopBodyTemplate(template_path) + + return model + +@step(name="custom_loop_rolling_full") +def custom_loop_rolling_full(model, cfg): + """ + Complete custom loop rolling step with custom extraction. + + This approach: + 1. Uses custom logic to identify loop body nodes + 2. Creates a loop body template from those nodes + 3. Applies FINN's LoopRolling transformation + """ + # Get loop body hierarchy from config + hierarchy = cfg.loop_body_hierarchy if hasattr(cfg, 'loop_body_hierarchy') \ + else [['encoder', 'encoder.layer.0']] + + # Step 1: Custom extraction to create loop body template + extractor = CustomLoopExtraction(hierarchy) + model = extractor.apply(model) + + # Step 2: Apply FINN's loop rolling with the custom template + if extractor.loop_body_template is None: + raise ValueError("Loop body extraction failed - no template created") + + model = model.transform(LoopRolling(extractor.loop_body_template)) + + print("Custom loop rolling completed successfully") + + return model +``` + +**Key Points:** + +1. **CustomLoopExtraction.extract_loop_body_nodes()**: This is where you implement your custom logic to identify which nodes belong to the loop body. The example shows simple name matching, but you can implement more sophisticated graph analysis. + +2. **SubGraphView**: This FINN utility class creates a view of a subgraph given a list of nodes. It automatically handles: + - Finding all necessary inputs/outputs + - Maintaining graph connectivity + - Preserving node attributes and metadata + +3. **LoopBodyTemplate**: This class (from FINN) wraps the loop body ONNX file and provides the pattern matching infrastructure that LoopRolling needs. + +4. 
**LoopRolling transformation**: This is FINN's standard transformation that:
+   - Finds all instances of the loop body pattern in your model
+   - Replaces them with a single FINNLoop node
+   - Sets up weight streaming infrastructure
+   - Handles I/O normalization and type checking
+
+**Usage in Blueprint:**
+
+```yaml
+design_space:
+  steps:
+    - "qonnx_to_finn"
+    - "bert_streamlining"
+    - "infer_kernels"
+    - "create_dataflow_partition"
+    - "specialize_layers"
+    - "custom_loop_rolling_full"  # Your custom step
+    - "target_fps_parallelization"
+    - "apply_folding_config"
+```
+
+
+## Debugging MLO Issues
+
+### Common Problems
+
+**Missing or incorrect metadata (most common):**
+- Ensure ONNX export used `dynamo=True` to generate name scope metadata
+- Verify the ONNX model contains proper hierarchical node names
+- If unable to use dynamo export, implement a custom loop rolling step (see *Alternative: Custom Loop Rolling for Non-Dynamo Export* above)
+
+**Missing Loop Body Nodes**
+
+If a node that should be in the loop body is not included during *Loop Extraction*, this can appear in `loopbody_template.onnx` as unexpected inputs and outputs to the loop body graph. Further, this can result in loop rolling failure or errors in subsequent build steps like `step_create_dataflow_partition`.
+
+Sometimes a node in the middle of the loop body is excluded during extraction. This can result in a self-referencing loop error in `step_create_dataflow_partition`, where the partitioning process detects invalid circular dependencies.
+
+**Debugging Steps:**
+1. Open `loopbody_template.onnx` in your build directory using Netron
+2. Check for unexpected graph inputs/outputs that should be internal to the loop body
+3. Identify which nodes are missing by comparing against your expected layer structure
+4. Adjust the `loop_body_hierarchy` configuration to include missing nodes:
+   - Try adding an additional hierarchy group for the missing node's namespace
+   - Use a broader hierarchy prefix to capture more nodes
+   - If using custom loop extraction, verify your node matching patterns
+5. Verify metadata on the missing nodes (check the `pkg.torch.onnx.name_scopes` field in Netron)
+6. Rebuild and verify that `loopbody_template.onnx` contains all expected nodes
+
+
+**Incorrect loop body identification:**
+- Check `loop_body_hierarchy` matches your model structure
+- Verify layer naming conventions in the ONNX graph
+
+
+### Debug Tools
+
+1. **Save intermediate models** - Use `save_intermediate_models: true`
+2. **Enable verification** - Use RTL simulation to check correctness
+3. **Memory tracing** - Monitor weight loading patterns
+4. 
**Performance counters** - Track cycles, bandwidth utilization + +## See Also + +- [Design Space Exploration](design_space_exploration.md) - Understanding execution trees +- [Blueprint Schema](blueprint_schema.md) - Configuration syntax +- [Hardware Kernels](hardware_kernels.md) - Building custom accelerators +- [BERT Examples](../examples/bert/) - Complete MLO implementations diff --git a/examples/bert/bert_mlo_demo.sh b/examples/bert/bert_mlo_demo.sh new file mode 100755 index 00000000..04341143 --- /dev/null +++ b/examples/bert/bert_mlo_demo.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Quick test script - matches functionality of old quicktest.sh + +set -e + +# Set longer timeout for RTL simulation (BERT models can take longer) +export LIVENESS_THRESHOLD=10000000 + +echo "Running BERT Modern Demo with Loop Rolling Test" +echo "===============================================" + +# Change to demo directory +cd "$(dirname "$0")" + +# Clean up any existing bert_mlo_demo build directory +if [ -d "${BSMITH_BUILD_DIR}/bert_mlo_demo" ]; then + echo "Removing existing bert_mlo_demo build directory..." + rm -rf "${BSMITH_BUILD_DIR}/bert_mlo_demo" +fi + +# Generate folding config +echo "Generating folding configuration..." +python gen_folding_config.py \ + --simd 4 \ + --pe 4 \ + --num_layers 2 \ + -t 1 \ + -o ./configs/bert_mlo_demo.json + +# Run BERT demo +echo "Running BERT demo with 2 layers..." +python bert_demo.py \ + -o bert_mlo_demo \ + -n 4 \ + -l 2 \ + -z 64 \ + -i 256 \ + -b 8 \ + -q 32 \ + --blueprint ./bert_mlo_demo.yaml + +echo "Bert MLO test completed!" diff --git a/examples/bert/bert_mlo_demo.yaml b/examples/bert/bert_mlo_demo.yaml new file mode 100644 index 00000000..256f3ca0 --- /dev/null +++ b/examples/bert/bert_mlo_demo.yaml @@ -0,0 +1,35 @@ + +name: "BERT Demo" +description: "Hugging face BERT model" + +extends: "../../brainsmith/blueprints/bert.yaml" + +# Configuration overrides +clock_ns: 5.0 # Target clock period in nanoseconds +output: "bitfile" # estimates | rtl | bitfile +board: "V80" # Target FPGA board +save_intermediate_models: true # Save intermediate ONNX models + +finn_config: + loop_body_hierarchy: [['encoder', 'encoder.layer.0']] + split_large_fifos: true + fifosim_n_inferences: 2 # Speed up FIFO + verify_steps: ['folded_hls_cppsim', 'stitched_ip_rtlsim'] + #verify_save_rtlsim_waveforms: true + + +design_space: + # Inherit kernels from parent blueprint (don't override with empty list) + # kernels are defined in parent bert.yaml + + # Add pre/post-processing steps to standard BERT blueprint + steps: + - at_start: + insert: + - "bert_cleanup" + - "remove_head" + - "remove_tail" + - "generate_reference_io" + + - at_end: + insert: "shell_metadata_handover" diff --git a/examples/bert_training/Layers1_config.json b/examples/bert_training/Layers1_config.json new file mode 100644 index 00000000..c352ebd2 --- /dev/null +++ b/examples/bert_training/Layers1_config.json @@ -0,0 +1,1113 @@ +{ + "Defaults": {}, + "StreamingFIFO_rtl_0": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_0": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_1": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_1": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 378 + ] + }, + "StreamingFIFO_rtl_2": { + "depth": 378, + "impl_style": "vivado", + "ram_style": "auto" + }, + "LayerNorm_hls_0": { + "SIMD": 1, + "inFIFODepths": [ + 378 + ], + "outFIFODepths": [ + 2 + ] + }, 
+ "StreamingFIFO_rtl_3": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseMul_hls_0": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_4": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_2": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_5": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "DuplicateStreams_hls_0": { + "PE": 1, + "outFIFODepths": [ + 2, + 98301 + ], + "inFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_6": { + "depth": 98301, + "impl_style": "vivado", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_7": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "Thresholding_rtl_0": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_8": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "DuplicateStreams_hls_1": { + "PE": 1, + "outFIFODepths": [ + 2, + 2, + 2 + ], + "inFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_9": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_10": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_11": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_0": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 32 + ] + }, + "StreamingDataWidthConverter_rtl_1": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 32 + ] + }, + "StreamingDataWidthConverter_rtl_2": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 32 + ] + }, + "StreamingFIFO_rtl_12": { + "depth": 32, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_13": { + "depth": 32, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_14": { + "depth": 32, + "impl_style": "rtl", + "ram_style": "auto" + }, + "MVAU_rtl_0": { + "PE": 96, + "SIMD": 4, + "inFIFODepths": [ + 32 + ], + "resType": "auto", + "outFIFODepths": [ + 2 + ] + }, + "MVAU_rtl_1": { + "PE": 96, + "SIMD": 4, + "inFIFODepths": [ + 32 + ], + "resType": "auto", + "outFIFODepths": [ + 2 + ] + }, + "MVAU_rtl_2": { + "PE": 96, + "SIMD": 4, + "inFIFODepths": [ + 32 + ], + "resType": "auto", + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_15": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_16": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_17": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_3": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingDataWidthConverter_rtl_4": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingDataWidthConverter_rtl_5": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_18": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_19": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_20": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseMul_hls_1": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "ElementwiseMul_hls_2": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "ElementwiseMul_hls_3": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_21": { + "depth": 2, + 
"impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_22": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_23": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_3": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 59943 + ] + }, + "ElementwiseAdd_hls_4": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 59943 + ] + }, + "ElementwiseAdd_hls_5": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 59943 + ] + }, + "StreamingFIFO_rtl_24": { + "depth": 59943, + "impl_style": "vivado", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_25": { + "depth": 59943, + "impl_style": "vivado", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_26": { + "depth": 59943, + "impl_style": "vivado", + "ram_style": "auto" + }, + "Shuffle_hls_0": { + "SIMD": 1, + "inFIFODepths": [ + 59943 + ], + "outFIFODepths": [ + 2 + ] + }, + "Shuffle_hls_1": { + "SIMD": 1, + "inFIFODepths": [ + 59943 + ], + "outFIFODepths": [ + 2 + ] + }, + "Shuffle_hls_2": { + "SIMD": 1, + "inFIFODepths": [ + 59943 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_27": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_28": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_29": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "Thresholding_rtl_1": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "Thresholding_rtl_2": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "Thresholding_rtl_3": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_30": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_31": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_32": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_6": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 850 + ] + }, + "StreamingDataWidthConverter_rtl_7": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2001 + ] + }, + "StreamingDataWidthConverter_rtl_8": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_33": { + "depth": 850, + "impl_style": "vivado", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_34": { + "depth": 2001, + "impl_style": "vivado", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_35": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "MVAU_rtl_3": { + "PE": 32, + "SIMD": 4, + "inFIFODepths": [ + 2001, + 2 + ], + "resType": "auto", + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_36": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_9": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_37": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "Thresholding_rtl_4": { + "PE": 4, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_38": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseMul_hls_4": { + "PE": 4, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 26623 + ] + }, + "StreamingFIFO_rtl_39": { + "depth": 26623, + "impl_style": "vivado", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_10": { + "inFIFODepths": [ + 26623 + ], + "outFIFODepths": [ + 6049 + ] + }, + 
"StreamingFIFO_rtl_40": { + "depth": 6049, + "impl_style": "vivado", + "ram_style": "auto" + }, + "HWSoftmax_hls_0": { + "SIMD": 1, + "inFIFODepths": [ + 6049 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_41": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_11": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_42": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "Thresholding_rtl_5": { + "PE": 4, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_43": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "MVAU_rtl_4": { + "PE": 32, + "SIMD": 4, + "inFIFODepths": [ + 2, + 850 + ], + "resType": "auto", + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_44": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_12": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 32 + ] + }, + "StreamingFIFO_rtl_45": { + "depth": 32, + "impl_style": "rtl", + "ram_style": "auto" + }, + "Shuffle_hls_3": { + "SIMD": 1, + "inFIFODepths": [ + 32 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_46": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "Thresholding_rtl_6": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_47": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_13": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_48": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "MVAU_rtl_5": { + "PE": 96, + "SIMD": 4, + "inFIFODepths": [ + 2 + ], + "resType": "auto", + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_49": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_14": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_50": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseMul_hls_5": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_51": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_6": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_52": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_7": { + "PE": 1, + "inFIFODepths": [ + 2, + 98301 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_53": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "LayerNorm_hls_1": { + "SIMD": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_54": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseMul_hls_6": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_55": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_8": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_56": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "DuplicateStreams_hls_2": { + "PE": 1, + "outFIFODepths": [ + 2, + 381 + ], + "inFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_57": { + "depth": 381, + "impl_style": "vivado", + "ram_style": "auto" + }, + "StreamingFIFO_rtl_58": { + "depth": 2, + "impl_style": "rtl", + 
"ram_style": "auto" + }, + "Thresholding_rtl_7": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_59": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_15": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_60": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "MVAU_rtl_6": { + "PE": 384, + "SIMD": 4, + "inFIFODepths": [ + 2 + ], + "resType": "auto", + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_61": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_16": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_62": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "Thresholding_rtl_8": { + "PE": 3, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_63": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_hls_0": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_64": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "MVAU_rtl_7": { + "PE": 384, + "SIMD": 4, + "inFIFODepths": [ + 2 + ], + "resType": "auto", + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_65": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_17": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_66": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseMul_hls_7": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_67": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_9": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_68": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_10": { + "PE": 1, + "inFIFODepths": [ + 2, + 381 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_69": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "LayerNorm_hls_2": { + "SIMD": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_70": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseMul_hls_8": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_71": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_11": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_72": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "Crop_hls_0": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_73": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "Thresholding_rtl_9": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_74": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "StreamingDataWidthConverter_rtl_18": { + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_75": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "MVAU_rtl_8": { + "PE": 1, + "SIMD": 3, + "inFIFODepths": [ + 2 + ], + "resType": "auto", + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_76": { + "depth": 2, + "impl_style": 
"rtl", + "ram_style": "auto" + }, + "Thresholding_rtl_10": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_77": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "Thresholding_rtl_11": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_78": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "MVAU_rtl_9": { + "PE": 1, + "SIMD": 1, + "inFIFODepths": [ + 2 + ], + "resType": "auto", + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_79": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseMul_hls_9": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_80": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + }, + "ElementwiseAdd_hls_12": { + "PE": 1, + "inFIFODepths": [ + 2 + ], + "outFIFODepths": [ + 2 + ] + }, + "StreamingFIFO_rtl_81": { + "depth": 2, + "impl_style": "rtl", + "ram_style": "auto" + } +} \ No newline at end of file diff --git a/examples/bert_training/bert_demo.py b/examples/bert_training/bert_demo.py new file mode 100644 index 00000000..9de81583 --- /dev/null +++ b/examples/bert_training/bert_demo.py @@ -0,0 +1,161 @@ +############################################################################ +# Copyright (C) 2025, Advanced Micro Devices, Inc. +# All rights reserved. +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# SPDX-License-Identifier: MIT +# +# @author Shane T. Fleming +# @author Thomas Keller +############################################################################ + +import argparse +import json +import os +import shutil +import sys +import tempfile +import warnings +from pathlib import Path + +import numpy as np +import onnx +from onnxsim import simplify +from qonnx.core.datatype import DataType +from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.cleanup import cleanup + +import custom_steps # Import custom steps to trigger registration + +# Add parent directory to path for imports +sys.path.append(str(Path(__file__).parent.parent.parent)) + +from brainsmith import forge + +warnings.simplefilter("ignore") + + +def generate_bert_model(args): + """Load BERT model from specified ONNX file.""" + if not os.path.exists(args.model_path): + raise FileNotFoundError(f"Model file not found: {args.model_path}") + + model = onnx.load(args.model_path) + return model + + +def run_brainsmith_dse(model, args): + """Run Brainsmith with new execution tree architecture.""" + # Create output directory + os.makedirs(args.output_dir, exist_ok=True) + model_dir = os.path.join(args.output_dir, "intermediate_models") + os.makedirs(model_dir, exist_ok=True) + + onnx.save(model, os.path.join(args.output_dir, "input.onnx")) + + # Get blueprint path from args + blueprint_path = Path(__file__).parent / args.blueprint + + # Forge the FPGA accelerator + print("Forging FPGA accelerator...") + results = forge( + model_path=os.path.join(args.output_dir, "input.onnx"), + blueprint_path=str(blueprint_path), + output_dir=args.output_dir + ) + + # Results are automatically logged by forge() + # Just check if we succeeded + stats = results.stats + if stats['successful'] == 0: + raise RuntimeError(f"No successful builds") + + # The new execution tree handles output automatically + final_model_dst = os.path.join(args.output_dir, "output.onnx") + + # Find the output from the successful execution + for segment_id, result in 
results.segment_results.items(): + if result.success and result.output_model: + shutil.copy2(result.output_model, final_model_dst) + break + + # Handle shell metadata (matches old hw_compiler.py) + handover_file = os.path.join(args.output_dir, "stitched_ip", "shell_handover.json") + if os.path.exists(handover_file): + with open(handover_file, "r") as fp: + handover = json.load(fp) + handover["num_layers"] = args.num_hidden_layers + with open(handover_file, "w") as fp: + json.dump(handover, fp, indent=4) + + return results + + +def main(): + parser = argparse.ArgumentParser( + description='BERT FINN demo using pre-trained ONNX model' + ) + + # Model configuration + parser.add_argument('-o', '--output', help='Output build directory name', required=True) + parser.add_argument('-m', '--model', dest='model_path', help='Path to ONNX model file', required=True) + parser.add_argument('-z', '--hidden_size', type=int, default=384, + help='BERT hidden_size parameter') + parser.add_argument('-n', '--num_attention_heads', type=int, default=12, + help='BERT num_attention_heads parameter') + parser.add_argument('-l', '--num_hidden_layers', type=int, default=1, + help='Number of hidden layers') + parser.add_argument('-i', '--intermediate_size', type=int, default=1536, + help='BERT intermediate_size parameter') + parser.add_argument('-b', '--bitwidth', type=int, default=8, + help='Quantization bitwidth (4 or 8)') + parser.add_argument('-q', '--seqlen', type=int, default=128, + help='Sequence length parameter') + + # Blueprint configuration + parser.add_argument('--blueprint', type=str, default='bert_demo.yaml', + help='Blueprint YAML file to use (default: bert_demo.yaml)') + + args = parser.parse_args() + + # Determine output directory + build_dir = os.environ.get("BSMITH_BUILD_DIR", "./build") + print(build_dir) + args.output_dir = os.path.join(build_dir, args.output) + + print("=" * 70) + print("BERT Demo Using Brainsmith DSE") + print("=" * 70) + print(f"Configuration:") + print(f" Hidden layers: {args.num_hidden_layers}") + print(f" Hidden size: {args.hidden_size}") + print(f" Attention heads: {args.num_attention_heads}") + print(f" Intermediate size: {args.intermediate_size}") + print(f" Bitwidth: {args.bitwidth}") + print(f" Sequence length: {args.seqlen}") + print(f" Blueprint: {args.blueprint}") + print(f" Output directory: {args.output_dir}") + print("=" * 70) + + try: + # Step 1: Generate BERT model + print("\nStep 1: Generating quantized BERT model...") + model = generate_bert_model(args) + + # Step 2: Run Brainsmith DSE + print("\nStep 2: Running Brainsmith DSE pipeline...") + result = run_brainsmith_dse(model, args) + + print("\n" + "=" * 70) + print("BUILD COMPLETED SUCCESSFULLY") + print("=" * 70) + print(f"Output directory: {args.output_dir}") + + except Exception as e: + print(f"\nERROR: Build failed with error: {e}") + raise + + +if __name__ == "__main__": + main() diff --git a/examples/bert_training/bert_demo.yaml b/examples/bert_training/bert_demo.yaml new file mode 100644 index 00000000..59500ce5 --- /dev/null +++ b/examples/bert_training/bert_demo.yaml @@ -0,0 +1,43 @@ + +name: "BERT Demo" +description: "Hugging face BERT model" + +extends: "${BSMITH_DIR}/brainsmith/blueprints/bert.yaml" + +# Configuration overrides +clock_ns: 5.0 # Target clock period in nanoseconds +output: "bitfile" # estimates | rtl | bitfile +board: "V80" # Target FPGA board +save_intermediate_models: true # Save intermediate ONNX models + +# Direct override FINN configuration options +finn_config: + 
loop_body_hierarchy: [ + ['bert', 'bert.encoder', 'bert.encoder.layer.0'] + ] + standalone_thresholds: true + folding_config_file: "${BSMITH_DIR}/examples/bert_training/initial_folding.json" + split_large_fifos: true + auto_fifo_depths: true + fifosim_n_inferences: 2 # Speed up FIFO sizing + stitched_ip_gen_dcp: true + verify_steps: + - "stitched_ip_rtlsim" + #verify_save_rtlsim_waveforms: true #This is really big + verify_save_full_context: true + verification_atol: 0.1 + +design_space: + # Inherit kernels from parent blueprint + + # Add pre/post-processing steps to standard BERT blueprint + steps: + - at_start: + insert: + - "bert_cleanup" + - "remove_head" + #- "remove_tail" + - "generate_reference_io" + + - at_end: + insert: "shell_metadata_handover" diff --git a/examples/bert_training/custom_steps.py b/examples/bert_training/custom_steps.py new file mode 100644 index 00000000..e9978319 --- /dev/null +++ b/examples/bert_training/custom_steps.py @@ -0,0 +1,145 @@ +############################################################################ +# Copyright (C) 2025, Advanced Micro Devices, Inc. +# All rights reserved. +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# SPDX-License-Identifier: MIT +# +# @author Shane T. Fleming +# @author Thomas Keller +############################################################################ + +""" +BERT-Specific Custom Build Steps + +Custom steps specifically for BERT model processing, including: +- Head and tail removal for model decomposition +- Metadata extraction for shell integration +- Reference I/O generation for validation + +These steps are highly specific to BERT model architecture and +are not general-purpose FINN dataflow compilation steps. +""" + +import os +import shutil +import logging +from typing import Any +import numpy as np + +import finn.core.onnx_exec as oxe +from qonnx.core.datatype import DataType +from qonnx.util.basic import gen_finn_dt_tensor +from brainsmith.core.plugins import step +from brainsmith.utils import apply_transforms + +logger = logging.getLogger(__name__) + + +@step( + name="remove_head", + category="bert", + description="Head removal for models" +) +def remove_head_step(model, cfg): + """Remove all nodes up to the first LayerNormalization node and rewire input.""" + + assert len(model.graph.input) == 1, "Error the graph has more inputs than expected" + tensor_to_node = {output: node for node in model.graph.node for output in node.output} + + to_remove = [] + + current_tensor = model.graph.input[0].name + current_node = model.find_consumer(current_tensor) + while current_node.op_type != "LayerNormalization": + to_remove.append(current_node) + assert len(current_node.output) == 1, "Error expected an linear path to the first LN" + current_tensor = current_node.output[0] + current_node = model.find_consumer(current_tensor) + + # Send the global input to the consumers of the layernorm output + LN_output = current_node.output[0] + consumers = model.find_consumers(LN_output) + + # Remove nodes + to_remove.append(current_node) + for node in to_remove: + model.graph.node.remove(node) + + in_vi = model.get_tensor_valueinfo(LN_output) + model.graph.input.pop() + model.graph.input.append(in_vi) + model.graph.value_info.remove(in_vi) + + # Reconnect input + for con in consumers: + for i,ip in enumerate(con.input): + if ip == LN_output: + con.input[i] = model.graph.input[0].name + + # Clean up after head removal + model = apply_transforms(model, [ + 'RemoveUnusedTensors', + 
+        'GiveReadableTensorNames'
+    ])
+
+    return model
+
+
+def _recurse_model_tail_removal(model, to_remove, node):
+    """Helper that recursively walks the BERT graph back from the second
+    output, collecting nodes to remove until the last LayerNorm is reached."""
+    if node is not None:
+        if node.op_type != "LayerNormalization":
+            to_remove.append(node)
+            for tensor in node.input:
+                _recurse_model_tail_removal(model, to_remove, model.find_producer(tensor))
+    return
+
+
+@step(
+    name="remove_tail",
+    category="bert",
+    description="BERT-specific tail removal for models"
+)
+def remove_tail_step(model, cfg):
+    """Remove everything from the 'global_out' output back to the last LayerNorm."""
+    # Direct implementation from old custom_step_remove_tail
+    out_names = [x.name for x in model.graph.output]
+    assert "global_out" in out_names, "Error: expected one of the outputs to be named 'global_out'; better pattern matching may be needed here"
+
+    to_remove = []
+    current_node = model.find_producer('global_out')
+    _recurse_model_tail_removal(model, to_remove, current_node)
+
+    for node in to_remove:
+        model.graph.node.remove(node)
+    del model.graph.output[out_names.index('global_out')]
+
+    return model
+
+
+@step(
+    name="generate_reference_io",
+    category="bert",
+    description="Reference IO generation for BERT demo"
+)
+def generate_reference_io_step(model, cfg):
+    """
+    Generate a reference input/output pair for the ONNX model after the
+    head and tail have been chopped off.
+    """
+    input_m = model.graph.input[0]
+    in_shape = [dim.dim_value for dim in input_m.type.tensor_type.shape.dim]
+    in_tensor = np.random.uniform(0, 1000, size=in_shape).astype(np.float32)
+    np.save(cfg.output_dir + "/input.npy", in_tensor)
+
+    input_t = {input_m.name: in_tensor}
+    out_name = model.graph.output[0].name
+
+    y_ref = oxe.execute_onnx(model, input_t, True)
+    np.save(cfg.output_dir + "/expected_output.npy", y_ref[out_name])
+    np.savez(cfg.output_dir + "/expected_context.npz", **y_ref)
+    return model
diff --git a/examples/bert_training/evaluate_onnx_accuracy.py b/examples/bert_training/evaluate_onnx_accuracy.py
new file mode 100755
index 00000000..8cb4a8bb
--- /dev/null
+++ b/examples/bert_training/evaluate_onnx_accuracy.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""
+Evaluate ONNX Model Accuracy on Validation Set
+"""
+
+import onnxruntime as ort
+import numpy as np
+from transformers import BertTokenizer
+from datasets import load_dataset
+import argparse
+import os
+import time
+from tqdm import tqdm
+
+
+def load_onnx_model(model_path):
+    """Load ONNX model with appropriate runtime"""
+    print(f"Loading ONNX model from {model_path}...")
+
+    is_qonnx = False
+    try:
+        with open(model_path, 'rb') as f:
+            content = f.read(50000)  # Read a chunk of the file to look for QONNX markers
+            if (b'qonnx.custom_op' in content or
+                    b'Quant(-1)' in content or
+                    b'brevitas' in content or
+                    b'QuantLinear' in content or
+                    b'qonnx:Quant' in content):
+                is_qonnx = True
+    except Exception:
+        pass
+
+    if not is_qonnx:
+        try:
+            import onnxruntime as ort
+            test_session = ort.InferenceSession(model_path)
+            test_session = None  # Clean up
+        except Exception as e:
+            if 'qonnx.custom_op' in str(e) or 'Quant(-1)' in str(e):
+                is_qonnx = True
+
+    if is_qonnx:
+        print("Detected QONNX model, using QONNX runtime...")
+        try:
+            from qonnx.core.modelwrapper import ModelWrapper
+            from qonnx.transformation.infer_shapes import InferShapes
+            from qonnx.transformation.infer_datatypes import InferDataTypes
+
+            model = ModelWrapper(model_path)
+
+            try:
+                model = model.transform(InferDataTypes())
+                model =
model.transform(InferShapes()) + except Exception as e: + print(f" - Some transformations failed: {e}") + + return model, 'qonnx' + + except ImportError: + print("QONNX not available, falling back to ONNX Runtime...") + return None, None + else: + print("Using standard ONNX Runtime...") + try: + session = ort.InferenceSession(model_path) + return session, 'onnx' + except Exception as e: + print(f"Error loading ONNX model: {e}") + return None, None + + +def predict_batch(model, model_type, input_ids_batch): + """Predict on a batch of input_ids""" + if model_type == 'onnx': + input_name = model.get_inputs()[0].name + output_name = model.get_outputs()[0].name + result = model.run([output_name], {input_name: input_ids_batch}) + logits = result[0] + + elif model_type == 'qonnx': + from qonnx.core.onnx_exec import execute_onnx + + batch_logits = [] + for i in range(input_ids_batch.shape[0]): + single_input = input_ids_batch[i:i+1] # Keep batch dimension + input_dict = {"input_ids": single_input} + + try: + output_dict = execute_onnx(model, input_dict) + + output_key = list(output_dict.keys())[-1] + logits = output_dict[output_key] + + if len(logits.shape) == 1: + logits = logits.reshape(1, -1) + + batch_logits.append(logits) + + except Exception as e: + print(f"Error processing sample {i}: {e}") + batch_logits.append(np.array([[0.0, 0.0]])) + + logits = np.vstack(batch_logits) + + return logits + + +def evaluate_model_accuracy(model, model_type, tokenizer, max_length=128, + num_samples=None, batch_size=32): + """Evaluate model accuracy on SST-2 validation set""" + print("Loading SST-2 validation dataset...") + dataset = load_dataset("glue", "sst2") + val_dataset = dataset['validation'] + + if model_type == 'qonnx' and batch_size > 8: + batch_size = 8 + print(f"Using batch size {batch_size} for QONNX model") + + if num_samples: + val_dataset = val_dataset.select(range(min(num_samples, len(val_dataset)))) + print(f"Evaluating on {len(val_dataset)} samples") + else: + print(f"Evaluating on full validation set ({len(val_dataset)} samples)") + + correct = 0 + total = 0 + + for i in tqdm(range(0, len(val_dataset), batch_size), desc="Evaluating"): + batch_end = min(i + batch_size, len(val_dataset)) + batch_samples = val_dataset[i:batch_end] + + texts = batch_samples['sentence'] + labels = batch_samples['label'] + + inputs = tokenizer( + texts, + truncation=True, + padding='max_length', + max_length=max_length, + return_tensors='np' + ) + + input_ids = inputs['input_ids'].astype(np.int64) + + try: + logits = predict_batch(model, model_type, input_ids) + predictions = np.argmax(logits, axis=-1) + + for pred, true_label in zip(predictions, labels): + if pred == true_label: + correct += 1 + total += 1 + + except Exception as e: + print(f"Error processing batch {i//batch_size}: {e}") + continue + + if total == 0: + print("No samples were successfully processed!") + return 0.0 + + accuracy = correct / total + return accuracy + + +def main(): + parser = argparse.ArgumentParser(description='Evaluate ONNX model accuracy') + parser.add_argument('--model', default='quantized_int8_model.onnx', + help='Path to ONNX model') + parser.add_argument('--max_length', type=int, default=128, + help='Maximum sequence length') + parser.add_argument('--num_samples', type=int, default=None, + help='Number of validation samples to use (default: all)') + parser.add_argument('--batch_size', type=int, default=32, + help='Batch size for evaluation') + + args = parser.parse_args() + + if not os.path.exists(args.model): + 
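+        # fail fast with a clear message instead of raising inside the loader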
print(f"Error: Model not found at {args.model}") + return + + model, model_type = load_onnx_model(args.model) + if model is None: + print("Failed to load model") + return + + print("Loading tokenizer...") + tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny') + + print("\nStarting accuracy evaluation...") + start_time = time.time() + + accuracy = evaluate_model_accuracy( + model, model_type, tokenizer, + args.max_length, args.num_samples, args.batch_size + ) + + eval_time = time.time() - start_time + + print(f"\n=== Evaluation Results ===") + print(f"Model: {args.model}") + print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)") + print(f"Evaluation time: {eval_time:.2f} seconds") + + model_size = os.path.getsize(args.model) / (1024 * 1024) + print(f"Model size: {model_size:.2f} MB") + + +if __name__ == "__main__": + main() diff --git a/examples/bert_training/initial_folding.json b/examples/bert_training/initial_folding.json new file mode 100644 index 00000000..a5fec5a8 --- /dev/null +++ b/examples/bert_training/initial_folding.json @@ -0,0 +1,175 @@ +{ + "Defaults": {}, + "ElementwiseAdd_hls_0": { + "PE": 1 + }, + "ElementwiseAdd_hls_1": { + "PE": 1 + }, + "LayerNorm_hls_0": { + "SIMD": 1 + }, + "ElementwiseMul_hls_0": { + "PE": 1 + }, + "ElementwiseMul_Add_2": { + "PE": 1 + }, + "FINNLoop_0_DuplicateStreams_hls_0": { + "PE": 2 + }, + "FINNLoop_0_Thresholding_rtl_0": { + "PE": 4 + }, + "FINNLoop_0_DuplicateStreams_hls_1": { + "PE": 4 + }, + "FINNLoop_0_MVAU_rtl_0": { + "PE": 32, + "SIMD": 4 + }, + "FINNLoop_0_MVAU_rtl_1": { + "PE": 32, + "SIMD": 4 + }, + "FINNLoop_0_MVAU_rtl_2": { + "PE": 32, + "SIMD": 4 + }, + "FINNLoop_0_ElementwiseMul_hls_0": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseMul_hls_1": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseMul_hls_2": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseAdd_hls_0": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseAdd_hls_1": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseAdd_hls_2": { + "PE": 2 + }, + "FINNLoop_0_Shuffle_hls_0": { + "SIMD": 1 + }, + "FINNLoop_0_Shuffle_hls_1": { + "SIMD": 1 + }, + "FINNLoop_0_Shuffle_hls_2": { + "SIMD": 1 + }, + "FINNLoop_0_Thresholding_rtl_1": { + "PE": 4 + }, + "FINNLoop_0_Thresholding_rtl_2": { + "PE": 4 + }, + "FINNLoop_0_Thresholding_rtl_3": { + "PE": 4 + }, + "FINNLoop_0_MVAU_rtl_3": { + "PE": 16, + "SIMD": 4 + }, + "FINNLoop_0_Thresholding_rtl_4": { + "PE": 16 + }, + "FINNLoop_0_ElementwiseMul_hls_3": { + "PE": 2 + }, + "FINNLoop_0_HWSoftmax_hls_0": { + "SIMD": 1 + }, + "FINNLoop_0_Thresholding_rtl_5": { + "PE": 4 + }, + "FINNLoop_0_MVAU_rtl_4": { + "PE": 16, + "SIMD": 4 + }, + "FINNLoop_0_Shuffle_hls_3": { + "SIMD": 1 + }, + "FINNLoop_0_Thresholding_rtl_6": { + "PE": 8 + }, + "FINNLoop_0_MVAU_rtl_5": { + "PE": 32, + "SIMD": 4 + }, + "FINNLoop_0_ElementwiseMul_hls_4": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseAdd_hls_3": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseAdd_hls_4": { + "PE": 2 + }, + "FINNLoop_0_LayerNorm_hls_0": { + "SIMD": 8 + }, + "FINNLoop_0_ElementwiseMul_hls_5": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseAdd_hls_5": { + "PE": 2 + }, + "FINNLoop_0_DuplicateStreams_hls_2": { + "PE": 4 + }, + "FINNLoop_0_Thresholding_rtl_7": { + "PE": 16 + }, + "FINNLoop_0_MVAU_rtl_6": { + "PE": 128, + "SIMD": 4 + }, + "FINNLoop_0_Thresholding_rtl_8": { + "PE": 128 + }, + "FINNLoop_0_MVAU_rtl_7": { + "PE": 128, + "SIMD": 4 + }, + "FINNLoop_0_ElementwiseMul_hls_6": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseAdd_hls_6": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseAdd_hls_7": { + "PE": 2 + }, + 
"FINNLoop_0_LayerNorm_hls_1": { + "SIMD": 8 + }, + "FINNLoop_0_ElementwiseMul_hls_7": { + "PE": 2 + }, + "FINNLoop_0_ElementwiseAdd_hls_8": { + "PE": 2 + }, + "Thresholding_rtl_0": { + "PE": 16 + }, + "MVAU_rtl_0": { + "PE": 16, + "SIMD": 16 + }, + "Thresholding_rtl_1": { + "PE": 4 + }, + "Thresholding_rtl_2": { + "PE": 2 + }, + "MVAU_rtl_1": { + "PE": 2, + "SIMD": 2 + } +} diff --git a/examples/bert_training/quantize_to_int8.py b/examples/bert_training/quantize_to_int8.py new file mode 100755 index 00000000..050ed168 --- /dev/null +++ b/examples/bert_training/quantize_to_int8.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python3 +""" +Apply PTQ Quantization using Brevitas to FP32 Model and Export to Clean ONNX +""" + +import torch +import torch.nn as nn +from transformers import BertTokenizer, BertConfig, BertForSequenceClassification +from datasets import load_dataset +import brevitas.nn as qnn +from brevitas.quant import Int8ActPerTensorFloat, Uint8ActPerTensorFloat, Int8WeightPerTensorFloat +from brevitas.graph import ModuleToModuleByInstance +from brevitas.graph.calibrate import calibration_mode +from brevitas.graph.quantize import layerwise_quantize +# from brevitas_examples.llm.llm_quant.prepare_for_quantize import replace_sdpa_with_quantizable_layers +from brevitas.graph import TorchFunctionalToModule +from brevitas.nn import ScaledDotProductAttention +import torch.nn.functional as F +from transformers.utils.fx import symbolic_trace +import argparse +import os +import numpy as np +from tqdm import tqdm +from torch.utils.data import DataLoader +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit +from qonnx.transformation.general import ( + RemoveUnusedTensors, + SortGraph, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) + + +def replace_sdpa_with_quantizable_layers(model): + """Replace scaled dot product attention with quantizable version""" + fn_to_module_map = ((F.scaled_dot_product_attention, ScaledDotProductAttention),) + model = TorchFunctionalToModule(fn_to_module_map=fn_to_module_map).apply(model) + return model + + +def create_tinybert_config(): + """Create TinyBERT configuration""" + config = BertConfig( + vocab_size=30522, + hidden_size=384, + num_hidden_layers=6, + num_attention_heads=12, + intermediate_size=1536, + hidden_act="relu", + num_labels=2 + ) + return config + + +def load_fp32_model(model_path, max_length=128): + """Load the trained FP32 model""" + print(f"Loading FP32 model from {model_path}...") + config = create_tinybert_config() + model = BertForSequenceClassificationWrapper(config, max_length) + model.load_state_dict(torch.load(model_path, map_location='cpu', weights_only=False)) + model.eval() + return model + + +def apply_bert_quantization(model, config, bitwidth=8, seqlen=128): + """Apply BERT-style quantization using layerwise approach""" + print(f"Applying BERT-style quantization with {bitwidth}-bit precision...") + + dtype = torch.float32 + model.to(dtype=dtype) + model.eval() + vocab_size = model.config.vocab_size + batch_size = 1 + + input_ids = torch.randint(vocab_size, (batch_size, seqlen), dtype=torch.int64) + inp = {'input_ids': input_ids} + + print("Performing symbolic tracing...") + input_names = inp.keys() + model = symbolic_trace(model, input_names, disable_check=True) + + 
print("Replacing SDPA with quantizable variants...") + model = replace_sdpa_with_quantizable_layers(model) + print("Replacement done.") + + unsigned_hidden_act = config.hidden_act == 'relu' + layerwise_compute_layer_map = {} + + # Linear layer quantization + layerwise_compute_layer_map[nn.Linear] = ( + qnn.QuantLinear, + { + 'input_quant': lambda module: Uint8ActPerTensorFloat + if module.in_features == config.intermediate_size and unsigned_hidden_act + else Int8ActPerTensorFloat, + 'weight_quant': Int8WeightPerTensorFloat, + 'weight_bit_width': bitwidth, + 'output_quant': None, + 'bias_quant': None, + 'return_quant_tensor': False + } + ) + + layerwise_compute_layer_map[qnn.ScaledDotProductAttention] = ( + qnn.QuantScaledDotProductAttention, + { + 'softmax_input_quant': Int8ActPerTensorFloat, + 'softmax_input_bit_width': bitwidth, + 'attn_output_weights_quant': Uint8ActPerTensorFloat, + 'attn_output_weights_bit_width': bitwidth, + 'q_scaled_quant': Int8ActPerTensorFloat, + 'q_scaled_bit_width': bitwidth, + 'k_transposed_quant': Int8ActPerTensorFloat, + 'k_transposed_bit_width': bitwidth, + 'v_quant': Int8ActPerTensorFloat, + 'v_bit_width': bitwidth, + 'out_quant': Int8ActPerTensorFloat, + 'out_bit_width': bitwidth, + 'return_quant_tensor': False + } + ) + + # HardTanh quantization (replacing Tanh) + layerwise_compute_layer_map[nn.Tanh] = ( + qnn.QuantHardTanh, + { + 'input_quant': None, + 'act_quant': Int8ActPerTensorFloat, + 'act_bit_width': bitwidth, + 'min_val': -1.0, + 'max_val': 1.0, + 'return_quant_tensor': False + } + ) + + print("Applying layerwise quantization...") + model = layerwise_quantize( + model=model, + compute_layer_map=layerwise_compute_layer_map + ) + model.to(dtype=dtype) + + print("BERT quantization completed.") + return model + + +def calibrate_model(model, tokenizer, num_samples=1600, max_length=128): + """Calibrate the quantized model with sample data using proper calibration mode""" + print(f"Calibrating model with ~{num_samples} samples...") + + dataset = load_dataset("glue", "sst2") + calibration_samples = dataset["train"].shuffle(seed=42).select(range(num_samples)) + + def tokenize_function(examples): + return tokenizer( + examples["sentence"], + truncation=True, + padding="max_length", + max_length=max_length, + return_tensors="pt" + ) + + calibration_data = calibration_samples.map(tokenize_function, batched=True) + calibration_data.set_format(type="torch", columns=["input_ids"]) + calibration_dataloader = DataLoader(calibration_data, batch_size=32, shuffle=False) + + model.eval() + device = next(model.parameters()).device + + with torch.no_grad(), calibration_mode(model): + for batch_idx, batch in enumerate(tqdm(calibration_dataloader, desc="Calibrating")): + input_ids = batch["input_ids"].to(device) + + _ = model(input_ids) + + if batch_idx >= 50: + break + + print("Calibration completed") + +class BertForSequenceClassificationWrapper(BertForSequenceClassification): + def __init__(self, config, max_length=128): + super().__init__(config) + self.max_length = max_length + + def forward(self, input_ids): + batch_size = input_ids.shape[0] + attention_mask = torch.ones((batch_size, self.max_length), dtype=torch.long, device=input_ids.device) + return super().forward(input_ids=input_ids, attention_mask=attention_mask) + + +def apply_qonnx_cleanup(model_path): + """Apply QONNX cleanup transformations to reduce complexity""" + + try: + model = ModelWrapper(model_path) + + print(f" Original model has {len(model.graph.node)} nodes") + + model = 
model.transform(InferDataTypes()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(SortGraph()) + model = model.transform(FoldConstants()) + model = model.transform(RemoveUnusedTensors()) + + model = model.transform(FoldTransposeIntoQuantInit()) + + print(f" Cleaned model has {len(model.graph.node)} nodes") + + cleaned_path = model_path.replace('.onnx', '_cleaned.onnx') + model.save(cleaned_path) + + print(f" Cleaned model saved to: {cleaned_path}") + return cleaned_path + + except Exception as e: + print(f" QONNX cleanup failed: {e}") + return model_path + + +def export_quantized_to_onnx(model, output_path, max_length=128): + """Export quantized model to clean ONNX""" + device = next(model.parameters()).device + model.eval() + + dummy_input = torch.ones(1, max_length, dtype=torch.long).to(device) + + from brevitas.export import export_qonnx + print(f"Attempting QONNX export with dynamo=True...") + export_qonnx(model, dummy_input, output_path, dynamo=True) + print(f"QONNX export successful") + + print(f"Quantized ONNX model saved to: {output_path}") + cleaned_path = apply_qonnx_cleanup(output_path) + + return cleaned_path + + +def validate_quantized_model(original_model, quantized_model, tokenizer, max_length=128): + print("Validating quantized model...") + + dataset = load_dataset("glue", "sst2") + test_samples = dataset['validation'].shuffle(seed=42).select(range(100)) + + original_model.eval() + quantized_model.eval() + device = next(quantized_model.parameters()).device + + original_correct = 0 + quantized_correct = 0 + + with torch.no_grad(): + for sample in test_samples: + # Tokenize + inputs = tokenizer( + sample['sentence'], + truncation=True, + padding='max_length', + max_length=max_length, + return_tensors='pt' + ) + + input_ids = inputs['input_ids'].to(device) + true_label = sample['label'] + + orig_outputs = original_model(input_ids) + orig_pred = torch.argmax(orig_outputs.logits, dim=-1).item() + if orig_pred == true_label: + original_correct += 1 + + quant_outputs = quantized_model(input_ids) + # Handle different output formats + if hasattr(quant_outputs, 'logits'): + quant_logits = quant_outputs.logits + elif isinstance(quant_outputs, dict) and 'logits' in quant_outputs: + quant_logits = quant_outputs['logits'] + else: + # If it's a tensor or other format, assume it's the logits directly + quant_logits = quant_outputs + quant_pred = torch.argmax(quant_logits, dim=-1).item() + if quant_pred == true_label: + quantized_correct += 1 + + orig_acc = original_correct / len(test_samples) * 100 + quant_acc = quantized_correct / len(test_samples) * 100 + + print(f"Original model accuracy: {orig_acc:.2f}%") + print(f"Quantized model accuracy: {quant_acc:.2f}%") + print(f"Accuracy difference: {quant_acc - orig_acc:+.2f}%") + + +def main(): + parser = argparse.ArgumentParser(description='Quantize FP32 Model to INT8 and Export to ONNX') + parser.add_argument('--input_model', default='best_fp32_model.pth', + help='Path to FP32 PyTorch model') + parser.add_argument('--output', default='quantized_int8_model.onnx', + help='Output quantized ONNX path') + parser.add_argument('--calibration_samples', type=int, default=1600, + help='Number of samples for calibration') + parser.add_argument('--bitwidth', type=int, default=8, + help='Quantization bit width') + parser.add_argument('--max_length', type=int, default=128, + help='Maximum sequence length') + 
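+    # --validate compares the calibrated quantized model against the FP32
+    # baseline on a small SST-2 sample before export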
parser.add_argument('--validate', action='store_true', + help='Validate quantized model accuracy') + + args = parser.parse_args() + + if not os.path.exists(args.input_model): + print(f"Error: Input model not found at {args.input_model}") + print("Please run train_fp32_model.py first") + return + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + print(f"Using device: {device}") + + tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny') + original_model = load_fp32_model(args.input_model, args.max_length) + original_model.to(device) + + config = create_tinybert_config() + quantized_model = apply_bert_quantization(original_model, config, args.bitwidth, args.max_length) + quantized_model.to(device) + + print(f"Quantized model has {sum(p.numel() for p in quantized_model.parameters()):,} parameters") + + calibrate_model(quantized_model, tokenizer, args.calibration_samples, args.max_length) + + if args.validate: + validate_quantized_model(original_model, quantized_model, tokenizer, args.max_length) + + cleaned_model_path = export_quantized_to_onnx(quantized_model, args.output, args.max_length) + + torch.save(quantized_model.state_dict(), 'quantized_int8_model.pth') + + print(f"\nQuantization completed!") + print(f"Quantized ONNX model saved to: {args.output}") + if cleaned_model_path != args.output: + print(f"Cleaned ONNX model saved to: {cleaned_model_path}") + print(f"Quantized PyTorch model saved to: quantized_int8_model.pth") + + +if __name__ == "__main__": + main() diff --git a/examples/bert_training/train_fp32_model.py b/examples/bert_training/train_fp32_model.py new file mode 100755 index 00000000..8c6d98f9 --- /dev/null +++ b/examples/bert_training/train_fp32_model.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +""" +Train FP32 TinyBERT Classification Model and Export to Clean ONNX +""" + +import torch +import torch.nn as nn +from torch.utils.data import DataLoader +from transformers import BertTokenizer, BertConfig, BertForSequenceClassification +from datasets import load_dataset +import numpy as np +import onnx +import onnxsim +import argparse +import os +from tqdm import tqdm + + +def create_tinybert_config(): + """Create TinyBERT configuration""" + config = BertConfig( + vocab_size=30522, + hidden_size=384, + num_hidden_layers=6, + num_attention_heads=12, + intermediate_size=1536, + hidden_act="relu", + num_labels=2 + ) + return config + + +def load_and_preprocess_data(tokenizer, max_length=128): + """Load and preprocess SST-2 dataset""" + print("Loading SST-2 dataset...") + dataset = load_dataset("glue", "sst2") + + def tokenize_data(examples): + return tokenizer( + examples['sentence'], + truncation=True, + padding='max_length', + max_length=max_length + ) + + # Tokenize datasets + train_dataset = dataset['train'].map(tokenize_data, batched=True) + val_dataset = dataset['validation'].map(tokenize_data, batched=True) + + # Set format for PyTorch + train_dataset.set_format(type='torch', columns=['input_ids', 'label']) + val_dataset.set_format(type='torch', columns=['input_ids', 'label']) + + return train_dataset, val_dataset + + +def train_model(model, train_loader, val_loader, device, epochs=3): + """Train the model""" + optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5) + criterion = nn.CrossEntropyLoss() + + model.to(device) + best_val_acc = 0 + + for epoch in range(epochs): + # Training + model.train() + total_loss = 0 + correct = 0 + total = 0 + + print(f"\nEpoch {epoch+1}/{epochs}") + train_pbar = tqdm(train_loader, desc="Training") + + 
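+        # standard supervised loop: forward, cross-entropy loss, backward, step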
for batch in train_pbar: + input_ids = batch['input_ids'].to(device) + labels = batch['label'].to(device) + + optimizer.zero_grad() + outputs = model(input_ids) + loss = criterion(outputs.logits, labels) + loss.backward() + optimizer.step() + + total_loss += loss.item() + _, predicted = torch.max(outputs.logits.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + train_pbar.set_postfix({ + 'loss': f'{loss.item():.4f}', + 'acc': f'{100.*correct/total:.2f}%' + }) + + train_acc = 100. * correct / total + + # Validation + model.eval() + val_correct = 0 + val_total = 0 + val_loss = 0 + + with torch.no_grad(): + for batch in tqdm(val_loader, desc="Validation"): + input_ids = batch['input_ids'].to(device) + labels = batch['label'].to(device) + + outputs = model(input_ids) + loss = criterion(outputs.logits, labels) + val_loss += loss.item() + + _, predicted = torch.max(outputs.logits.data, 1) + val_total += labels.size(0) + val_correct += (predicted == labels).sum().item() + + val_acc = 100. * val_correct / val_total + + print(f"Epoch {epoch+1}: Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%") + + # Save best model + if val_acc > best_val_acc: + best_val_acc = val_acc + torch.save(model.state_dict(), 'best_fp32_model.pth') + print(f"New best model saved with validation accuracy: {val_acc:.2f}%") + + return best_val_acc + + +def export_to_onnx(model, tokenizer, output_path, max_length=128): + """Export model to clean ONNX format""" + print("Exporting to ONNX...") + + model.eval() + device = next(model.parameters()).device + + # Create dummy input + dummy_input = torch.ones(1, max_length, dtype=torch.long).to(device) + + # Export to ONNX + torch.onnx.export( + model, + dummy_input, + output_path, + export_params=True, + opset_version=17, + do_constant_folding=True, + input_names=['input_ids'], + output_names=['logits'], + dynamic_axes={ + 'input_ids': {0: 'batch_size'}, + 'logits': {0: 'batch_size'} + } + ) + + # Simplify ONNX model + print("Simplifying ONNX model...") + model_onnx = onnx.load(output_path) + model_onnx, check = onnxsim.simplify(model_onnx) + assert check, "Simplified ONNX model could not be validated" + onnx.save(model_onnx, output_path) + + print(f"Clean ONNX model saved to: {output_path}") + + +def main(): + parser = argparse.ArgumentParser(description='Train FP32 TinyBERT and Export to ONNX') + parser.add_argument('--epochs', type=int, default=3, help='Number of training epochs') + parser.add_argument('--batch_size', type=int, default=32, help='Batch size') + parser.add_argument('--max_length', type=int, default=128, help='Maximum sequence length') + parser.add_argument('--output', default='fp32_model.onnx', help='Output ONNX path') + + args = parser.parse_args() + + # Setup + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + print(f"Using device: {device}") + + # Load tokenizer and create model + print("Loading tokenizer and creating model...") + tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny') + config = create_tinybert_config() + model = BertForSequenceClassification(config) + + print(f"Model has {sum(p.numel() for p in model.parameters()):,} parameters") + + # Load data + train_dataset, val_dataset = load_and_preprocess_data(tokenizer, args.max_length) + + train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) + val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False) + + print(f"Training samples: {len(train_dataset)}") + print(f"Validation samples: 
{len(val_dataset)}")
+
+    # Train model
+    best_acc = train_model(model, train_loader, val_loader, device, args.epochs)
+
+    # Load best model for export
+    model.load_state_dict(torch.load('best_fp32_model.pth'))
+    model.eval()
+
+    # Export to ONNX
+    export_to_onnx(model, tokenizer, args.output, args.max_length)
+
+    print(f"\nTraining completed!")
+    print(f"Best validation accuracy: {best_acc:.2f}%")
+    print(f"FP32 ONNX model saved to: {args.output}")
+    print(f"PyTorch model saved to: best_fp32_model.pth")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/blueprints/bert.yaml b/examples/blueprints/bert.yaml
index cabb59df..db0d219d 100644
--- a/examples/blueprints/bert.yaml
+++ b/examples/blueprints/bert.yaml
@@ -14,10 +14,13 @@ design_space:
     - DuplicateStreams
     - ElementwiseBinaryOp
     - Shuffle
+    - Crop
+    - Lookup
     - Softmax
     - finn:Thresholding
     - finn:MVAU
   steps:
     - "qonnx_to_finn"
     # Topology optimization
@@ -26,7 +29,7 @@
     # Core Brainsmith steps
     - "build_dataflow_graph"   # ONNX --> Kernels
     - "build_hw_graph"         # Kernels --> HW Backends
-    # - "loop_rolling"
+    - "loop_rolling"
     - "transpose_decomposition"
     - "brainsmith:target_fps_parallelization"
     - "apply_parallelization_config"