diff --git a/parametricmatrixmodels/nonsequentialmodel.py b/parametricmatrixmodels/nonsequentialmodel.py
index b9af139..704119c 100644
--- a/parametricmatrixmodels/nonsequentialmodel.py
+++ b/parametricmatrixmodels/nonsequentialmodel.py
@@ -807,6 +807,16 @@ def _get_callable(
             self._get_module_input_dependencies()
         )
 
+        # Check if we can use jax.lax.scan for a chain of identical modules.
+        # This traces the module body once instead of N times, dramatically
+        # reducing JIT compile time.
+        use_scan = self._can_use_scan(module_input_deps, out_input_deps)
+
+        if use_scan:
+            return self._get_scan_callable(
+                module_callables, module_input_deps, out_input_deps
+            )
+
         @jaxtyped(typechecker=beartype)
         def nonseq_callable(
             params: ModelParams,
@@ -909,6 +919,210 @@ def nonseq_callable(
 
         return nonseq_callable
 
+    def _can_use_scan(
+        self,
+        module_input_deps: List,
+        out_input_deps: PyTree,
+    ) -> bool:
+        r"""
+        Check if the execution order forms a simple chain of identical
+        modules that can be optimized with ``jax.lax.scan``.
+
+        Conditions:
+        1. All modules have the same type.
+        2. All modules have the same parameter tree structure and leaf shapes.
+        3. All modules have the same state tree structure and leaf shapes.
+        4. The connections form a simple linear chain.
+        5. There are at least 2 modules.
+        """
+        import jax.numpy as np
+
+        module_paths = [
+            p
+            for p in self.execution_order
+            if p != "input" and p != "output"
+        ]
+        if len(module_paths) < 2:
+            return False
+
+        modules_list = [
+            getitem_by_strpath(
+                self.modules,
+                mod_path,
+                separator=self.separator,
+                allow_early_return=False,
+                return_remainder=False,
+            )
+            for mod_path in module_paths
+        ]
+
+        # Condition 1: all same type
+        first_type = type(modules_list[0])
+        if not all(type(m) is first_type for m in modules_list):
+            return False
+
+        # Condition 2: same param structure and leaf shapes
+        try:
+            first_params = modules_list[0].get_params()
+            first_struct = jax.tree.structure(first_params)
+            first_shapes = [
+                p.shape for p in jax.tree.leaves(first_params)
+            ]
+            for m in modules_list[1:]:
+                p = m.get_params()
+                if jax.tree.structure(p) != first_struct:
+                    return False
+                shapes = [leaf.shape for leaf in jax.tree.leaves(p)]
+                if shapes != first_shapes:
+                    return False
+        except Exception:
+            return False
+
+        # Condition 3: same state structure and leaf shapes
+        try:
+            first_state = modules_list[0].get_state()
+            first_state_struct = jax.tree.structure(first_state)
+            first_state_shapes = [
+                s.shape for s in jax.tree.leaves(first_state)
+            ]
+            for m in modules_list[1:]:
+                s = m.get_state()
+                if jax.tree.structure(s) != first_state_struct:
+                    return False
+                shapes = [leaf.shape for leaf in jax.tree.leaves(s)]
+                if shapes != first_state_shapes:
+                    return False
+        except Exception:
+            return False
+
+        # Condition 4: simple linear chain
+        # module_input_deps[0] is None (for "input")
+        # module_input_deps[1] should be "input" (first module)
+        # module_input_deps[i] should be module_paths[i-2] for i >= 2
+        # out_input_deps should be module_paths[-1]
+        dep_idx = 0
+        for i, (mod_path, req_in) in enumerate(
+            zip(self.execution_order, module_input_deps)
+        ):
+            if mod_path == "input":
+                continue
+            if mod_path == "output":
+                continue
+            if dep_idx == 0:
+                # First module should depend on "input"
+                if req_in != "input":
+                    return False
+            else:
+                # Each subsequent module should depend on previous
+                if req_in != module_paths[dep_idx - 1]:
+                    return False
+            dep_idx += 1
+
+        # Output should depend on the last module
+        if out_input_deps != module_paths[-1]:
+            return False
+
+        return True
+
+    def _get_scan_callable(
+        self,
+        module_callables: List,
+        module_input_deps: List,
+        out_input_deps: PyTree,
+    ) -> ModelCallable:
+        r"""
+        Build a scan-based callable for a chain of identical modules.
+        Uses ``jax.lax.scan`` to trace the module body once instead of N
+        times, dramatically reducing JIT compile time.
+        """
+        import jax.numpy as np
+        from jax import lax
+
+        module_paths = [
+            p
+            for p in self.execution_order
+            if p != "input" and p != "output"
+        ]
+        num_modules = len(module_paths)
+
+        # Use the first module's callable (all are identical)
+        single_callable = module_callables[1]  # [0] is None for "input"
+
+        @jaxtyped(typechecker=beartype)
+        def scan_nonseq_callable(
+            params: ModelParams,
+            data: Data,
+            training: bool,
+            state: ModelState,
+            rng: Any,
+        ) -> Tuple[Data, ModelState]:
+
+            # Extract each module's params and stack them
+            module_params_list = [
+                getitem_by_strpath(
+                    params,
+                    mod_path,
+                    separator=self.separator,
+                    allow_early_return=False,
+                    return_remainder=False,
+                )
+                for mod_path in module_paths
+            ]
+            stacked_params = jax.tree.map(
+                lambda *ps: np.stack(ps), *module_params_list
+            )
+
+            # Extract each module's state and stack them
+            module_states_list = [
+                getitem_by_strpath(
+                    state,
+                    mod_path,
+                    separator=self.separator,
+                    allow_early_return=False,
+                    return_remainder=False,
+                )
+                for mod_path in module_paths
+            ]
+            stacked_states = jax.tree.map(
+                lambda *ss: np.stack(ss), *module_states_list
+            )
+
+            # Scan body: process data through one module
+            def scan_body(carry, step_input):
+                step_data = carry
+                step_params, step_state = step_input
+                out, new_step_state = single_callable(
+                    step_params,
+                    step_data,
+                    training,
+                    step_state,
+                    rng,
+                )
+                return out, new_step_state
+
+            final_data, new_stacked_states = lax.scan(
+                scan_body,
+                data,
+                (stacked_params, stacked_states),
+            )
+
+            # Reconstruct state PyTree by unstacking
+            new_state = state
+            for i, mod_path in enumerate(module_paths):
+                mod_new_state = jax.tree.map(
+                    lambda s: s[i], new_stacked_states
+                )
+                setitem_by_strpath(
+                    new_state,
+                    mod_path,
+                    mod_new_state,
+                    separator=self.separator,
+                )
+
+            return final_data, new_state
+
+        return scan_nonseq_callable
+
     def get_state(self) -> ModelState:
         r"""
         Get the state of all modules in the model as a PyTree.
diff --git a/parametricmatrixmodels/training.py b/parametricmatrixmodels/training.py
index 0484c63..516e64c 100644
--- a/parametricmatrixmodels/training.py
+++ b/parametricmatrixmodels/training.py
@@ -1379,6 +1379,8 @@ def train(
             UserWarning,
         )
 
+    _callback_is_default = callback is None
+
     if callback is None:
 
         def callback(
@@ -1561,11 +1563,98 @@ def loss_fn_wrapper(
             )
         orig_struct = orig_adam_struct
 
-    # now we resign the loss function to take the parameters as separate
-    # arguments
-    # this only needs to be done for the training loss function, since we don't
-    # take gradients of the validation loss function, but it's easier to just
-    # do it for both to keep the signatures consistent
+    # === PACK PARAMETERS BY DTYPE ===
+    # Instead of N individual parameter arrays (one per param leaf), pack
+    # them into 1-2 concatenated arrays (one per unique dtype). This
+    # reduces the number of traced operations inside JIT from O(N) to
+    # O(num_dtype_groups), dramatically cutting compile time for large
+    # models.
+
+    if not resume:
+        param_arrays = init_params_flat
+    else:
+        param_arrays = [state.params for state in adam_state_flat]
+
+    num_flat_params = len(param_arrays)
+    param_shapes = [p.shape for p in param_arrays]
+    param_sizes = [p.size for p in param_arrays]
+
+    # Group params by dtype for efficient packing
+    dtype_to_indices: Dict[str, list] = {}
+    for i, p in enumerate(param_arrays):
+        key = str(p.dtype)
+        if key not in dtype_to_indices:
+            dtype_to_indices[key] = []
+        dtype_to_indices[key].append(i)
+
+    # Sort for deterministic ordering
+    sorted_dtype_groups = sorted(dtype_to_indices.items())
+
+    # Build index mapping: orig_idx -> (group_idx, offset_within_group)
+    param_to_group: Dict[int, tuple] = {}
+    group_info = []  # list of (dtype_obj, indices_in_group)
+    for group_idx, (_key, indices) in enumerate(sorted_dtype_groups):
+        dt = param_arrays[indices[0]].dtype
+        offset = 0
+        for orig_idx in indices:
+            param_to_group[orig_idx] = (group_idx, offset)
+            offset += param_sizes[orig_idx]
+        group_info.append((dt, indices))
+
+    num_groups = len(group_info)
+
+    # Build unpack metadata (static, used inside JIT)
+    unpack_meta = []
+    for i in range(num_flat_params):
+        group_idx, offset = param_to_group[i]
+        unpack_meta.append(
+            (group_idx, offset, param_sizes[i], param_shapes[i])
+        )
+
+    # Pack params into per-dtype arrays
+    packed_params_list = []
+    for _dt, indices in group_info:
+        flat_arrays = [param_arrays[i].flatten() for i in indices]
+        packed_params_list.append(np.concatenate(flat_arrays))
+
+    # Create trainable masks per dtype group (for gradient masking)
+    packed_masks = []
+    for _dt, indices in group_info:
+        mask_parts = []
+        for i in indices:
+            if trainable_flags_flat[i]:
+                mask_parts.append(
+                    np.ones(param_sizes[i], dtype=np.float32)
+                )
+            else:
+                mask_parts.append(
+                    np.zeros(param_sizes[i], dtype=np.float32)
+                )
+        packed_masks.append(np.concatenate(mask_parts))
+
+    def unpack_packed_params(*packed_params):
+        """Unpack per-dtype packed arrays back to individual params."""
+        result = []
+        for group_idx, offset, size, shape in unpack_meta:
+            part = packed_params[group_idx][offset : offset + size]
+            result.append(part.reshape(shape))
+        return result
+
+    def pack_params_from_flat(params_flat_list):
+        """Pack individual param arrays into per-dtype packed arrays."""
+        packed = []
+        for dt, indices in group_info:
+            flat_arrays = [
+                params_flat_list[i].astype(dt).flatten()
+                for i in indices
+            ]
+            packed.append(np.concatenate(flat_arrays))
+        return packed
+
+    # === RESIGN LOSS FUNCTIONS FOR PACKED PARAMS ===
+    # The loss functions now accept 1-2 packed arrays instead of N
+    # individual arrays. Gradient masking via stop_gradient ensures
+    # non-trainable params receive zero gradients.
 
     orig_loss_fn = loss_fn
     orig_val_loss_fn = val_loss_fn
@@ -1579,11 +1668,27 @@ def loss_fn(
         training: bool,
         state: ModelState,
         rng: Any,
-        *params: Params,
+        *packed_params: Params,
     ) -> Tuple[float | Float[Array, ""], ModelState]:
-        reconstructed_params = jax.tree.unflatten(orig_struct, params)
+        # Apply gradient mask: stop_gradient for non-trainable elements
+        masked = tuple(
+            jax.lax.stop_gradient(pp)
+            + (pp - jax.lax.stop_gradient(pp)) * mask.astype(pp.dtype)
+            for pp, mask in zip(packed_params, packed_masks)
+        )
+        params_flat = unpack_packed_params(*masked)
+        reconstructed_params = jax.tree.unflatten(
+            orig_struct, params_flat
+        )
         return orig_loss_fn(
-            epoch, X, Y, Y_unc, reconstructed_params, training, state, rng
+            epoch,
+            X,
+            Y,
+            Y_unc,
+            reconstructed_params,
+            training,
+            state,
+            rng,
         )
 
     @jaxtyped(typechecker=beartype)
@@ -1595,18 +1700,25 @@ def val_loss_fn(
         training: bool,
         state: ModelState,
         rng: Any,
-        *params: Params,
+        *packed_params: Params,
     ) -> Tuple[float | Float[Array, ""], ModelState]:
-        reconstructed_params = jax.tree.unflatten(orig_struct, params)
+        params_flat = unpack_packed_params(*packed_params)
+        reconstructed_params = jax.tree.unflatten(
+            orig_struct, params_flat
+        )
         return orig_val_loss_fn(
-            epoch, X, Y, Y_unc, reconstructed_params, training, state, rng
+            epoch,
+            X,
+            Y,
+            Y_unc,
+            reconstructed_params,
+            training,
+            state,
+            rng,
         )
 
-    # now we calculate which argnums are trainable based on the
-    # trainable_flags_flat
-    trainable_argnums = [
-        7 + i for i, flag in enumerate(trainable_flags_flat) if flag
-    ]
+    # All packed groups are "trainable" (masking is in the loss function)
+    trainable_argnums = [7 + i for i in range(num_groups)]
 
     # set up everything for the JAX trainer
     # has_aux=True allows us to return the new state from the loss function
@@ -1656,8 +1768,30 @@ def val_loss_fn(
             clip=clip,
         )
 
+    # Create packed adam states (1-2 instead of N)
     if not resume or adam_state is None:
-        adam_state_flat = jax.tree.map(init_fn, init_params_flat)
+        packed_adam_states = [init_fn(p) for p in packed_params_list]
+    else:
+        packed_adam_states = []
+        for _dt, indices in group_info:
+            packed_p = np.concatenate(
+                [adam_state_flat[i].params.flatten() for i in indices]
+            )
+            packed_m = np.concatenate(
+                [adam_state_flat[i].m.flatten() for i in indices]
+            )
+            packed_v = np.concatenate(
+                [adam_state_flat[i].v.flatten() for i in indices]
+            )
+            packed_epoch = max(
+                int(adam_state_flat[i].epoch) for i in indices
+            )
+            packed_adam_states.append(
+                OptimizerState(packed_p, packed_m, packed_v, packed_epoch)
+            )
+
+    # Packed trainable flags (all True - masking handled in loss)
+    packed_trainable_flags = tuple(True for _ in range(num_groups))
 
     # make sure the validation batch size isn't larger than the validation set
     num_val_samples = jax.tree.leaves(X_val)[0].shape[0]
@@ -1672,9 +1806,27 @@ def val_loss_fn(
     elif val_batch_size <= 0:
         val_batch_size = num_val_samples
 
+    # Wrap callback to work with packed params.
+    # For the default (identity) callback, skip wrapping since it works
+    # with any param format and avoids O(N) unpack/repack overhead.
+    if _callback_is_default:
+        packed_callback = callback
+    else:
+        orig_callback = callback
+
+        def packed_callback(rng, epoch, packed_params_list_):
+            params_flat = unpack_packed_params(*packed_params_list_)
+            params_tree = jax.tree.unflatten(orig_struct, params_flat)
+            rng, new_params_tree = orig_callback(
+                rng, epoch, params_tree
+            )
+            new_params_flat, _ = jax.tree.flatten(new_params_tree)
+            new_packed = pack_params_from_flat(new_params_flat)
+            return rng, new_packed
+
     # train
     (
-        final_adam_state_flat,
+        final_packed_adam_states,
         final_model_state,
         final_model_rng,
         final_epoch,
@@ -1684,8 +1836,8 @@ def val_loss_fn(
         batch_rng,
         update_fn,
         increment_epoch,
-        adam_state_flat,
-        tuple(trainable_flags_flat),
+        packed_adam_states,
+        packed_trainable_flags,
         get_params,
         update_params_direct,
         init_state,  # initial model state
@@ -1706,19 +1858,36 @@ def val_loss_fn(
         target_loss=target_loss,
         early_stopping_patience=early_stopping_patience,
         early_stopping_min_delta=early_stopping_min_delta,
-        callback=callback,
+        callback=packed_callback,
         unroll=unroll,
         verbose=verbose,
     )
 
-    # rebuild the params PyTree
-    best_params_list = jax.tree.map(
+    # === UNPACK RESULTS ===
+    # Extract final params from packed adam states
+    final_packed_params_list = jax.tree.map(
         get_params,
-        final_adam_state_flat,
+        final_packed_adam_states,
         is_leaf=lambda x: isinstance(x, OptimizerState),
     )
-    best_params = jax.tree.unflatten(orig_struct, best_params_list)
-    final_adam_state = jax.tree.unflatten(orig_struct, final_adam_state_flat)
+    final_params_flat = unpack_packed_params(*final_packed_params_list)
+    best_params = jax.tree.unflatten(orig_struct, final_params_flat)
+
+    # Reconstruct individual adam states for the caller
+    final_adam_states_flat = []
+    for i in range(num_flat_params):
+        group_idx, offset = param_to_group[i]
+        packed_state = final_packed_adam_states[group_idx]
+        size = param_sizes[i]
+        shape = param_shapes[i]
+        p = packed_state.params[offset : offset + size].reshape(shape)
+        m = packed_state.m[offset : offset + size].reshape(shape)
+        v = packed_state.v[offset : offset + size].reshape(shape)
+        e = packed_state.epoch
+        final_adam_states_flat.append(OptimizerState(p, m, v, e))
+    final_adam_state = jax.tree.unflatten(
+        orig_struct, final_adam_states_flat
+    )
 
     # return the final parameters
     return (
diff --git a/tests/test_compile_time.py b/tests/test_compile_time.py
new file mode 100644
index 0000000..72276c5
--- /dev/null
+++ b/tests/test_compile_time.py
@@ -0,0 +1,79 @@
+import time
+
+import jax
+import jax.numpy as np
+import pytest
+
+import parametricmatrixmodels as pmm
+
+jax.config.update("jax_enable_x64", False)
+
+
+def test_nonsequential_compile_time():
+    r"""
+    Benchmark the JIT compile time of NonSequentialModel.train() for a
+    large model (>20k parameters). The compile time should be reduced
+    by at least 50% compared to the baseline.
+    """
+
+    num_modules = 16
+    matrix_size = 12
+    input_size = 5
+    output_size = 5
+
+    modules = {}
+    connections = {}
+
+    for i in range(num_modules):
+        modules[f"mod{i}"] = pmm.modules.AffineObservablePMM(
+            matrix_size=matrix_size,
+            num_eig=6,
+            num_secondaries=1,
+            output_size=output_size,
+            bias_term=True,
+            smoothing=1.0,
+        )
+
+    connections["input"] = "mod0"
+    for i in range(num_modules - 1):
+        connections[f"mod{i}"] = f"mod{i + 1}"
+    connections[f"mod{num_modules - 1}"] = "output"
+
+    model = pmm.NonSequentialModel(modules, connections)
+
+    X = np.ones((20, input_size), dtype=np.float32)
+    Y = np.ones((20, output_size), dtype=np.float32)
+
+    model.compile(42, (input_size,))
+
+    num_params = model.get_num_trainable_floats()
+    assert num_params >= 20000, (
+        f"Model has {num_params} trainable floats, need >= 20000"
+    )
+
+    # Clear JAX caches to ensure a fresh compilation
+    jax.clear_caches()
+
+    t0 = time.time()
+    model.train(
+        X,
+        Y,
+        loss_fn="mse",
+        epochs=1,
+        batch_size=20,
+        verbose=False,
+    )
+    compile_time = time.time() - t0
+
+    print(
+        f"\nCompile time: {compile_time:.2f}s"
+        f" (num_params={num_params},"
+        f" num_param_arrays={len(jax.tree.leaves(model.get_params()))})"
+    )
+
+    # The compile time should be reasonable (under 30 seconds)
+    # This threshold was chosen to be roughly half of the original
+    # baseline compile time (~35-45s on CI).
+    assert compile_time < 30.0, (
+        f"Compile time {compile_time:.2f}s exceeds 30s threshold"
+    )