diff --git a/inference.py b/inference.py index bd1b661..22ef8f4 100644 --- a/inference.py +++ b/inference.py @@ -4,6 +4,7 @@ import torch import numpy as np +import open3d as o3d from tqdm import tqdm from threading import Thread from transformers import AutoTokenizer, AutoModelForCausalLM @@ -19,6 +20,294 @@ } +def save_processed_pointcloud(point_cloud_tensor, output_path, original_min_extent=None): + """ + Save the processed point cloud tensor as a PLY file to visualize what the model sees. + + Args: + point_cloud_tensor: The processed point cloud tensor from preprocess_point_cloud + output_path: Path to save the PLY file + original_min_extent: Original minimum extent for coordinate restoration + """ + # Extract the point cloud data (remove batch dimension) + point_data = point_cloud_tensor.squeeze(0).cpu().numpy() + + # The tensor structure is [grid_coord(3), xyz(3), rgb(3)] = 9 columns + if point_data.shape[1] >= 9: + grid_coords = point_data[:, 0:3] # Discretized grid coordinates + xyz_coords = point_data[:, 3:6] # Actual XYZ coordinates (after PositiveShift) + rgb_colors = point_data[:, 6:9] # RGB colors (normalized) + + # Use actual XYZ coordinates (these are the shifted/processed coordinates) + points = xyz_coords + colors = rgb_colors + + # Restore original coordinate system if min_extent provided + if original_min_extent is not None: + points = points + original_min_extent + + # Ensure colors are in [0,1] range + colors = np.clip(colors, 0, 1) + + # Create Open3D point cloud + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points) + pcd.colors = o3d.utility.Vector3dVector(colors) + + # Save as PLY + o3d.io.write_point_cloud(output_path, pcd) + print(f"Saved processed point cloud to: {output_path}") + print(f" - Number of points: {len(points)}") + print(f" - Coordinate range: [{points.min(axis=0)}, {points.max(axis=0)}]") + print(f" - Grid coordinate range: [{grid_coords.min(axis=0)}, {grid_coords.max(axis=0)}]") + else: + print(f"Warning: Unexpected tensor shape {point_data.shape}, cannot save PLY") + + +def save_grid_coordinates_pointcloud(point_cloud_tensor, output_path, grid_size_m=None, min_extent=None): + """ + Save a point cloud using the discretized grid coordinates to see the voxel grid structure. + """ + point_data = point_cloud_tensor.squeeze(0).cpu().numpy() + + if point_data.shape[1] >= 9: + grid_coords = point_data[:, 0:3] # Discretized grid coordinates + rgb_colors = point_data[:, 6:9] # RGB colors + + # Use grid coordinates as positions (voxel grid). Optionally convert to meters for downstream use + if grid_size_m is not None: + points = grid_coords.astype(float) * float(grid_size_m) + if min_extent is not None: + points = points + np.array(min_extent, dtype=float) + else: + points = grid_coords + colors = np.clip(rgb_colors, 0, 1) + + # Create Open3D point cloud + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points.astype(float)) + pcd.colors = o3d.utility.Vector3dVector(colors) + + # Save as PLY + o3d.io.write_point_cloud(output_path, pcd) + print(f"Saved grid coordinate point cloud to: {output_path}") + print(f" - Number of voxels: {len(points)}") + print(f" - Grid coordinate range: [{points.min(axis=0)}, {points.max(axis=0)}]") + + +def preprocess_point_cloud_adaptive_unified(points, colors, base_grid_size, num_bins, density_levels=3): + """ + Unified adaptive preprocessing that maintains single coordinate system. + Uses density-weighted sampling within a single grid. 
+ """ + from sklearn.neighbors import NearestNeighbors + + print(f"Unified adaptive preprocessing with {density_levels} density levels...") + + # Calculate local point density + k_neighbors = min(20, len(points) // 10) + nbrs = NearestNeighbors(n_neighbors=k_neighbors).fit(points) + distances, _ = nbrs.kneighbors(points) + local_density = 1.0 / (distances[:, -1] + 1e-6) + + # Normalize density to [0, 1] range + min_density = np.min(local_density) + max_density = np.max(local_density) + normalized_density = (local_density - min_density) / (max_density - min_density + 1e-6) + + # Create density-based sampling weights + # Higher density points get higher sampling probability + sampling_weights = 0.1 + 0.9 * normalized_density # Range [0.1, 1.0] + + # Use adaptive grid size based on overall density distribution + density_percentile_75 = np.percentile(normalized_density, 75) + adaptive_grid_scale = 1.0 / (1.0 + density_percentile_75) # Smaller grid for denser clouds + final_grid_size = base_grid_size * adaptive_grid_scale + + print(f" Adaptive grid size: {final_grid_size:.6f} (scale: {adaptive_grid_scale:.3f})") + print(f" Density range: [{min_density:.2f}, {max_density:.2f}]") + print(f" Sampling weights range: [{np.min(sampling_weights):.2f}, {np.max(sampling_weights):.2f}]") + + # Apply standard preprocessing with adaptive grid + transform = Compose([ + dict(type="PositiveShift"), + dict(type="NormalizeColor"), + dict(type="DensityWeightedGridSample", + grid_size=final_grid_size, + hash_type="fnv", + mode="test", + keys=("coord", "color"), + return_grid_coord=True, + max_grid_coord=num_bins, + sampling_weights=sampling_weights), # Custom transform + ]) + + # Fallback to regular GridSample if custom transform not available + try: + point_cloud = transform({ + "coord": points.copy(), + "color": colors.copy(), + }) + except: + print(" Fallback to regular GridSample with adaptive grid size") + fallback_transform = Compose([ + dict(type="PositiveShift"), + dict(type="NormalizeColor"), + dict(type="GridSample", + grid_size=final_grid_size, + hash_type="fnv", + mode="test", + keys=("coord", "color"), + return_grid_coord=True, + max_grid_coord=num_bins), + ]) + point_cloud = fallback_transform({ + "coord": points.copy(), + "color": colors.copy(), + }) + + print(f" Processed points: {len(point_cloud['grid_coord']):,}") + + # Convert to tensor format + coord = point_cloud["grid_coord"] + xyz = point_cloud["coord"] + rgb = point_cloud["color"] + point_cloud_data = np.concatenate([coord, xyz, rgb], axis=1) + return torch.as_tensor(np.stack([point_cloud_data], axis=0)) + + +def preprocess_point_cloud_adaptive(points, colors, base_grid_size, num_bins, density_levels=3): + """ + Adaptive point cloud preprocessing with variable voxel density based on local point density. 
+ + Args: + points: Original point coordinates + colors: Original point colors + base_grid_size: Base voxel size for sparse regions + num_bins: Number of bins for coordinate discretization + density_levels: Number of different density levels to use + """ + from sklearn.neighbors import NearestNeighbors + + print(f"Adaptive preprocessing with {density_levels} density levels...") + + # Calculate local point density for each point + k_neighbors = min(20, len(points) // 10) # Adaptive k based on point cloud size + nbrs = NearestNeighbors(n_neighbors=k_neighbors).fit(points) + distances, _ = nbrs.kneighbors(points) + + # Use average distance to k-th neighbor as density measure (smaller = denser) + local_density = 1.0 / (distances[:, -1] + 1e-6) # Inverse distance = density + + # Create density-based regions + density_percentiles = np.linspace(0, 100, density_levels + 1) + density_thresholds = np.percentile(local_density, density_percentiles) + + processed_clouds = [] + total_points = 0 + + for level in range(density_levels): + # Define density range for this level + min_density = density_thresholds[level] + max_density = density_thresholds[level + 1] + + # Select points in this density range + if level == density_levels - 1: # Last level includes maximum + mask = (local_density >= min_density) + else: + mask = (local_density >= min_density) & (local_density < max_density) + + if not np.any(mask): + continue + + level_points = points[mask] + level_colors = colors[mask] + + # Calculate grid size for this density level + # Higher density regions get smaller voxels (more detail) + density_factor = (level + 1) / density_levels # 0.33, 0.67, 1.0 for 3 levels + level_grid_size = base_grid_size * (1.0 / (density_factor ** 0.5)) # Square root scaling + + print(f" Level {level + 1}: {np.sum(mask):,} points, grid_size={level_grid_size:.6f}") + + # Process this density level + transform = Compose([ + dict(type="PositiveShift"), + dict(type="NormalizeColor"), + dict( + type="GridSample", + grid_size=level_grid_size, + hash_type="fnv", + mode="test", + keys=("coord", "color"), + return_grid_coord=True, + max_grid_coord=num_bins, # Use same coordinate space for all levels + ), + ]) + + level_cloud = transform({ + "coord": level_points.copy(), + "color": level_colors.copy(), + }) + + processed_clouds.append(level_cloud) + total_points += len(level_cloud["grid_coord"]) + + # Combine all density levels + if len(processed_clouds) == 1: + final_cloud = processed_clouds[0] + else: + final_cloud = combine_density_levels(processed_clouds, num_bins) + + print(f" Total processed points: {total_points:,}") + + # Convert to tensor format + coord = final_cloud["grid_coord"] + xyz = final_cloud["coord"] + rgb = final_cloud["color"] + point_cloud = np.concatenate([coord, xyz, rgb], axis=1) + return torch.as_tensor(np.stack([point_cloud], axis=0)) + + +def combine_density_levels(processed_clouds, num_bins): + """Combine multiple density-level point clouds into one unified coordinate system.""" + all_coords = [] + all_xyz = [] + all_colors = [] + + for i, cloud in enumerate(processed_clouds): + all_coords.append(cloud["grid_coord"]) + all_xyz.append(cloud["coord"]) + all_colors.append(cloud["color"]) + + combined_cloud = { + "grid_coord": np.vstack(all_coords), + "coord": np.vstack(all_xyz), + "color": np.vstack(all_colors), + } + + # Remove duplicate voxels (keep the one from the highest density level) + # Since we process from low to high density, later entries have higher priority + grid_coords = 
combined_cloud["grid_coord"] + + # Create unique voxel keys + voxel_keys = grid_coords[:, 0] * (num_bins ** 2) + grid_coords[:, 1] * num_bins + grid_coords[:, 2] + + # Keep last occurrence (highest density level) for each voxel + _, unique_indices = np.unique(voxel_keys[::-1], return_index=True) + unique_indices = len(voxel_keys) - 1 - unique_indices # Reverse back to original indices + unique_indices = np.sort(unique_indices) # Sort to maintain order + + # Keep only unique voxels (prioritizing higher density levels) + combined_cloud["grid_coord"] = combined_cloud["grid_coord"][unique_indices] + combined_cloud["coord"] = combined_cloud["coord"][unique_indices] + combined_cloud["color"] = combined_cloud["color"][unique_indices] + + print(f" Combined: {len(unique_indices):,} unique voxels after deduplication") + + return combined_cloud + + def preprocess_point_cloud(points, colors, grid_size, num_bins): transform = Compose( [ @@ -78,47 +367,70 @@ def generate_layout( prompt = f"<|point_start|><|point_pad|><|point_end|>{task_prompt} The reference code is as followed: {code_template}" # prepare the conversation data - if model.config.model_type == "spatiallm_qwen": - conversation = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": prompt}, - ] - else: - conversation = [{"role": "user", "content": prompt}] + conversation = [{"role": "user", "content": prompt}] input_ids = tokenizer.apply_chat_template( conversation, add_generation_prompt=True, return_tensors="pt" ) input_ids = input_ids.to(model.device) - streamer = TextIteratorStreamer( - tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True - ) + # Choose between streaming and non-streaming based on num_beams + if num_beams == 1: + # Use streaming for single beam (default behavior) + print("Using streaming generation (num_beams=1)") + + streamer = TextIteratorStreamer( + tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True + ) - generate_kwargs = dict( - {"input_ids": input_ids, "point_clouds": point_cloud}, - streamer=streamer, - max_new_tokens=max_new_tokens, - do_sample=True, - use_cache=True, - temperature=temperature, - top_p=top_p, - top_k=top_k, - num_beams=num_beams, - ) - t = Thread(target=model.generate, kwargs=generate_kwargs) - t.start() + generate_kwargs = dict( + {"input_ids": input_ids, "point_clouds": point_cloud}, + streamer=streamer, + max_new_tokens=max_new_tokens, + do_sample=True, + use_cache=True, + temperature=temperature, + top_p=top_p, + top_k=top_k, + num_beams=num_beams, + ) + t = Thread(target=model.generate, kwargs=generate_kwargs) + t.start() - print("Generating layout...\n") - generate_texts = [] - for text in streamer: - generate_texts.append(text) - print(text, end="", flush=True) - print("\nDone!") + print("Generating layout...\n") + generate_texts = [] + for text in streamer: + generate_texts.append(text) + print(text, end="", flush=True) + print("\nDone!") + + layout_str = "".join(generate_texts) + + else: + # Use non-streaming for multiple beams (num_beams > 1) + print(f"Using non-streaming generation (num_beams={num_beams})") + + generate_kwargs = dict( + input_ids=input_ids, + point_clouds=point_cloud, + max_new_tokens=max_new_tokens, + do_sample=True, + use_cache=True, + temperature=temperature, + top_p=top_p, + top_k=top_k, + num_beams=num_beams, + ) + + print("Generating layout...\n") + outputs = model.generate(**generate_kwargs) + print("Done!") + + generated_ids = outputs[:, input_ids.shape[-1]:] + layout_str = 
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] - layout_str = "".join(generate_texts) layout = Layout(layout_str) - layout.undiscretize_and_unnormalize(num_bins=model.config.point_config["num_bins"]) + layout.undiscretize_and_unnormalize(num_bins=num_bins) return layout @@ -267,9 +579,47 @@ def generate_layout( parser.add_argument( "--seed", type=int, - default=-1, + default=1, help="The seed to use during inference, negative value means no seed", ) + parser.add_argument( + "--save_processed_pcd", + action="store_true", + help="Save the processed point cloud as PLY to see what the model sees", + ) + parser.add_argument( + "--save_grid_pcd", + action="store_true", + help="Save the grid coordinate point cloud as PLY to see the voxel structure", + ) + parser.add_argument( + "--voxel_density_multiplier", + type=float, + default=1.0, + help="Multiply voxel count by this factor (2.0 = double voxels, default: 2.0)", + ) + parser.add_argument( + "--adaptive_sampling", + action="store_true", + help="Use adaptive voxel density based on local point density", + ) + parser.add_argument( + "--unified_adaptive", + action="store_true", + help="Use unified adaptive sampling (single coordinate system, recommended)", + ) + parser.add_argument( + "--density_levels", + type=int, + default=1, + help="Number of density levels for adaptive sampling (default: 3)", + ) + parser.add_argument("--scale", action="store_true", help="Optimize wall scales to fit grid.ply perimeter") + parser.add_argument( + "--cleanup", + action="store_true", + help="Remove intermediate artifacts (processed/grid PLYs, scale PNG). Final TXT remains.", + ) args = parser.parse_args() # load the model @@ -283,6 +633,7 @@ def generate_layout( # number of bins used for discretization num_bins = model.config.point_config["num_bins"] + print(f"Number of bins: {num_bins}") # check if the input is a single point cloud file or a folder containing multiple point cloud files if os.path.isfile(args.point_cloud): @@ -291,18 +642,48 @@ def generate_layout( point_cloud_files = glob.glob(os.path.join(args.point_cloud, "*.ply")) for point_cloud_file in tqdm(point_cloud_files): + # Track intermediates for optional cleanup + intermediates = [] + processed_ply_path = None + grid_ply_path = None + diagram_path = None + base_filename = os.path.splitext(os.path.basename(point_cloud_file))[0] # load the point cloud point_cloud = load_o3d_pcd(point_cloud_file) + # Use the standard grid size for coordinate system consistency + # The model was trained with this grid size and expects this coordinate scaling grid_size = Layout.get_grid_size(num_bins) + + # For higher voxel density, we'll modify the cleanup_pcd voxel_size instead + # This maintains coordinate system consistency while increasing point density + cleanup_voxel_size = grid_size / args.voxel_density_multiplier + print(f"Grid size: {grid_size:.6f} (standard), Cleanup voxel size: {cleanup_voxel_size:.6f} (density multiplier: {args.voxel_density_multiplier})") if not args.no_cleanup: - point_cloud = cleanup_pcd(point_cloud, voxel_size=grid_size) + point_cloud = cleanup_pcd(point_cloud, voxel_size=cleanup_voxel_size) points, colors = get_points_and_colors(point_cloud) min_extent = np.min(points, axis=0) # preprocess the point cloud to tensor features - input_pcd = preprocess_point_cloud(points, colors, grid_size, num_bins) + if args.unified_adaptive: + input_pcd = preprocess_point_cloud_adaptive_unified(points, colors, grid_size, num_bins, args.density_levels) + elif args.adaptive_sampling: + 
input_pcd = preprocess_point_cloud_adaptive(points, colors, grid_size, num_bins, args.density_levels) + else: + input_pcd = preprocess_point_cloud(points, colors, grid_size, num_bins) + + # Save intermediate processed point cloud if requested + if args.save_processed_pcd or args.save_grid_pcd: + if args.save_processed_pcd: + processed_ply_path = f"{base_filename}_processed.ply" + save_processed_pointcloud(input_pcd, processed_ply_path, min_extent) + intermediates.append(processed_ply_path) + + if args.save_grid_pcd: + grid_ply_path = f"{base_filename}_grid.ply" + save_grid_coordinates_pointcloud(input_pcd, grid_ply_path, grid_size_m=grid_size, min_extent=min_extent) + intermediates.append(grid_ply_path) # generate the layout layout = generate_layout( @@ -321,6 +702,36 @@ def generate_layout( layout.translate(min_extent) pred_language_string = layout.to_language_string() + # Perform scale optimization if requested + if args.scale: + if not args.save_grid_pcd: + print("Warning: --scale flag requires --save_grid_pcd to be active. Forcing grid PCD generation.") + grid_ply_path = f"{base_filename}_grid.ply" + save_grid_coordinates_pointcloud(input_pcd, grid_ply_path, grid_size_m=grid_size, min_extent=min_extent) + intermediates.append(grid_ply_path) + + if os.path.exists(grid_ply_path): + from scale_optimizer import run_scale_optimization + output_dir = os.path.dirname(args.output) if os.path.splitext(args.output)[-1] else args.output + if output_dir: + os.makedirs(output_dir, exist_ok=True) + diagram_path = os.path.join(output_dir, f"{base_filename}_scale_optimization.png") + intermediates.append(diagram_path) + + print(f"Running scale optimization with grid file: {grid_ply_path}") + # Prefer using the original input PLY (meters) for perimeter extraction + optimized_layout = run_scale_optimization(layout, grid_ply_path, diagram_path, source_ply_path=point_cloud_file) + + if optimized_layout: + print("Scale optimization successful. Using optimized layout.") + layout = optimized_layout + pred_language_string = layout.to_language_string() + else: + print("Scale optimization failed. Using original layout.") + else: + print(f"Warning: Grid PLY file not found at {grid_ply_path}. 
Skipping scale optimization.") + + # check if the output path is a file or directory if os.path.splitext(args.output)[-1]: with open(args.output, "w") as f: @@ -330,3 +741,13 @@ def generate_layout( os.makedirs(args.output, exist_ok=True) with open(os.path.join(args.output, output_filename), "w") as f: f.write(pred_language_string) + + # Cleanup intermediates if requested + if args.cleanup and intermediates: + for p in set(intermediates): + try: + if p and os.path.exists(p): + os.remove(p) + print(f"Cleaned up: {p}") + except Exception as e: + print(f"Warning: Failed to remove {p}: {e}") diff --git a/inference_original.py b/inference_original.py new file mode 100644 index 0000000..bd1b661 --- /dev/null +++ b/inference_original.py @@ -0,0 +1,332 @@ +import os +import glob +import argparse + +import torch +import numpy as np +from tqdm import tqdm +from threading import Thread +from transformers import AutoTokenizer, AutoModelForCausalLM +from transformers import TextIteratorStreamer, set_seed + +from spatiallm import Layout +from spatiallm.pcd import load_o3d_pcd, get_points_and_colors, cleanup_pcd, Compose + +DETECT_TYPE_PROMPT = { + "all": "Detect walls, doors, windows, boxes.", + "arch": "Detect walls, doors, windows.", + "object": "Detect boxes.", +} + + +def preprocess_point_cloud(points, colors, grid_size, num_bins): + transform = Compose( + [ + dict(type="PositiveShift"), + dict(type="NormalizeColor"), + dict( + type="GridSample", + grid_size=grid_size, + hash_type="fnv", + mode="test", + keys=("coord", "color"), + return_grid_coord=True, + max_grid_coord=num_bins, + ), + ] + ) + point_cloud = transform( + { + "name": "pcd", + "coord": points.copy(), + "color": colors.copy(), + } + ) + coord = point_cloud["grid_coord"] + xyz = point_cloud["coord"] + rgb = point_cloud["color"] + point_cloud = np.concatenate([coord, xyz, rgb], axis=1) + return torch.as_tensor(np.stack([point_cloud], axis=0)) + + +def generate_layout( + model, + point_cloud, + tokenizer, + code_template_file, + top_k=10, + top_p=0.95, + temperature=0.6, + num_beams=1, + seed=-1, + max_new_tokens=4096, + detect_type="all", + categories=[], +): + if seed >= 0: + set_seed(seed) + + # load the code template + with open(code_template_file, "r") as f: + code_template = f.read() + + task_prompt = DETECT_TYPE_PROMPT[detect_type] + if detect_type != "arch" and categories: + task_prompt = task_prompt.replace("boxes", ", ".join(categories)) + print("Task prompt: ", task_prompt) + + prompt = f"<|point_start|><|point_pad|><|point_end|>{task_prompt} The reference code is as followed: {code_template}" + + # prepare the conversation data + if model.config.model_type == "spatiallm_qwen": + conversation = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt}, + ] + else: + conversation = [{"role": "user", "content": prompt}] + + input_ids = tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, return_tensors="pt" + ) + input_ids = input_ids.to(model.device) + + streamer = TextIteratorStreamer( + tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True + ) + + generate_kwargs = dict( + {"input_ids": input_ids, "point_clouds": point_cloud}, + streamer=streamer, + max_new_tokens=max_new_tokens, + do_sample=True, + use_cache=True, + temperature=temperature, + top_p=top_p, + top_k=top_k, + num_beams=num_beams, + ) + t = Thread(target=model.generate, kwargs=generate_kwargs) + t.start() + + print("Generating layout...\n") + generate_texts = [] + for text in 
streamer: + generate_texts.append(text) + print(text, end="", flush=True) + print("\nDone!") + + layout_str = "".join(generate_texts) + layout = Layout(layout_str) + layout.undiscretize_and_unnormalize(num_bins=model.config.point_config["num_bins"]) + return layout + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("SpatialLM inference script") + parser.add_argument( + "-p", + "--point_cloud", + type=str, + required=True, + help="Path to the input point cloud file or a folder containing multiple point cloud files", + ) + parser.add_argument( + "-o", + "--output", + type=str, + required=True, + help="Path to the output layout txt file or a folder to save multiple layout txt files", + ) + parser.add_argument( + "-m", + "--model_path", + type=str, + default="manycore-research/SpatialLM-Llama-1B", + help="Path to the model checkpoint", + ) + parser.add_argument( + "-d", + "--detect_type", + type=str, + default="all", + choices=["all", "arch", "object"], + help="The type of indoor elements to detect. all: (wall, door, window, box), arch: (wall, door, window), object: (box)", + ) + parser.add_argument( + "-c", + "--category", + nargs="+", + default=[], + choices=[ + "sofa", + "chair", + "dining_chair", + "bar_chair", + "stool", + "bed", + "pillow", + "wardrobe", + "nightstand", + "tv_cabinet", + "wine_cabinet", + "bathroom_cabinet", + "shoe_cabinet", + "entrance_cabinet", + "decorative_cabinet", + "washing_cabinet", + "wall_cabinet", + "sideboard", + "cupboard", + "coffee_table", + "dining_table", + "side_table", + "dressing_table", + "desk", + "integrated_stove", + "gas_stove", + "range_hood", + "micro-wave_oven", + "sink", + "stove", + "refrigerator", + "hand_sink", + "shower", + "shower_room", + "toilet", + "tub", + "illumination", + "chandelier", + "floor-standing_lamp", + "wall_decoration", + "painting", + "curtain", + "carpet", + "plants", + "potted_bonsai", + "tv", + "computer", + "air_conditioner", + "washing_machine", + "clothes_rack", + "mirror", + "bookcase", + "cushion", + "bar", + "screen", + "combination_sofa", + "dining_table_combination", + "leisure_table_and_chair_combination", + "multifunctional_combination_bed", + ], + help="A list of categories of objects to detect. 
If not specified, all categories will be detected.", + ) + parser.add_argument( + "-t", + "--code_template_file", + type=str, + default="code_template.txt", + help="Path to the code template file", + ) + parser.add_argument( + "--top_k", + type=int, + default=10, + help="The number of highest probability vocabulary tokens to keep for top-k filtering", + ) + parser.add_argument( + "--top_p", + type=float, + default=0.95, + help="The smallest set of most probable tokens with probabilities that add up to top_p or higher are kept", + ) + parser.add_argument( + "--temperature", + type=float, + default=0.6, + help="The value used to module the next token probabilities", + ) + parser.add_argument( + "--num_beams", + type=int, + default=1, + help="The number of beams for beam search", + ) + parser.add_argument( + "--inference_dtype", + type=str, + default="bfloat16", + help="The torch dtype to use for inference, bfloat16 or float32", + ) + parser.add_argument( + "--no_cleanup", + default=False, + action="store_true", + help="Whether to not cleanup the point cloud", + ) + parser.add_argument( + "--seed", + type=int, + default=-1, + help="The seed to use during inference, negative value means no seed", + ) + args = parser.parse_args() + + # load the model + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + model = AutoModelForCausalLM.from_pretrained( + args.model_path, torch_dtype=getattr(torch, args.inference_dtype) + ) + model.to("cuda") + model.set_point_backbone_dtype(torch.float32) + model.eval() + + # number of bins used for discretization + num_bins = model.config.point_config["num_bins"] + + # check if the input is a single point cloud file or a folder containing multiple point cloud files + if os.path.isfile(args.point_cloud): + point_cloud_files = [args.point_cloud] + else: + point_cloud_files = glob.glob(os.path.join(args.point_cloud, "*.ply")) + + for point_cloud_file in tqdm(point_cloud_files): + # load the point cloud + point_cloud = load_o3d_pcd(point_cloud_file) + grid_size = Layout.get_grid_size(num_bins) + + if not args.no_cleanup: + point_cloud = cleanup_pcd(point_cloud, voxel_size=grid_size) + + points, colors = get_points_and_colors(point_cloud) + min_extent = np.min(points, axis=0) + + # preprocess the point cloud to tensor features + input_pcd = preprocess_point_cloud(points, colors, grid_size, num_bins) + + # generate the layout + layout = generate_layout( + model, + input_pcd, + tokenizer, + args.code_template_file, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + num_beams=args.num_beams, + seed=args.seed, + detect_type=args.detect_type, + categories=args.category, + ) + layout.translate(min_extent) + pred_language_string = layout.to_language_string() + + # check if the output path is a file or directory + if os.path.splitext(args.output)[-1]: + with open(args.output, "w") as f: + f.write(pred_language_string) + else: + output_filename = os.path.basename(point_cloud_file).replace(".ply", ".txt") + os.makedirs(args.output, exist_ok=True) + with open(os.path.join(args.output, output_filename), "w") as f: + f.write(pred_language_string) diff --git a/scale_optimizer.py b/scale_optimizer.py new file mode 100644 index 0000000..c3b9941 --- /dev/null +++ b/scale_optimizer.py @@ -0,0 +1,780 @@ +import numpy as np +import open3d as o3d +from scipy.optimize import minimize, Bounds, LinearConstraint +from scipy.spatial import cKDTree + +# Make matplotlib optional - if it's not available, diagrams won't be generated +try: + import matplotlib.pyplot 
as plt + HAS_MATPLOTLIB = True +except ImportError: + HAS_MATPLOTLIB = False + print("Warning: matplotlib not found. Diagrams will not be generated.") + +from copy import deepcopy + +from spatiallm.layout.layout import Layout +from spatiallm.layout.entity import Wall + +def huber_loss(distance: float, delta: float) -> float: + """Huber loss for non-negative distance.""" + if distance <= delta: + return 0.5 * distance * distance + return delta * (distance - 0.5 * delta) + +def point_to_segment_distance(point: np.ndarray, seg_start: np.ndarray, seg_end: np.ndarray) -> float: + """Distance from a point to a 2D segment.""" + edge = seg_end - seg_start + denom = float(np.dot(edge, edge)) + if denom == 0.0: + return float(np.linalg.norm(point - seg_start)) + t = float(np.dot(point - seg_start, edge) / denom) + t = max(0.0, min(1.0, t)) + closest = seg_start + t * edge + return float(np.linalg.norm(point - closest)) + +def compute_principal_angle(points: np.ndarray) -> float: + centered = points - np.mean(points, axis=0) + cov = np.cov(centered.T) + vals, vecs = np.linalg.eigh(cov) + principal_vec = vecs[:, int(np.argmax(vals))] + return float(np.arctan2(principal_vec[1], principal_vec[0])) + +def rotate_vectors(vectors: np.ndarray, angle: float) -> np.ndarray: + c, s = np.cos(angle), np.sin(angle) + R = np.array([[c, -s], [s, c]]) + return (R @ vectors.T).T + +def reconstruct_vertices_from_scales(scales: np.ndarray, vectors: np.ndarray) -> np.ndarray: + vertices = [np.zeros(2, dtype=float)] + for i in range(len(scales)): + vertices.append(vertices[-1] + scales[i] * vectors[i]) + return np.array(vertices[:-1]) + +def build_closure_matrix(vectors: np.ndarray) -> np.ndarray: + if vectors.size == 0: + return np.zeros((2, 0)) + return np.vstack([vectors[:, 0], vectors[:, 1]]) + +def compute_hull_edges(hull_points: np.ndarray): + edges = [] + dirs = [] + angles = [] + H = len(hull_points) + for i in range(H): + p0 = hull_points[i] + p1 = hull_points[(i + 1) % H] + e = p1 - p0 + L = float(np.linalg.norm(e)) + if L == 0.0: + u = np.array([1.0, 0.0]) + else: + u = e / L + edges.append((p0, p1)) + dirs.append(u) + angles.append(float(np.arctan2(u[1], u[0]))) + return edges, np.array(dirs), np.array(angles) + +def select_candidate_edges(vectors: np.ndarray, hull_dirs: np.ndarray, angle_threshold_deg: float = 20.0): + candidates = [] + cos_thresh = float(np.cos(np.deg2rad(angle_threshold_deg))) + for v in vectors: + Lv = float(np.linalg.norm(v)) + if Lv == 0.0: + uv = np.array([1.0, 0.0]) + else: + uv = v / Lv + dots = hull_dirs @ uv + idx = np.where(np.abs(dots) >= cos_thresh)[0] + if idx.size == 0: + # fallback to best 3 by angle similarity + idx = np.argsort(-np.abs(dots))[:3] + candidates.append(idx.astype(int)) + return candidates + +def make_objective( + vectors: np.ndarray, + hull_points: np.ndarray, + num_samples_per_edge: int = 15, + huber_delta: float = 0.04, + reg_lambda: float = 3e-4, + smooth_lambda: float = 1e-3, + hull_edges: list | None = None, + candidate_edges: list | None = None, + use_kdtree: bool = True, + k_neighbors: int = 8, +): + # Precompute hull edges and candidates + if hull_edges is None: + hull_edges, hull_dirs, _ = compute_hull_edges(hull_points) + else: + # Recompute dirs for safety + hull_dirs = [] + for (p0, p1) in hull_edges: + e = p1 - p0 + L = float(np.linalg.norm(e)) + hull_dirs.append(e / L if L != 0.0 else np.array([1.0, 0.0])) + hull_dirs = np.array(hull_dirs) + if candidate_edges is None: + candidate_edges = select_candidate_edges(vectors, hull_dirs) + + hull_center 
= np.mean(hull_points, axis=0) + sample_ts = np.linspace(0.0, 1.0, num_samples_per_edge, dtype=float) + + # KD-tree over hull edge midpoints for spatial filtering + if use_kdtree: + mids = np.array([(e[0] + e[1]) * 0.5 for e in hull_edges]) + tree = cKDTree(mids) + + def objective(scales: np.ndarray) -> float: + vertices = reconstruct_vertices_from_scales(scales, vectors) + vertices_center = np.mean(vertices, axis=0) + translation = hull_center - vertices_center + vertices_aligned = vertices + translation + + total = 0.0 + # data term + for i in range(len(scales)): + start = vertices_aligned[i] + end = start + scales[i] * vectors[i] + cands = candidate_edges[i] if candidate_edges is not None else range(len(hull_edges)) + if use_kdtree: + # spatial filter: nearest k edge midpoints to the segment midpoint + seg_mid = (start + end) * 0.5 + _, idxs = tree.query(seg_mid, k=min(k_neighbors, len(hull_edges))) + if np.isscalar(idxs): + idxs = np.array([int(idxs)]) + # intersect orientation and spatial candidates + cands = np.intersect1d(np.array(cands, dtype=int), idxs.astype(int), assume_unique=False) + if cands.size == 0: + cands = idxs.astype(int) + for t in sample_ts: + pt = (1.0 - t) * start + t * end + min_d = float('inf') + for j in cands: + h0, h1 = hull_edges[j] + d = point_to_segment_distance(pt, h0, h1) + if d < min_d: + min_d = d + total += huber_loss(min_d, huber_delta) + # L2 on deviation from 1 + diff = scales - 1.0 + total += reg_lambda * float(np.dot(diff, diff)) + # Smoothness across adjacent scales (circular) + if smooth_lambda > 0.0 and len(scales) > 2: + smooth = 0.0 + for i in range(len(scales)): + prev_i = (i - 1) % len(scales) + d = scales[i] - scales[prev_i] + smooth += d * d + total += smooth_lambda * float(smooth) + return float(total) + + return objective + +def transform_points_similarity(points: np.ndarray, theta: float, scale: float, translation: np.ndarray) -> np.ndarray: + """Apply 2D similarity transform x' = s R x + t to a set of points. + + points: (N,2), translation: (2,) + """ + c, s = np.cos(theta), np.sin(theta) + R = np.array([[c, -s], [s, c]]) + return (scale * (R @ points.T)).T + translation + +def transform_points_anisotropic(points: np.ndarray, theta: float, scale_x: float, scale_y: float, translation: np.ndarray) -> np.ndarray: + """Apply 2D anisotropic similarity transform x' = R diag(sx, sy) x + t.""" + c, s = np.cos(theta), np.sin(theta) + R = np.array([[c, -s], [s, c]]) + D = np.array([[scale_x, 0.0], [0.0, scale_y]]) + return (R @ (D @ points.T)).T + translation + +def build_segments_from_vertices(vertices: np.ndarray) -> np.ndarray: + """Return segments as (M,2,2) array from polygon vertices.""" + if vertices.shape[0] < 2: + return np.zeros((0, 2, 2)) + M = vertices.shape[0] + segs = [] + for i in range(M): + p0 = vertices[i] + p1 = vertices[(i + 1) % M] + segs.append([p0, p1]) + return np.array(segs) + +def robust_align_hull_to_walls(hull_points: np.ndarray, orig_vertices: np.ndarray, huber_delta: float = 0.05): + """Estimate similarity transform (theta, scale, tx, ty) aligning noisy hull to straight-line walls. + + Minimizes Huber-robust sum of distances from transformed hull points to the nearest original wall segment. 
+ """ + # Downsample hull for speed + if len(hull_points) > 600: + step = max(1, len(hull_points) // 600) + hull_sample = hull_points[::step] + else: + hull_sample = hull_points + + segments = build_segments_from_vertices(orig_vertices) + + # Initial guess from PCA-based angle and RMS radius ratio + hull_center = np.mean(hull_sample, axis=0) + orig_center = np.mean(orig_vertices, axis=0) + try: + angle_h = compute_principal_angle(hull_sample) + angle_o = compute_principal_angle(orig_vertices) + theta0 = angle_o - angle_h + except Exception: + theta0 = 0.0 + r_o = float(np.sqrt(np.mean(np.sum((orig_vertices - orig_center) ** 2, axis=1)))) + r_h = float(np.sqrt(np.mean(np.sum((hull_sample - hull_center) ** 2, axis=1)))) + s0 = (r_o / r_h) if r_h > 1e-9 else 1.0 + # Translation to align centroids under initial rot-scale + c0, s_ = np.cos(theta0), np.sin(theta0) + R0 = np.array([[c0, -s_], [s_, c0]]) + t0 = orig_center - s0 * (R0 @ hull_center) + + def objective(p: np.ndarray) -> float: + theta, scale, tx, ty = p + transformed = transform_points_similarity(hull_sample, theta, max(scale, 1e-6), np.array([tx, ty])) + total = 0.0 + for pt in transformed: + # distance to nearest wall segment + dmin = float('inf') + for seg in segments: + d = point_to_segment_distance(pt, seg[0], seg[1]) + if d < dmin: + dmin = d + total += huber_loss(dmin, huber_delta) + # small regularizer to keep scale near 1 + total += 1e-3 * (scale - 1.0) * (scale - 1.0) + return float(total) + + # Bounds: theta free in [-pi, pi], scale in [0.5, 2.0], tx,ty wide bounds + bounds = [(-np.pi, np.pi), (0.5, 2.0), (-1e6, 1e6), (-1e6, 1e6)] + x0 = np.array([theta0, s0, t0[0], t0[1]], dtype=float) + res = minimize(objective, x0, method='L-BFGS-B', bounds=bounds, options={'maxiter': 300, 'ftol': 1e-12}) + theta_opt, scale_opt, tx_opt, ty_opt = res.x + return float(theta_opt), float(scale_opt), np.array([tx_opt, ty_opt]), res + +def robust_align_hull_to_walls_anisotropic(hull_points: np.ndarray, orig_vertices: np.ndarray, huber_delta: float = 0.05): + """Estimate anisotropic similarity (theta, sx, sy, tx, ty) aligning hull to straight walls. + + Minimizes Huber-robust sum of distances from transformed hull points to nearest wall segments. 
+ """ + # Downsample hull for speed + if len(hull_points) > 800: + step = max(1, len(hull_points) // 800) + hull_sample = hull_points[::step] + else: + hull_sample = hull_points + + segments = build_segments_from_vertices(orig_vertices) + + hull_center = np.mean(hull_sample, axis=0) + orig_center = np.mean(orig_vertices, axis=0) + try: + angle_h = compute_principal_angle(hull_sample) + angle_o = compute_principal_angle(orig_vertices) + theta0 = angle_o - angle_h + except Exception: + theta0 = 0.0 + + # Start from isotropic guess + r_o = float(np.sqrt(np.mean(np.sum((orig_vertices - orig_center) ** 2, axis=1)))) + r_h = float(np.sqrt(np.mean(np.sum((hull_sample - hull_center) ** 2, axis=1)))) + s0 = (r_o / r_h) if r_h > 1e-9 else 1.0 + sx0, sy0 = s0, s0 + c0, s_ = np.cos(theta0), np.sin(theta0) + R0 = np.array([[c0, -s_], [s_, c0]]) + t0 = orig_center - (R0 @ (np.array([[sx0, 0.0],[0.0, sy0]]) @ hull_center)) + + def objective(p: np.ndarray) -> float: + theta, log_sx, log_sy, tx, ty = p + sx = float(np.exp(log_sx)) + sy = float(np.exp(log_sy)) + pts = transform_points_anisotropic(hull_sample, theta, sx, sy, np.array([tx, ty])) + total = 0.0 + for pt in pts: + # nearest segment distance + dmin = float('inf') + for seg in segments: + d = point_to_segment_distance(pt, seg[0], seg[1]) + if d < dmin: + dmin = d + total += huber_loss(dmin, huber_delta) + # mild regularization on anisotropy to avoid degenerate scaling + total += 1e-3 * ((np.log(sx) - np.log(sy)) ** 2) + return float(total) + + # Bounds and init + bounds = [(-np.pi, np.pi), (np.log(0.5), np.log(2.0)), (np.log(0.5), np.log(2.0)), (-1e6, 1e6), (-1e6, 1e6)] + x0 = np.array([theta0, np.log(sx0), np.log(sy0), t0[0], t0[1]], dtype=float) + res = minimize(objective, x0, method='L-BFGS-B', bounds=bounds, options={'maxiter': 400, 'ftol': 1e-12}) + theta_opt, log_sx_opt, log_sy_opt, tx_opt, ty_opt = res.x + return float(theta_opt), float(np.exp(log_sx_opt)), float(np.exp(log_sy_opt)), np.array([tx_opt, ty_opt]), res + +def order_walls_connected(walls, tol_primary: float = 0.05, tol_secondary: float = 0.15, allow_reverse: bool = True, inplace: bool = False): + """ + Order walls so they form a connected chain with tolerant matching. + + - Prefer direct start match within tol_primary + - Else allow reversing a wall and match within tol_primary + - Else snap nearest candidate within tol_secondary by translating endpoints equally + + Returns (ordered_walls, is_closed). 
+ """ + if not walls: + return [], False + walls_in = list(walls) if inplace else [deepcopy(w) for w in walls] + n = len(walls_in) + if n == 1: + return [walls_in[0]], True + + def d2(p1, p2): + v = np.array(p1) - np.array(p2) + return float(np.dot(v, v)) + + ordered = [walls_in[0]] + used = {0} + + for step in range(n - 1): + last_wall = ordered[-1] + last_end = np.array([last_wall.bx, last_wall.by]) + + # 1) direct start match + next_idx = -1 + min_dist2 = float('inf') + for i, w in enumerate(walls_in): + if i in used: + continue + dist2 = d2([w.ax, w.ay], last_end) + if dist2 < min_dist2: + min_dist2 = dist2 + next_idx = i + if next_idx != -1 and np.sqrt(min_dist2) <= tol_primary: + ordered.append(walls_in[next_idx]) + used.add(next_idx) + continue + + # 2) reverse match + if allow_reverse: + next_idx_r = -1 + min_dist2_r = float('inf') + for i, w in enumerate(walls_in): + if i in used: + continue + dist2 = d2([w.bx, w.by], last_end) + if dist2 < min_dist2_r: + min_dist2_r = dist2 + next_idx_r = i + if next_idx_r != -1 and np.sqrt(min_dist2_r) <= tol_primary: + wcopy = walls_in[next_idx_r] + wcopy.ax, wcopy.bx = wcopy.bx, wcopy.ax + wcopy.ay, wcopy.by = wcopy.by, wcopy.ay + ordered.append(wcopy) + used.add(next_idx_r) + print(f"Connected by reversing wall at index {next_idx_r} (step {step}).") + continue + + # 3) snap nearest within tol_secondary + candidate_idx = -1 + best_is_start = True + best_dist2 = float('inf') + for i, w in enumerate(walls_in): + if i in used: + continue + d_start = d2([w.ax, w.ay], last_end) + d_end = d2([w.bx, w.by], last_end) + if d_start < best_dist2: + best_dist2 = d_start + candidate_idx = i + best_is_start = True + if allow_reverse and d_end < best_dist2: + best_dist2 = d_end + candidate_idx = i + best_is_start = False + if candidate_idx != -1 and np.sqrt(best_dist2) <= tol_secondary: + wcopy = walls_in[candidate_idx] + if best_is_start: + delta = last_end - np.array([wcopy.ax, wcopy.ay]) + wcopy.ax += delta[0]; wcopy.ay += delta[1] + wcopy.bx += delta[0]; wcopy.by += delta[1] + ordered.append(wcopy) + used.add(candidate_idx) + print(f"Snapped wall at index {candidate_idx} by {np.linalg.norm(delta):.3f} m to connect (step {step}).") + else: + wcopy.ax, wcopy.bx = wcopy.bx, wcopy.ax + wcopy.ay, wcopy.by = wcopy.by, wcopy.ay + delta = last_end - np.array([wcopy.ax, wcopy.ay]) + wcopy.ax += delta[0]; wcopy.ay += delta[1] + wcopy.bx += delta[0]; wcopy.by += delta[1] + ordered.append(wcopy) + used.add(candidate_idx) + print(f"Reversed+snapped wall at index {candidate_idx} by {np.linalg.norm(delta):.3f} m to connect (step {step}).") + continue + + print(f"Warning: Walls are not connected! Missing connection after step {step} (wall id={getattr(last_wall, 'id', step)}).") + return list(walls_in), False + + # Closure check and snap if close + first_start = np.array([ordered[0].ax, ordered[0].ay]) + last_end = np.array([ordered[-1].bx, ordered[-1].by]) + gap = float(np.linalg.norm(first_start - last_end)) + if gap <= tol_secondary: + delta = first_start - last_end + ordered[-1].bx += delta[0]; ordered[-1].by += delta[1] + if gap > tol_primary: + print(f"Closed loop by snapping final gap of {gap:.3f} m.") + return ordered, True + + print(f"Warning: Walls don't form closed loop! Gap: {gap:.6f}") + return ordered, False + +def extract_vectors_from_ordered_walls(ordered_walls): + """ + Extract direction vectors from ordered walls. + Ensures vectors follow the connectivity chain. 
+ """ + vectors = [] + vertices = [] + + if not ordered_walls: + return np.array([]), np.array([]) + + # Start from first wall's start point + start = np.array([ordered_walls[0].ax, ordered_walls[0].ay]) + vertices.append(start) + + for wall in ordered_walls: + # Vector is always from start to end of this wall + vec = np.array([wall.bx - wall.ax, wall.by - wall.ay]) + vectors.append(vec) + # Next vertex is this wall's endpoint + vertices.append(np.array([wall.bx, wall.by])) + + # Remove last vertex (it should equal the first for closed loop) + vertices = vertices[:-1] + + return np.array(vertices), np.array(vectors) + +def solve_for_closure(partial_scales, vectors, opt_indices, solve_indices): + """ + Given N-2 scales, solve for the 2 remaining scales to ensure closed loop. + """ + n = len(vectors) + full_scales = np.ones(n) + full_scales[opt_indices] = partial_scales + + # The constraint is that sum of all scaled vectors = 0 + # We have: sum(s_i * v_i for i in opt_indices) + s_a * v_a + s_b * v_b = 0 + # So: s_a * v_a + s_b * v_b = -sum(s_i * v_i for i in opt_indices) + + sum_opt = np.zeros(2) + for i, idx in enumerate(opt_indices): + sum_opt += partial_scales[i] * vectors[idx] + + # Build 2x2 system + A = np.column_stack([vectors[solve_indices[0]], vectors[solve_indices[1]]]) + b = -sum_opt + + try: + scales_solve = np.linalg.solve(A, b) + full_scales[solve_indices[0]] = scales_solve[0] + full_scales[solve_indices[1]] = scales_solve[1] + except np.linalg.LinAlgError: + print("Warning: Cannot solve for closure (walls may be parallel)") + # Keep default scales of 1 + + return full_scales + +def objective_function(partial_scales, vectors, hull_points, opt_indices, solve_indices): + """ + Minimize distance from wall vertices to hull perimeter. + """ + # Get full scales with closure constraint + scales = solve_for_closure(partial_scales, vectors, opt_indices, solve_indices) + + # Check if solved scales are valid (positive) + if np.any(scales <= 0): + return 1e10 # Large penalty for invalid scales + + # Build vertices from scaled vectors + vertices = [np.zeros(2)] # Start at origin (we'll translate later) + for i, vec in enumerate(vectors): + next_vertex = vertices[-1] + scales[i] * vec + vertices.append(next_vertex) + vertices = np.array(vertices[:-1]) # Remove duplicate last vertex + + # Center vertices to match hull + vertices_center = np.mean(vertices, axis=0) + hull_center = np.mean(hull_points, axis=0) + vertices = vertices - vertices_center + hull_center + + # Calculate distance from vertices to hull + total_dist = 0 + for v in vertices: + min_dist = float('inf') + for i in range(len(hull_points)): + p1 = hull_points[i] + p2 = hull_points[(i + 1) % len(hull_points)] + # Distance from point to line segment + edge = p2 - p1 + t = max(0, min(1, np.dot(v - p1, edge) / (np.dot(edge, edge) + 1e-10))) + closest = p1 + t * edge + dist = np.linalg.norm(v - closest) + min_dist = min(min_dist, dist) + total_dist += min_dist ** 2 + + return total_dist + +def generate_diagram(original_layout, optimized_layout, hull_points, output_path, meta_info: dict = None, hull_points_unscaled: np.ndarray | None = None): + """Generate comparison diagram with overlays and vertex markers. + + meta_info (optional) may contain: 'scale_min', 'scale_mean', 'scale_max', + 'closure_residual', 'objective'. + """ + if not HAS_MATPLOTLIB: + print(f"Skipping diagram generation (matplotlib not available). 
Would have saved to: {output_path}") + return + + plt.figure(figsize=(12, 10)) + + # Plot hull + hull_plot = np.vstack([hull_points, hull_points[0]]) + plt.plot(hull_plot[:, 0], hull_plot[:, 1], 'g.-', linewidth=2, markersize=3, label='Target Perimeter') + + # Plot unscaled/original hull in orange if provided + if hull_points_unscaled is not None: + hull_unscaled_plot = np.vstack([hull_points_unscaled, hull_points_unscaled[0]]) + plt.plot(hull_unscaled_plot[:, 0], hull_unscaled_plot[:, 1], color='orange', linestyle='-', linewidth=1.8, alpha=0.9, label='Target Perimeter (unscaled)') + + # Plot original walls + for wall in original_layout.walls: + plt.plot([wall.ax, wall.bx], [wall.ay, wall.by], 'b-', linewidth=2, alpha=0.7) + plt.plot([], [], 'b-', linewidth=2, label='Original Walls') + + # Plot optimized/scale-applied walls + if optimized_layout: + for wall in optimized_layout.walls: + plt.plot([wall.ax, wall.bx], [wall.ay, wall.by], 'r-', linewidth=2, alpha=0.85) + plt.plot([], [], 'r-', linewidth=2, label='Scale-applied Walls') + + # Vertex markers for visual diff + try: + ow_ordered, _ = order_walls_connected(original_layout.walls) + ov = [] + if ow_ordered: + ov.append([ow_ordered[0].ax, ow_ordered[0].ay]) + for w in ow_ordered: + ov.append([w.bx, w.by]) + ov = np.array(ov[:-1]) if len(ov) > 1 else np.array(ov) + if ov.size: + plt.scatter(ov[:, 0], ov[:, 1], c='b', s=10, alpha=0.6, marker='o', label='Original Vertices') + except Exception: + pass + try: + if optimized_layout: + nw_ordered, _ = order_walls_connected(optimized_layout.walls) + nv = [] + if nw_ordered: + nv.append([nw_ordered[0].ax, nw_ordered[0].ay]) + for w in nw_ordered: + nv.append([w.bx, w.by]) + nv = np.array(nv[:-1]) if len(nv) > 1 else np.array(nv) + if nv.size: + plt.scatter(nv[:, 0], nv[:, 1], c='r', s=12, alpha=0.9, marker='x', label='Scaled Vertices') + except Exception: + pass + + plt.xlabel('X (meters)') + plt.ylabel('Y (meters)') + title = 'Wall Scale Optimization' + if meta_info: + parts = [] + if meta_info.get('scale_min') is not None and meta_info.get('scale_mean') is not None and meta_info.get('scale_max') is not None: + parts.append(f"scales[min/mean/max]={meta_info['scale_min']:.3f}/{meta_info['scale_mean']:.3f}/{meta_info['scale_max']:.3f}") + if meta_info.get('closure_residual') is not None: + parts.append(f"||A s||={meta_info['closure_residual']:.2e}") + if meta_info.get('objective') is not None: + parts.append(f"obj={meta_info['objective']:.3f}") + if parts: + title += ' (' + ', '.join(parts) + ')' + plt.title(title) + plt.legend() + plt.grid(True, alpha=0.3) + plt.axis('equal') + plt.tight_layout() + plt.savefig(output_path) + plt.close() + print(f"Saved diagram to: {output_path}") + +def run_scale_optimization(layout: Layout, grid_ply_path: str, diagram_path: str, source_ply_path: str | None = None): + """Main optimization function. + + If source_ply_path is provided, the perimeter/hull will be extracted from the original input PLY (meters). + Otherwise, it will fall back to the grid PLY path. + """ + # 1. 
Load and process grid PLY + # Prefer original PLY (meters) if available + ply_to_read = source_ply_path if source_ply_path else grid_ply_path + try: + pcd = o3d.io.read_point_cloud(ply_to_read) + points = np.asarray(pcd.points) + print(f"Loaded PLY for perimeter extraction: {ply_to_read} (points={points.shape[0]})") + except Exception as e: + print(f"Error loading PLY '{ply_to_read}': {e}") + return None + + if points.shape[0] < 3: + print("Not enough points in grid PLY") + return None + + # Filter by height to get wall slice + z_coords = points[:, 2] + z_min = np.min(z_coords) + mask = (z_coords > z_min + 0.1) & (z_coords < z_min + 2.5) + points_2d = points[mask, :2] + + if points_2d.shape[0] < 3: + points_2d = points[:, :2] + + try: + # Efficient perimeter sampling using fixed bins across bounding box + n_bins = 100 + x = points_2d[:, 0] + y = points_2d[:, 1] + x_min, x_max = float(np.min(x)), float(np.max(x)) + y_min, y_max = float(np.min(y)), float(np.max(y)) + + # Y-sliced extremes (min/max X per Y bin) + bins_y = np.linspace(y_min, y_max, n_bins + 1) + idx_y = np.digitize(y, bins_y) - 1 + idx_y = np.clip(idx_y, 0, n_bins - 1) + min_x_by_bin = np.full(n_bins, np.inf) + max_x_by_bin = np.full(n_bins, -np.inf) + for b in range(n_bins): + maskb = (idx_y == b) + if not np.any(maskb): + continue + xb = x[maskb] + min_x_by_bin[b] = float(np.min(xb)) + max_x_by_bin[b] = float(np.max(xb)) + y_centers = 0.5 * (bins_y[:-1] + bins_y[1:]) + + perimeter_pts = [] + for b in range(n_bins): + if np.isfinite(min_x_by_bin[b]): + perimeter_pts.append([min_x_by_bin[b], y_centers[b]]) + if np.isfinite(max_x_by_bin[b]): + perimeter_pts.append([max_x_by_bin[b], y_centers[b]]) + + # X-sliced extremes (min/max Y per X bin) + bins_x = np.linspace(x_min, x_max, n_bins + 1) + idx_x = np.digitize(x, bins_x) - 1 + idx_x = np.clip(idx_x, 0, n_bins - 1) + min_y_by_bin = np.full(n_bins, np.inf) + max_y_by_bin = np.full(n_bins, -np.inf) + for b in range(n_bins): + maskb = (idx_x == b) + if not np.any(maskb): + continue + yb = y[maskb] + min_y_by_bin[b] = float(np.min(yb)) + max_y_by_bin[b] = float(np.max(yb)) + x_centers = 0.5 * (bins_x[:-1] + bins_x[1:]) + for b in range(n_bins): + if np.isfinite(min_y_by_bin[b]): + perimeter_pts.append([x_centers[b], min_y_by_bin[b]]) + if np.isfinite(max_y_by_bin[b]): + perimeter_pts.append([x_centers[b], max_y_by_bin[b]]) + + perimeter_pts = np.unique(np.array(perimeter_pts), axis=0) + centroid = np.mean(perimeter_pts, axis=0) + angles = np.arctan2(perimeter_pts[:, 1] - centroid[1], perimeter_pts[:, 0] - centroid[0]) + hull_points = perimeter_pts[np.argsort(angles)] + except Exception as e: + print(f"Error computing perimeter: {e}") + return None + + # 2. Order walls by connectivity + ordered_walls, connected = order_walls_connected(layout.walls) + if not connected: + print("ERROR: Input walls are not properly connected. Cannot optimize.") + generate_diagram(layout, None, hull_points, diagram_path) + return None + + # 3. Extract vertices and vectors from ordered walls + orig_vertices, vectors = extract_vectors_from_ordered_walls(ordered_walls) + if len(vectors) < 3: + print("Need at least 3 walls to optimize") + return None + + n = len(vectors) + base_lengths = np.linalg.norm(vectors, axis=1) + print(f"Connected loop with {n} walls. Base length stats (m): min={base_lengths.min():.3f} mean={base_lengths.mean():.3f} max={base_lengths.max():.3f}") + + # 4. 
Robust anisotropic pre-alignment: rotate + (sx, sy) scale + translate + angle_align, sx_align, sy_align, t_align, res_align = robust_align_hull_to_walls_anisotropic(hull_points, orig_vertices, huber_delta=0.05) + # Preserve unscaled (no scale) but rotated+translated hull for visualization + hull_points_unscaled = transform_points_anisotropic(hull_points, angle_align, 1.0, 1.0, t_align) + hull_points = transform_points_anisotropic(hull_points, angle_align, sx_align, sy_align, t_align) + print(f"Robust anisotropic pre-align hull: rot={np.degrees(angle_align):.2f} deg, sx={sx_align:.4f}, sy={sy_align:.4f}, t=({t_align[0]:.3f},{t_align[1]:.3f}).") + + # 5. Keep wall directions in the original frame for optimization + rotated_vectors = vectors + print("Using original wall directions for optimization (no rotation/scale on vectors).") + + # 6. Build linear closure constraints + A = build_closure_matrix(rotated_vectors) + print(f"Using trivial per-axis scale from alignment; skipping nonlinear optimization.") + + # 7. Trivial per-axis: derive per-wall inverse scale to undo anisotropic factors + # For wall i with unit direction u, inverse scale = sqrt((u_x/sx)^2 + (u_y/sy)^2) + final_scales = np.zeros(n, dtype=float) + for i in range(n): + v = rotated_vectors[i] + L = float(np.linalg.norm(v)) + u = (v / L) if L != 0.0 else np.array([1.0, 0.0]) + final_scales[i] = float(np.sqrt((u[0] / (sx_align if sx_align != 0.0 else 1.0))**2 + (u[1] / (sy_align if sy_align != 0.0 else 1.0))**2)) + eq_residual = A @ final_scales + print(f"Closure residual (A @ s): ||A s||={np.linalg.norm(eq_residual):.3e}, components=({eq_residual[0]:.3e}, {eq_residual[1]:.3e})") + + # 8. Apply scales to layout (preserve original orientation; anchor at original first vertex) + optimized_layout = deepcopy(layout) + optimized_walls, _ = order_walls_connected(optimized_layout.walls, inplace=True) + # Re-translate start to best match the aligned hull centroid to reduce drift + orig_start = np.array([optimized_walls[0].ax, optimized_walls[0].ay, optimized_walls[0].az]) + current_pos = orig_start + print("Applying scales to walls without rotation (preserving original orientation):") + for i, wall in enumerate(optimized_walls): + base_vec = vectors[i] + length = float(np.linalg.norm(base_vec)) + if length == 0.0: + direction = np.array([1.0, 0.0]) + new_length = 0.0 + else: + direction = base_vec / length + new_length = length * final_scales[i] + wall.ax, wall.ay, wall.az = current_pos + next_pos_2d = current_pos[:2] + direction * new_length + wall.bx, wall.by = next_pos_2d + wall.bz = wall.az + current_pos = np.array([wall.bx, wall.by, wall.bz]) + print(f" Wall {i} (id={getattr(wall, 'id', i)}): base_len={length:.3f} m, scale_inv={final_scales[i]:.4f} (undo sx={sx_align:.4f}, sy={sy_align:.4f}), new_len={new_length:.3f} m, start=({wall.ax:.3f},{wall.ay:.3f}), end=({wall.bx:.3f},{wall.by:.3f})") + + first_start = np.array([optimized_walls[0].ax, optimized_walls[0].ay]) + last_end = np.array([optimized_walls[-1].bx, optimized_walls[-1].by]) + closure_error = np.linalg.norm(first_start - last_end) + if closure_error > 1e-3: + print(f"WARNING: Closure error = {closure_error:.6f} meters") + else: + print(f"Closure check passed: residual distance between first/last = {closure_error:.6e} m") + + # 9. 
Diagram with meta + meta = { + 'scale_min': float(final_scales.min()), + 'scale_mean': float(final_scales.mean()), + 'scale_max': float(final_scales.max()), + 'closure_residual': float(np.linalg.norm(eq_residual)), + 'objective': None, + } + generate_diagram(layout, optimized_layout, hull_points, diagram_path, meta_info=meta, hull_points_unscaled=hull_points_unscaled) + return optimized_layout \ No newline at end of file diff --git a/spatiallm/model/sonata_encoder.py b/spatiallm/model/sonata_encoder.py index d4c396e..5077857 100644 --- a/spatiallm/model/sonata_encoder.py +++ b/spatiallm/model/sonata_encoder.py @@ -316,7 +316,7 @@ def __init__( proj_drop=0.0, order_index=0, enable_rpe=False, - enable_flash=True, + enable_flash=False, upcast_attention=True, upcast_softmax=True, ): @@ -523,7 +523,7 @@ def __init__( order_index=0, cpe_indice_key=None, enable_rpe=False, - enable_flash=True, + enable_flash=False, upcast_attention=True, upcast_softmax=True, ): @@ -823,7 +823,7 @@ def __init__( pre_norm=True, shuffle_orders=True, enable_rpe=False, - enable_flash=True, + enable_flash=False, upcast_attention=False, upcast_softmax=False, mask_token=False,
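Reviewer note on the unified adaptive path: preprocess_point_cloud_adaptive_unified() builds per-point sampling_weights and passes them to a DensityWeightedGridSample transform, falling back to plain GridSample when that transform is not registered. The sketch below shows one way such a weighted voxel selection could work (one representative point per occupied voxel, drawn with probability proportional to its weight). It is an illustrative assumption only, not the transform shipped with spatiallm/Pointcept, and the helper name density_weighted_voxel_select is hypothetical.

import numpy as np

def density_weighted_voxel_select(coords, weights, grid_size, rng=None):
    """Return indices of one point per occupied voxel, sampled by weight (sketch)."""
    rng = np.random.default_rng() if rng is None else rng
    voxel = np.floor(coords / grid_size).astype(np.int64)
    # Sort points so that points sharing a voxel become contiguous groups.
    order = np.lexsort((voxel[:, 2], voxel[:, 1], voxel[:, 0]))
    voxel_sorted = voxel[order]
    boundaries = np.any(np.diff(voxel_sorted, axis=0) != 0, axis=1)
    starts = np.concatenate(([0], np.nonzero(boundaries)[0] + 1, [len(order)]))
    picked = []
    for s, e in zip(starts[:-1], starts[1:]):
        idx = order[s:e]
        p = weights[idx] / weights[idx].sum()
        picked.append(rng.choice(idx, p=p))   # higher-weight points win more often
    return np.array(picked)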
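Reviewer note on the closure constraint behind build_closure_matrix()/solve_for_closure() in scale_optimizer.py: for a closed wall loop the scaled direction vectors must sum to zero, so once N-2 scales are proposed the remaining two follow from a 2x2 linear solve. The snippet below is a minimal self-contained illustration of that identity; the rectangle and the proposed scales are made-up example values, not data from this repository.

import numpy as np

# Direction vectors of a 4-wall rectangular loop (closed at scale 1); example data.
vectors = np.array([[4.0, 0.0], [0.0, 3.0], [-4.0, 0.0], [0.0, -3.0]])
partial_scales = np.array([1.10, 0.95])       # scales proposed for the first two walls
opt_indices, solve_indices = [0, 1], [2, 3]

# Closure: s2*v2 + s3*v3 = -(s0*v0 + s1*v1)
rhs = -(partial_scales[:, None] * vectors[opt_indices]).sum(axis=0)
A = np.column_stack([vectors[solve_indices[0]], vectors[solve_indices[1]]])
s2, s3 = np.linalg.solve(A, rhs)

scales = np.array([partial_scales[0], partial_scales[1], s2, s3])
residual = (scales[:, None] * vectors).sum(axis=0)
print(scales, residual)   # residual is ~[0, 0], i.e. the scaled loop stays closed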
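Reviewer note on the per-wall factor computed in step 7 of run_scale_optimization(): for a wall with unit direction u, sqrt((u_x/sx)^2 + (u_y/sy)^2) equals the length change produced by applying the inverse anisotropic map diag(1/sx, 1/sy) along that direction, which is what "undoing" the hull's (sx, sy) factor per wall means here. A small check with illustrative values for sx, sy and the wall direction:

import numpy as np

sx, sy = 1.08, 0.97                 # example anisotropic factors from the hull alignment
v = np.array([2.0, 1.0])            # example wall direction in the original frame
u = v / np.linalg.norm(v)

inverse_scale = np.sqrt((u[0] / sx) ** 2 + (u[1] / sy) ** 2)
check = np.linalg.norm(np.diag([1.0 / sx, 1.0 / sy]) @ u)   # length of diag(1/sx,1/sy) @ u
assert np.isclose(inverse_scale, check)
print(inverse_scale)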