From 24fb395ff3469d35effc9b42fc48ae205110c29f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Tue, 16 Dec 2025 12:06:13 -0300 Subject: [PATCH 01/16] Update README --- migration/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/migration/README.md b/migration/README.md index c88f01e8..c7639497 100644 --- a/migration/README.md +++ b/migration/README.md @@ -4,6 +4,9 @@ This section contains scripts to help before, during, and after migrations. ## [Mongosync Insights](mongosync_insights) This project parses **mongosync** logs and reads the internal database (metadata), generating a variety of plots to assist with monitoring and troubleshooting ongoing mongosync migrations. +## [Toolbox](toolbox) +It contains scripts used by Migration Factory team during the Data Capture Request. + ### License [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0) From 8f68655e34ee05f9f592464f1f0c230b540e9f01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Tue, 16 Dec 2025 12:07:30 -0300 Subject: [PATCH 02/16] toolbox scripts --- migration/toolbox/README.md | 28 +++++++++++ migration/toolbox/collectionSizes.js | 62 ++++++++++++++++++++++++ migration/toolbox/probIndexesComplete.js | 49 +++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 migration/toolbox/README.md create mode 100644 migration/toolbox/collectionSizes.js create mode 100644 migration/toolbox/probIndexesComplete.js diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md new file mode 100644 index 00000000..c3eb1764 --- /dev/null +++ b/migration/toolbox/README.md @@ -0,0 +1,28 @@ +# Toolbox +Toolbox is a collection of scripts used by Migration Factory team to facilitate the Data Capture Request. + +## Database and Collection size + +## Index size, parameters and utilization + +### License + +[Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0) + +DISCLAIMER +---------- +Please note: all tools/ scripts in this repo are released for use "AS IS" **without any warranties of any kind**, +including, but not limited to their installation, use, or performance. We disclaim any and all warranties, either +express or implied, including but not limited to any warranty of noninfringement, merchantability, and/ or fitness +for a particular purpose. We do not warrant that the technology will meet your requirements, that the operation +thereof will be uninterrupted or error-free, or that any errors will be corrected. + +Any use of these scripts and tools is **at your own risk**. There is no guarantee that they have been through +thorough testing in a comparable environment and we are not responsible for any damage or data loss incurred with +their use. + +You are responsible for reviewing and testing any scripts you run *thoroughly* before use in any non-testing +environment. 
+ +Thanks, +The MongoDB Support Team diff --git a/migration/toolbox/collectionSizes.js b/migration/toolbox/collectionSizes.js new file mode 100644 index 00000000..ad4bae99 --- /dev/null +++ b/migration/toolbox/collectionSizes.js @@ -0,0 +1,62 @@ +// List of system databases to exclude +const excludeDatabases = ['admin', 'config', 'local']; +const byteToMB = (byte) => ((byte / 1024) / 1024).toFixed(2); +const databaseInfo = []; + +// Function to check if an array contains a value +const arrayContains = function(arr, val) { + return arr.indexOf(val) !== -1; +}; + +// Get all databases and exclude system ones +const databases = db.adminCommand('listDatabases').databases.filter(function(database) { + return !arrayContains(excludeDatabases, database.name); +}); + +// Debugging: Log the databases found +//print("Databases found (excluding system databases):"); +//databases.forEach(function(database) { +// print(" - " + database.name); +//}); + +for (var i = 0; i < databases.length; i++) { + const database = databases[i]; + const currentDb = db.getSiblingDB(database.name); + + // Debugging: Log the current database being processed + //print("Processing database: " + database.name); + + // Use getCollectionNames() + const collections = currentDb.getCollectionNames(); + + // Debugging: Log collections found in the database + //print("Collections found in " + database.name + ":"); + //if (collections.length === 0) { + // print(" No collections found."); + //} + collections.forEach(function(collectionName) { + //print(" - " + collectionName); + const currentCollection = currentDb.getCollection(collectionName); + const stats = currentCollection.stats(); // Get collection stats + + databaseInfo.push({ + db: database.name, + collection: collectionName, + size_MB: parseFloat(byteToMB(stats.size)), // Collection size in MB + size: stats.size // Size in bytes + }); + }); +} + +// Sort by size (descending order) +databaseInfo.sort(function(a, b) { + return b.size - a.size; +}); + +// Print the sorted list of collections +print("Database | Collection | Size (MB)"); +print("---------------------------------"); +for (var j = 0; j < databaseInfo.length; j++) { + const info = databaseInfo[j]; + print(info.db + " | " + info.collection + " | " + info.size_MB + " MB"); +} \ No newline at end of file diff --git a/migration/toolbox/probIndexesComplete.js b/migration/toolbox/probIndexesComplete.js new file mode 100644 index 00000000..78793e58 --- /dev/null +++ b/migration/toolbox/probIndexesComplete.js @@ -0,0 +1,49 @@ +const indexesUtilization = []; +const excludeDatabases = ['admin', 'config', 'local'] +const byteToMB = (byte) => ((byte/1024)/1024).toFixed(2); + +/* This version is used to get information on only a few DBs, add them to the following line*/ +const databases = db.adminCommand('listDatabases').databases.filter(({ name }) => !excludeDatabases.includes(name)) +const project = { $project: {'ops': "$accesses.ops", 'accesses.since': 1, 'name': 1, 'key': 1, 'spec': 1} }; + + +for (const database of databases) { + const currentDb = db.getSiblingDB(database.name) + + currentDb.getCollectionInfos({ type: "collection" }).forEach(function(collection){ + const currentCollection = currentDb.getCollection(collection.name); + + const indexes = currentCollection.getIndexes(); + const indexesSize = currentCollection.stats().indexSizes; + + currentCollection.aggregate( [ { $indexStats: { } }, project ] ).forEach(function(index){ + + const indexDetail = indexes.filter(i => i.name === index.name)[0]; + const idxValues = 
Object.values(Object.assign({}, index.key)); + + let indexType = "commom"; + if(index.name === '_id_') indexType = '[INTERNAL]'; + else if(idxValues.includes('2dsphere')) indexType = '2dsphere'; + else if(idxValues.includes("geoHaystack")) indexType = 'geoHaystack'; + else if(indexDetail.textIndexVersion !== undefined) indexType = 'text'; + else if(indexDetail.expireAfterSeconds !== undefined) indexType = 'TTL'; + else if(indexDetail.partialFilterExpression !== undefined) indexType = 'Partial'; + + indexesUtilization.push({ + db: database.name, + collection: collection.name, + name: index.name, + type: indexType, + unique: index.spec.unique, + accesses: index.ops, + 'size (MB)': parseFloat(byteToMB(indexesSize[index.name])), + size: indexesSize[index.name], + accesses_since: index.accesses.since, + }) + }); + }) +} + +//const indexesProblematic = indexesUtilization.filter(index => {return index.type === 'TTL'}) +console.table(indexesUtilization); +//console.table(indexesProblematic); \ No newline at end of file From 459d2800df739a1cc957cd356c9c9389a201a279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Tue, 16 Dec 2025 12:17:50 -0300 Subject: [PATCH 03/16] Update README --- migration/toolbox/README.md | 60 +++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md index c3eb1764..432c2e22 100644 --- a/migration/toolbox/README.md +++ b/migration/toolbox/README.md @@ -3,8 +3,68 @@ Toolbox is a collection of scripts used by Migration Factory team to facilitate ## Database and Collection size +**Script:** `collectionSizes.js` + +Lists all databases and collections (excluding system databases: `admin`, `config`, `local`) with their sizes in MB, sorted from largest to smallest. + +### Usage + +```bash +mongosh "mongodb://localhost:27017" --quiet collectionSizes.js +``` + +Or with authentication: + +```bash +mongosh "mongodb://user:password@localhost:27017" --quiet collectionSizes.js +``` + +### Example Output + +``` +Database | Collection | Size (MB) +--------------------------------- +mydb | largeCollection | 1024.50 MB +mydb | mediumCollection | 256.25 MB +otherdb | smallCollection | 12.00 MB +``` + ## Index size, parameters and utilization +**Script:** `probIndexesComplete.js` + +Collects index statistics across all user databases (excluding `admin`, `config`, `local`). 
For each index, it reports:
+- Database and collection name
+- Index name and type (common, TTL, Partial, text, 2dsphere, geoHaystack, or `[INTERNAL]` for `_id_`)
+- Whether the index is unique
+- Access count (ops) and when tracking started
+- Index size in MB and bytes
+
+### Usage
+
+```bash
+mongosh "mongodb://localhost:27017" --quiet probIndexesComplete.js
+```
+
+Or with authentication:
+
+```bash
+mongosh "mongodb://user:password@localhost:27017" --quiet probIndexesComplete.js
+```
+
+### Example Output
+
+```
+┌─────────┬────────┬────────────────┬──────────────┬────────────┬────────┬──────────┬──────────┬─────────┬─────────────────────────┐
+│ (index) │ db     │ collection     │ name         │ type       │ unique │ accesses │ size (MB)│ size    │ accesses_since          │
+├─────────┼────────┼────────────────┼──────────────┼────────────┼────────┼──────────┼──────────┼─────────┼─────────────────────────┤
+│ 0       │ mydb   │ users          │ _id_         │ [INTERNAL] │        │ 150      │ 0.25     │ 262144  │ 2024-01-15T10:30:00.000Z│
+│ 1       │ mydb   │ users          │ email_1      │ common     │ true   │ 1200     │ 0.12     │ 126976  │ 2024-01-15T10:30:00.000Z│
+│ 2       │ mydb   │ sessions       │ _id_         │ [INTERNAL] │        │ 50       │ 0.08     │ 81920   │ 2024-01-15T10:30:00.000Z│
+│ 3       │ mydb   │ sessions       │ expireAt_1   │ TTL        │        │ 0        │ 0.04     │ 40960   │ 2024-01-15T10:30:00.000Z│
+└─────────┴────────┴────────────────┴──────────────┴────────────┴────────┴──────────┴──────────┴─────────┴─────────────────────────┘
+```
+
 ### License
 
 [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0)

From e8e96a71f8f65ddfa26145e326808dc2aafa9d3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Wed, 4 Feb 2026 15:53:09 -0300
Subject: [PATCH 04/16] Add mongosync limitations

---
 migration/toolbox/README.md                   |  24 ++
 .../toolbox/README_limitations_checker.md     | 260 ++++++++++++++
 .../mongosync_limitations_checker_unified.py  | 334 ++++++++++++++++++
 3 files changed, 618 insertions(+)
 create mode 100644 migration/toolbox/README_limitations_checker.md
 create mode 100644 migration/toolbox/mongosync_limitations_checker_unified.py

diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md
index 432c2e22..96d4833b 100644
--- a/migration/toolbox/README.md
+++ b/migration/toolbox/README.md
@@ -65,6 +65,30 @@ mongosh "mongodb://user:password@localhost:27017" --quiet probIndexesComplete.js
 └─────────┴────────┴────────────────┴──────────────┴────────────┴────────┴──────────┴──────────┴─────────┴─────────────────────────┘
 ```
 
+## Mongosync Limitations Checker
+
+**Script:** `mongosync_limitations_checker_unified.py`
+
+Detects a known mongosync limitation where a collection has two indexes with the exact same key pattern—one unique and one non-unique. This condition can cause mongosync to fail during migrations.
+
+The script supports two modes:
+- **Online mode:** Connects directly to a MongoDB cluster via connection string
+- **Offline mode:** Parses a `getMongoData` JSON file (no cluster access required)
+
+### Quick Usage
+
+**Offline (getMongoData):**
+```bash
+python3 mongosync_limitations_checker_unified.py --getmongodata <file>.json
+```
+
+**Online (MongoDB cluster):**
+```bash
+python3 mongosync_limitations_checker_unified.py --uri "mongodb+srv://USER:PASS@host"
+```
+
+For full documentation, filtering options, and examples, see [README_limitations_checker.md](README_limitations_checker.md).
+
 ### License
 
 [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0)
diff --git a/migration/toolbox/README_limitations_checker.md b/migration/toolbox/README_limitations_checker.md
new file mode 100644
index 00000000..6fc28827
--- /dev/null
+++ b/migration/toolbox/README_limitations_checker.md
@@ -0,0 +1,260 @@
+# Mongosync Limitations Checker (Unified)
+
+This script detects a known **mongosync limitation**:
+
+> A collection that has two indexes with the exact same key pattern where one is **unique** and the other is **non-unique**.
+
+This condition can cause mongosync to fail or behave unexpectedly during migrations.
+The script is intended as a **pre-check** for MRAs and migration readiness reviews.
+
+---
+
+## What the script does
+
+For every collection it scans, the script:
+
+1. Retrieves all index definitions.
+2. Separates them into:
+   - **unique** indexes
+   - **non-unique** indexes
+3. Compares index key patterns.
+4. Flags a limitation when it finds the *same key pattern* in both groups.
+
+### Output
+
+- Prints a clean terminal report.
+- Optionally writes a JSON report using `--out`.
+
+Each finding includes:
+- `database`
+- `collection`
+- `index_keys`
+- `unique_index_names`
+- `non_unique_index_names`
+
+**Sample terminal output:**
+
+```
+Starting mongosync limitations checker (ONLINE).
+Input: mongodb+srv://...
+Limitations found: 1
+
+- mydb.users | keys=[['email', 1]] | unique=['email_unique_idx'] | non-unique=['email_idx']
+
+Finishing mongosync limitations checker.
+```
+
+**Sample JSON output** (when using `--out`):
+
+```json
+[
+  {
+    "database": "mydb",
+    "collection": "users",
+    "index_keys": [["email", 1]],
+    "unique_index_names": ["email_unique_idx"],
+    "non_unique_index_names": ["email_idx"]
+  }
+]
+```
+
+---
+
+## What it runs against
+
+The script supports **two modes**.
+
+### Online mode (MongoDB cluster)
+
+Reads indexes directly from a MongoDB deployment using a connection string.
+
+Supported:
+- MongoDB Atlas clusters
+- Self-managed replica sets / Sharded clusters
+
+### Offline mode (getMongoData JSON)
+
+Runs without cluster access by parsing a `getMongoData` output JSON.
+
+---
+
+## Requirements
+
+### Offline mode
+- Python 3.7+
+- No external dependencies
+
+### Online mode
+- Python 3.7+
+- PyMongo:
+```bash
+python3 -m pip install pymongo
+```
+
+---
+
+## Atlas / SRV TLS note
+
+PyMongo uses the Python/OS trust store. On some machines you may need `certifi`:
+```bash
+python3 -m pip install certifi
+```
+Run the script with `--use-certifi-ca` when connecting to Atlas.
+
+---
+
+## Usage
+
+Exactly one mode flag is required.
+```bash
+python3 mongosync_limitations_checker_unified.py \
+(--uri "<connection string>" | --getmongodata <path>) \
+[flags...]
+```
+
+---
+
+## Flags
+
+**Mode selection (required)**
+
+| Flag             | Description                                |
+| ---------------- | ------------------------------------------ |
+| `--uri`          | Online mode. Connect to a MongoDB cluster  |
+| `--getmongodata` | Offline mode. Parse getMongoData JSON      |
+
+---
+
+**Filters (apply to both modes)**
+
+| Flag            | Description                      |
+| --------------- | -------------------------------- |
+| `--include-dbs` | Comma-separated DB allow-list    |
+| `--exclude-dbs` | Comma-separated DB block-list    |
+| `--include-ns`  | Regex applied to `db.collection` |
+
+---
+
+**Output**
+
+| Flag    | Description                   |
+| ------- | ----------------------------- |
+| `--out` | Write findings to a JSON file |
+
+---
+
+**TLS helper (online only)**
+
+| Flag               | Description                                    |
+| ------------------ | ---------------------------------------------- |
+| `--use-certifi-ca` | Use certifi CA bundle (fixes Atlas TLS issues) |
+
+---
+
+## How to use the filters
+
+**Include / exclude DBs**
+
+```bash
+--include-dbs prod_01,prod_02
+--exclude-dbs test,staging
+```
+- System DBs (`admin`, `local`, `config`) are always skipped.
+
+**Namespace regex filter**
+
+The `--include-ns` flag accepts a regex pattern that is searched against the full namespace (`db.collection`):
+
+```bash
+--include-ns "^prod_"    # Namespaces starting with "prod_"
+--include-ns "\.users$"  # Collections ending with "users"
+--include-ns "orders"    # Namespaces containing "orders"
+```
+
+---
+
+## Examples
+
+### Offline (getMongoData)
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--getmongodata <file>.json
+```
+
+With JSON output:
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--getmongodata <file>.json \
+--out <report>.json
+```
+
+Offline + DB filter:
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--getmongodata <file>.json \
+--include-dbs <db1>,<db2> \
+--out <report>.json
+```
+
+---
+
+### Online (non-SRV)
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--uri "mongodb://<user>:<password>@<host>:<port>/admin?appName=<appName>" \
+--out <report>.json
+```
+
+---
+
+### Online (Atlas SRV)
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--uri "mongodb+srv://USER:PASS@<cluster>/admin?appName=checker" \
+--out <report>.json
+```
+
+If you see TLS errors:
+
+```bash
+python3 -m pip install certifi
+```
+
+Then:
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--uri "mongodb+srv://USER:PASS@<cluster>/admin?appName=checker" \
+--use-certifi-ca \
+--out <report>.json
+```
+
+---
+
+## Notes
+
+- The script is read-only.
+- Permission errors on specific collections are skipped.
+
+DISCLAIMER
+----------
+Please note: all tools/ scripts in this repo are released for use "AS IS" **without any warranties of any kind**,
+including, but not limited to their installation, use, or performance. We disclaim any and all warranties, either
+express or implied, including but not limited to any warranty of noninfringement, merchantability, and/ or fitness
+for a particular purpose. We do not warrant that the technology will meet your requirements, that the operation
+thereof will be uninterrupted or error-free, or that any errors will be corrected.
+
+Any use of these scripts and tools is **at your own risk**. There is no guarantee that they have been through
+thorough testing in a comparable environment and we are not responsible for any damage or data loss incurred with
+their use.
+
+You are responsible for reviewing and testing any scripts you run *thoroughly* before use in any non-testing
+environment.
+ +Thanks, +The MongoDB Support Team \ No newline at end of file diff --git a/migration/toolbox/mongosync_limitations_checker_unified.py b/migration/toolbox/mongosync_limitations_checker_unified.py new file mode 100644 index 00000000..e8035c5d --- /dev/null +++ b/migration/toolbox/mongosync_limitations_checker_unified.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import json +import re +import sys +from collections import defaultdict +from typing import Any, Dict, FrozenSet, Iterable, List, Optional, Set, Tuple + + +# ------------------------- +# Filter helpers +# ------------------------- + +def _parse_csv_set(value: Optional[str]) -> Optional[Set[str]]: + if not value: + return None + items = [v.strip() for v in value.split(",") if v.strip()] + return set(items) if items else None + + +def _compile_regex(pattern: Optional[str]) -> Optional[re.Pattern]: + if not pattern: + return None + return re.compile(pattern) + + +def ns_allowed( + db: str, + coll: str, + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], +) -> bool: + # include/exclude DBs + if include_dbs is not None and db not in include_dbs: + return False + if exclude_dbs is not None and db in exclude_dbs: + return False + + # system DBs are always excluded + if db in ("admin", "local", "config"): + return False + + # include-ns regex on db.collection + if include_ns_re is not None: + ns = f"{db}.{coll}" + if not include_ns_re.search(ns): + return False + + return True + + +# ------------------------- +# Normalization + core logic +# ------------------------- + +def normalize_key_pattern(key_obj: Any) -> FrozenSet[Tuple[str, Any]]: + """ + Normalize key patterns into a hashable representation. + + NOTE: Order-insensitive comparison (frozenset), matching the behavior of your original script. 
+ """ + if isinstance(key_obj, dict): + return frozenset(key_obj.items()) + + if isinstance(key_obj, (list, tuple)): + pairs: List[Tuple[str, Any]] = [] + ok = True + for item in key_obj: + if isinstance(item, (list, tuple)) and len(item) == 2: + pairs.append((str(item[0]), item[1])) + else: + ok = False + break + if ok: + return frozenset(pairs) + + try: + items = list(key_obj.items()) # type: ignore[attr-defined] + return frozenset((str(k), v) for k, v in items) + except Exception: + return frozenset({("<>", str(key_obj))}) + + +def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + index_rows yields dicts shaped like: + { + "database": str, + "collection": str, + "index_name": str, + "key": , + "unique": bool + } + """ + + per_collection: Dict[Tuple[str, str], Dict[FrozenSet[Tuple[str, Any]], Dict[str, List[str]]]] = defaultdict( + lambda: defaultdict(lambda: {"unique": [], "non_unique": []}) + ) + + for row in index_rows: + db = row.get("database") + coll = row.get("collection") + name = row.get("index_name", "") + key = row.get("key") + unique = bool(row.get("unique", False)) + + if not db or not coll or key is None: + continue + + key_pattern = normalize_key_pattern(key) + bucket = "unique" if unique else "non_unique" + per_collection[(db, coll)][key_pattern][bucket].append(str(name)) + + limitations: List[Dict[str, Any]] = [] + + for (db, coll), by_key in per_collection.items(): + for key_pattern, buckets in by_key.items(): + if buckets["unique"] and buckets["non_unique"]: + limitations.append( + { + "database": db, + "collection": coll, + "index_keys": sorted([list(kv) for kv in key_pattern], key=lambda x: str(x[0])), + "unique_index_names": sorted(set(buckets["unique"])), + "non_unique_index_names": sorted(set(buckets["non_unique"])), + } + ) + + limitations.sort(key=lambda d: (d["database"], d["collection"], str(d["index_keys"]))) + return limitations + + +# ------------------------- +# Offline extractor (getMongoData) +# ------------------------- + +def iter_indexes_from_getmongodata( + docs: List[Dict[str, Any]], + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], +) -> Iterable[Dict[str, Any]]: + for doc in docs: + if doc.get("section") != "data_info": + continue + if doc.get("subsection") != "indexes": + continue + if doc.get("error") is not None: + continue + + params = doc.get("commandParameters") or {} + db = params.get("db") + coll = params.get("collection") + output = doc.get("output") + + if not db or not coll or not isinstance(output, list): + continue + + if not ns_allowed(db, coll, include_dbs, exclude_dbs, include_ns_re): + continue + + for idx in output: + if not isinstance(idx, dict): + continue + + yield { + "database": db, + "collection": coll, + "index_name": idx.get("name", ""), + "key": idx.get("key"), + "unique": bool(idx.get("unique", False)), + } + + +# ------------------------- +# Online extractor (MongoDB cluster) +# ------------------------- + +def iter_indexes_from_cluster( + uri: str, + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], + use_certifi_ca: bool = False, +) -> Iterable[Dict[str, Any]]: + try: + from pymongo import MongoClient + except Exception as e: + raise RuntimeError(f"PyMongo is required for --uri mode. Install with: pip install pymongo. 
Error: {e}") + + client_kwargs: Dict[str, Any] = {} + if use_certifi_ca: + try: + import certifi + client_kwargs["tlsCAFile"] = certifi.where() + except Exception as e: + raise RuntimeError( + f"--use-certifi-ca requested but certifi not available. Install: pip install certifi. Error: {e}" + ) + + client = MongoClient(uri, **client_kwargs) + try: + db_names = client.list_database_names() + for db_name in db_names: + # DB-level filters first + if include_dbs is not None and db_name not in include_dbs: + continue + if exclude_dbs is not None and db_name in exclude_dbs: + continue + if db_name in ("admin", "local", "config"): + continue + + db = client[db_name] + try: + coll_names = db.list_collection_names() + except Exception: + continue + + for coll_name in coll_names: + if not ns_allowed(db_name, coll_name, include_dbs, exclude_dbs, include_ns_re): + continue + + coll = db[coll_name] + try: + for idx in coll.list_indexes(): + yield { + "database": db_name, + "collection": coll_name, + "index_name": idx.get("name", ""), + "key": idx.get("key"), + "unique": bool(idx.get("unique", False)), + } + except Exception: + continue + finally: + client.close() + + +# ------------------------- +# Output helpers +# ------------------------- + +def print_report(limitations: List[Dict[str, Any]], title: str, input_label: str) -> None: + print(title) + print(f"Input: {input_label}") + print(f"Limitations found: {len(limitations)}\n") + + if not limitations: + print("No limitations found.") + return + + for item in limitations: + ns = f"{item['database']}.{item['collection']}" + print( + f"- {ns} | keys={item['index_keys']} " + f"| unique={item['unique_index_names']} | non-unique={item['non_unique_index_names']}" + ) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Unified mongosync limitations checker (online MongoDB cluster OR offline getMongoData JSON)." + ) + + mode = parser.add_mutually_exclusive_group(required=True) + mode.add_argument("--uri", help="MongoDB connection string (online mode).") + mode.add_argument("--getmongodata", help="Path to getMongoData JSON file (offline mode).") + + # Filters + parser.add_argument("--include-dbs", default=None, help="Comma-separated DB list to include (only these DBs).") + parser.add_argument("--exclude-dbs", default=None, help="Comma-separated DB list to exclude.") + parser.add_argument("--include-ns", default=None, help=r'Regex filter on namespace "db.collection". 
Example: "^prod_".')
+
+    # Output / TLS helpers
+    parser.add_argument("--out", default=None, help="Write limitations to a JSON file.")
+    parser.add_argument(
+        "--use-certifi-ca",
+        action="store_true",
+        help="Online mode only: use certifi CA bundle (fixes CERTIFICATE_VERIFY_FAILED on some machines).",
+    )
+
+    args = parser.parse_args()
+
+    include_dbs = _parse_csv_set(args.include_dbs)
+    exclude_dbs = _parse_csv_set(args.exclude_dbs)
+    include_ns_re = _compile_regex(args.include_ns)
+
+    try:
+        if args.uri:
+            rows = iter_indexes_from_cluster(
+                args.uri,
+                include_dbs=include_dbs,
+                exclude_dbs=exclude_dbs,
+                include_ns_re=include_ns_re,
+                use_certifi_ca=args.use_certifi_ca,
+            )
+            limitations = find_limitations(rows)
+            print_report(limitations, "Starting mongosync limitations checker (ONLINE).", args.uri)
+
+        else:
+            with open(args.getmongodata, "r", encoding="utf-8") as f:
+                docs = json.load(f)
+            if not isinstance(docs, list):
+                print("ERROR: getMongoData JSON top-level must be a list.", file=sys.stderr)
+                return 2
+
+            rows = iter_indexes_from_getmongodata(
+                docs,
+                include_dbs=include_dbs,
+                exclude_dbs=exclude_dbs,
+                include_ns_re=include_ns_re,
+            )
+            limitations = find_limitations(rows)
+            print_report(limitations, "Starting mongosync limitations checker (OFFLINE getMongoData).", args.getmongodata)
+
+        if args.out:
+            with open(args.out, "w", encoding="utf-8") as f:
+                json.dump(limitations, f, indent=2)
+            print(f"\nWrote JSON report to: {args.out}")
+
+        print("\nFinishing mongosync limitations checker.")
+        return 0
+
+    except Exception as e:
+        print(f"An error occurred: {e}", file=sys.stderr)
+        return 2
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From dab00f553ada9b4793117f4a0bca7cb082a0b672 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Mon, 9 Feb 2026 15:40:01 -0300
Subject: [PATCH 05/16] Change Limitation chacker name

---
 migration/toolbox/README_limitations_checker.md    | 14 +++++++-------
 ...=> mongosync_uniqueindex_limitation_checker.py} |  6 ++++--
 2 files changed, 11 insertions(+), 9 deletions(-)
 rename migration/toolbox/{mongosync_limitations_checker_unified.py => mongosync_uniqueindex_limitation_checker.py} (97%)

diff --git a/migration/toolbox/README_limitations_checker.md b/migration/toolbox/README_limitations_checker.md
index 6fc28827..4712c9a4 100644
--- a/migration/toolbox/README_limitations_checker.md
+++ b/migration/toolbox/README_limitations_checker.md
@@ -107,7 +107,7 @@ Exactly one mode flag is required.
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 (--uri "<connection string>" | --getmongodata <path>) \
 [flags...]
 ```
@@ -178,14 +178,14 @@ The `--include-ns` flag accepts a regex pattern that is searched against the ful
 ### Offline (getMongoData)
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --getmongodata <file>.json
 ```
 
 With JSON output:
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --getmongodata <file>.json \
 --out <report>.json
 ```
 
 Offline + DB filter:
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --getmongodata <file>.json \
 --include-dbs <db1>,<db2> \
 --out <report>.json
 ```
 
@@ -204,7 +204,7 @@ python3 mongosync_limitations_checker_unified.py \
 ### Online (non-SRV)
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --uri "mongodb://<user>:<password>@<host>:<port>/admin?appName=<appName>" \
 --out <report>.json
 ```
 
@@ -214,7 +214,7 @@ python3 mongosync_limitations_checker_unified.py \
 ### Online (Atlas SRV)
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --uri "mongodb+srv://USER:PASS@<cluster>/admin?appName=checker" \
 --out <report>.json
 ```
 
@@ -228,7 +228,7 @@ python3 -m pip install certifi
 Then:
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --uri "mongodb+srv://USER:PASS@<cluster>/admin?appName=checker" \
 --use-certifi-ca \
 --out <report>.json
 ```

diff --git a/migration/toolbox/mongosync_limitations_checker_unified.py b/migration/toolbox/mongosync_uniqueindex_limitation_checker.py
similarity index 97%
rename from migration/toolbox/mongosync_limitations_checker_unified.py
rename to migration/toolbox/mongosync_uniqueindex_limitation_checker.py
index e8035c5d..320fc416 100644
--- a/migration/toolbox/mongosync_limitations_checker_unified.py
+++ b/migration/toolbox/mongosync_uniqueindex_limitation_checker.py
@@ -247,6 +247,7 @@ def iter_indexes_from_cluster(
 def print_report(limitations: List[Dict[str, Any]], title: str, input_label: str) -> None:
     print(title)
     print(f"Input: {input_label}")
+    print("Checking for unique and non-unique indexes on the same field/s...")
     print(f"Limitations found: {len(limitations)}\n")
 
     if not limitations:
@@ -255,9 +256,10 @@ def print_report(limitations: List[Dict[str, Any]], title: str, input_label: str
 
     for item in limitations:
         ns = f"{item['database']}.{item['collection']}"
+        keys_dict = {k: v for k, v in item["index_keys"]}
         print(
-            f"- {ns} | keys={item['index_keys']} "
-            f"| unique={item['unique_index_names']} | non-unique={item['non_unique_index_names']}"
+            f"- {ns} | keys={keys_dict} "
+            f"| uniqueIndex={item['unique_index_names']} | non-uniqueIndex={item['non_unique_index_names']}"
         )

From fa12416481709810e17eb719e2e8da67969522de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Thu, 12 Feb 2026 10:49:27 -0300
Subject: [PATCH 06/16] Update migration/toolbox/probIndexesComplete.js

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 migration/toolbox/probIndexesComplete.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/toolbox/probIndexesComplete.js b/migration/toolbox/probIndexesComplete.js
index 78793e58..6c593b3a 100644
--- a/migration/toolbox/probIndexesComplete.js
+++ b/migration/toolbox/probIndexesComplete.js
@@ -21,7 +21,7 @@ for (const database of databases) {
 
     const indexDetail = indexes.filter(i => i.name === index.name)[0];
     const idxValues = 
Object.values(Object.assign({}, index.key)); - let indexType = "commom"; + let indexType = "common"; if(index.name === '_id_') indexType = '[INTERNAL]'; else if(idxValues.includes('2dsphere')) indexType = '2dsphere'; else if(idxValues.includes("geoHaystack")) indexType = 'geoHaystack'; From d291b6827ec9d5a7f9130c5a1af3e97d3e44f488 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Thu, 12 Feb 2026 10:50:12 -0300 Subject: [PATCH 07/16] Update migration/toolbox/probIndexesComplete.js Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- migration/toolbox/probIndexesComplete.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/migration/toolbox/probIndexesComplete.js b/migration/toolbox/probIndexesComplete.js index 6c593b3a..979561bb 100644 --- a/migration/toolbox/probIndexesComplete.js +++ b/migration/toolbox/probIndexesComplete.js @@ -18,16 +18,16 @@ for (const database of databases) { currentCollection.aggregate( [ { $indexStats: { } }, project ] ).forEach(function(index){ - const indexDetail = indexes.filter(i => i.name === index.name)[0]; + const indexDetail = indexes.find(i => i.name === index.name); const idxValues = Object.values(Object.assign({}, index.key)); - let indexType = "common"; + let indexType = "commom"; if(index.name === '_id_') indexType = '[INTERNAL]'; else if(idxValues.includes('2dsphere')) indexType = '2dsphere'; else if(idxValues.includes("geoHaystack")) indexType = 'geoHaystack'; - else if(indexDetail.textIndexVersion !== undefined) indexType = 'text'; - else if(indexDetail.expireAfterSeconds !== undefined) indexType = 'TTL'; - else if(indexDetail.partialFilterExpression !== undefined) indexType = 'Partial'; + else if(indexDetail?.textIndexVersion !== undefined) indexType = 'text'; + else if(indexDetail?.expireAfterSeconds !== undefined) indexType = 'TTL'; + else if(indexDetail?.partialFilterExpression !== undefined) indexType = 'Partial'; indexesUtilization.push({ db: database.name, From 029d335fdc09036666a54af0de30d1eeea393dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Thu, 12 Feb 2026 10:51:18 -0300 Subject: [PATCH 08/16] Update migration/toolbox/README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- migration/toolbox/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md index 96d4833b..881a6a59 100644 --- a/migration/toolbox/README.md +++ b/migration/toolbox/README.md @@ -67,7 +67,7 @@ mongosh "mongodb://user:password@localhost:27017" --quiet probIndexesComplete.js ## Mongosync Limitations Checker -**Script:** `mongosync_limitations_checker_unified.py` +**Script:** `mongosync_uniqueindex_limitation_checker.py` Detects a known mongosync limitation where a collection has two indexes with the exact same key pattern—one unique and one non-unique. This condition can cause mongosync to fail during migrations. 
From 8b6ea88115c139f93abed47b0f8581657633b820 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Thu, 12 Feb 2026 10:53:02 -0300
Subject: [PATCH 09/16] Update migration/toolbox/probIndexesComplete.js

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 migration/toolbox/probIndexesComplete.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/toolbox/probIndexesComplete.js b/migration/toolbox/probIndexesComplete.js
index 979561bb..dd2be959 100644
--- a/migration/toolbox/probIndexesComplete.js
+++ b/migration/toolbox/probIndexesComplete.js
@@ -2,7 +2,7 @@ const indexesUtilization = [];
 const excludeDatabases = ['admin', 'config', 'local']
 const byteToMB = (byte) => ((byte/1024)/1024).toFixed(2);
 
-/* This version is used to get information on only a few DBs, add them to the following line*/
+/* This version gets information for all non-system DBs. To limit it to specific DBs, edit the filter in the next line (e.g., by adding an explicit include list). */
 const databases = db.adminCommand('listDatabases').databases.filter(({ name }) => !excludeDatabases.includes(name))
 const project = { $project: {'ops': "$accesses.ops", 'accesses.since': 1, 'name': 1, 'key': 1, 'spec': 1} };
 

From 9406e77c361d21d1b76689b1f91bbba25d52266b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Thu, 12 Feb 2026 11:07:01 -0300
Subject: [PATCH 10/16] Update README.md

---
 migration/toolbox/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md
index 881a6a59..9f6a3b73 100644
--- a/migration/toolbox/README.md
+++ b/migration/toolbox/README.md
@@ -79,12 +79,12 @@ The script supports two modes:
 
 **Offline (getMongoData):**
 ```bash
-python3 mongosync_limitations_checker_unified.py --getmongodata <file>.json
+python3 mongosync_uniqueindex_limitation_checker.py --getmongodata <file>.json
 ```
 
 **Online (MongoDB cluster):**
 ```bash
-python3 mongosync_limitations_checker_unified.py --uri "mongodb+srv://USER:PASS@host"
+python3 mongosync_uniqueindex_limitation_checker.py --uri "mongodb+srv://USER:PASS@host"
 ```
 
 For full documentation, filtering options, and examples, see [README_limitations_checker.md](README_limitations_checker.md).

From 58e279e785cb9aa577f73712de82ab99d344ad04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Thu, 12 Feb 2026 11:53:02 -0300
Subject: [PATCH 11/16] Update README.md

---
 migration/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/README.md b/migration/README.md
index c7639497..daa77ecf 100644
--- a/migration/README.md
+++ b/migration/README.md
@@ -5,7 +5,7 @@ This section contains scripts to help before, during, and after migrations.
 This project parses **mongosync** logs and reads the internal database (metadata), generating a variety of plots to assist with monitoring and troubleshooting ongoing mongosync migrations.
 
 ## [Toolbox](toolbox)
-It contains scripts used by Migration Factory team during the Data Capture Request.
+Toolbox is a collection of helper scripts created by the Migration Factory team for data capture and analysis ### License From a93e464228fd87ef438fc4e76daa69c79d4487bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Thu, 12 Feb 2026 11:54:39 -0300 Subject: [PATCH 12/16] Update README.md --- migration/toolbox/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md index 9f6a3b73..c8b8b820 100644 --- a/migration/toolbox/README.md +++ b/migration/toolbox/README.md @@ -1,5 +1,5 @@ # Toolbox -Toolbox is a collection of scripts used by Migration Factory team to facilitate the Data Capture Request. +Toolbox is a collection of helper scripts created by the Migration Factory team for data capture and analysis. ## Database and Collection size From e91dcd5984717f2635a81d3253cf6dd465859bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Thu, 12 Feb 2026 11:55:13 -0300 Subject: [PATCH 13/16] Update migration/toolbox/README_limitations_checker.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- migration/toolbox/README_limitations_checker.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/toolbox/README_limitations_checker.md b/migration/toolbox/README_limitations_checker.md index 4712c9a4..6204709c 100644 --- a/migration/toolbox/README_limitations_checker.md +++ b/migration/toolbox/README_limitations_checker.md @@ -39,7 +39,7 @@ Starting mongosync limitations checker (ONLINE). Input: mongodb+srv://... Limitations found: 1 -- mydb.users | keys=[['email', 1]] | unique=['email_unique_idx'] | non-unique=['email_idx'] +- mydb.users | keys={['email', 1]} | uniqueIndex=['email_unique_idx'] | non-uniqueIndex=['email_idx'] Finishing mongosync limitations checker. 
``` From 6058eca184f924d9fadb99f3a6c98f494961a1ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Thu, 12 Feb 2026 12:19:34 -0300 Subject: [PATCH 14/16] Update collectionSizes.js --- migration/toolbox/collectionSizes.js | 402 ++++++++++++++++++++++----- 1 file changed, 340 insertions(+), 62 deletions(-) diff --git a/migration/toolbox/collectionSizes.js b/migration/toolbox/collectionSizes.js index ad4bae99..d4d674f1 100644 --- a/migration/toolbox/collectionSizes.js +++ b/migration/toolbox/collectionSizes.js @@ -1,62 +1,340 @@ -// List of system databases to exclude -const excludeDatabases = ['admin', 'config', 'local']; -const byteToMB = (byte) => ((byte / 1024) / 1024).toFixed(2); -const databaseInfo = []; - -// Function to check if an array contains a value -const arrayContains = function(arr, val) { - return arr.indexOf(val) !== -1; -}; - -// Get all databases and exclude system ones -const databases = db.adminCommand('listDatabases').databases.filter(function(database) { - return !arrayContains(excludeDatabases, database.name); -}); - -// Debugging: Log the databases found -//print("Databases found (excluding system databases):"); -//databases.forEach(function(database) { -// print(" - " + database.name); -//}); - -for (var i = 0; i < databases.length; i++) { - const database = databases[i]; - const currentDb = db.getSiblingDB(database.name); - - // Debugging: Log the current database being processed - //print("Processing database: " + database.name); - - // Use getCollectionNames() - const collections = currentDb.getCollectionNames(); - - // Debugging: Log collections found in the database - //print("Collections found in " + database.name + ":"); - //if (collections.length === 0) { - // print(" No collections found."); - //} - collections.forEach(function(collectionName) { - //print(" - " + collectionName); - const currentCollection = currentDb.getCollection(collectionName); - const stats = currentCollection.stats(); // Get collection stats - - databaseInfo.push({ - db: database.name, - collection: collectionName, - size_MB: parseFloat(byteToMB(stats.size)), // Collection size in MB - size: stats.size // Size in bytes - }); - }); -} - -// Sort by size (descending order) -databaseInfo.sort(function(a, b) { - return b.size - a.size; -}); - -// Print the sorted list of collections -print("Database | Collection | Size (MB)"); -print("---------------------------------"); -for (var j = 0; j < databaseInfo.length; j++) { - const info = databaseInfo[j]; - print(info.db + " | " + info.collection + " | " + info.size_MB + " MB"); -} \ No newline at end of file +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import json +import re +import sys +from collections import defaultdict +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple + + +# Order-preserving, hashable signature for an index key pattern +KeySig = Tuple[Tuple[str, Any], ...] 
+ + + +# ------------------------- +# Filter helpers +# ------------------------- + +def _parse_csv_set(value: Optional[str]) -> Optional[Set[str]]: + if not value: + return None + items = [v.strip() for v in value.split(",") if v.strip()] + return set(items) if items else None + + +def _compile_regex(pattern: Optional[str]) -> Optional[re.Pattern]: + if not pattern: + return None + return re.compile(pattern) + + +def ns_allowed( + db: str, + coll: str, + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], +) -> bool: + # include/exclude DBs + if include_dbs is not None and db not in include_dbs: + return False + if exclude_dbs is not None and db in exclude_dbs: + return False + + # system DBs are always excluded + if db in ("admin", "local", "config"): + return False + + # include-ns regex on db.collection + if include_ns_re is not None: + ns = f"{db}.{coll}" + if not include_ns_re.search(ns): + return False + + return True + + +# ------------------------- +# Normalization + core logic +# ------------------------- + +def normalize_key_pattern(key_obj: Any) -> KeySig: + """ + Normalize index key patterns into an order-preserving, hashable representation. + + IMPORTANT: Order matters for compound indexes in MongoDB. + """ + + if isinstance(key_obj, dict): + return tuple((str(k), v) for k, v in key_obj.items()) + + if isinstance(key_obj, (list, tuple)): + pairs: List[Tuple[str, Any]] = [] + for item in key_obj: + if isinstance(item, (list, tuple)) and len(item) == 2: + pairs.append((str(item[0]), item[1])) + else: + return (("<>", str(key_obj)),) + return tuple(pairs) + + # Last resort: try .items() (dict-like) + try: + items = list(key_obj.items()) # type: ignore[attr-defined] + return tuple((str(k), v) for k, v in items) + except Exception: + return (("<>", str(key_obj)),) + + +def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + index_rows yields dicts shaped like: + { + "database": str, + "collection": str, + "index_name": str, + "key": , + "unique": bool + } + """ + + per_collection: Dict[Tuple[str, str], Dict[KeySig, Dict[str, List[str]]]] = defaultdict( + lambda: defaultdict(lambda: {"unique": [], "non_unique": []}) + ) + + for row in index_rows: + db = row.get("database") + coll = row.get("collection") + name = row.get("index_name", "") + key = row.get("key") + unique = bool(row.get("unique", False)) + + if not db or not coll or key is None: + continue + + key_pattern = normalize_key_pattern(key) + bucket = "unique" if unique else "non_unique" + per_collection[(db, coll)][key_pattern][bucket].append(str(name)) + + limitations: List[Dict[str, Any]] = [] + + for (db, coll), by_key in per_collection.items(): + for key_pattern, buckets in by_key.items(): + if buckets["unique"] and buckets["non_unique"]: + limitations.append( + { + "database": db, + "collection": coll, + "index_keys": [list(kv) for kv in key_pattern], + "unique_index_names": sorted(set(buckets["unique"])), + "non_unique_index_names": sorted(set(buckets["non_unique"])), + } + ) + + limitations.sort(key=lambda d: (d["database"], d["collection"], str(d["index_keys"]))) + return limitations + + +# ------------------------- +# Offline extractor (getMongoData) +# ------------------------- + +def iter_indexes_from_getmongodata( + docs: List[Dict[str, Any]], + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], +) -> Iterable[Dict[str, Any]]: + for doc in docs: + if 
doc.get("section") != "data_info": + continue + if doc.get("subsection") != "indexes": + continue + if doc.get("error") is not None: + continue + + params = doc.get("commandParameters") or {} + db = params.get("db") + coll = params.get("collection") + output = doc.get("output") + + if not db or not coll or not isinstance(output, list): + continue + + if not ns_allowed(db, coll, include_dbs, exclude_dbs, include_ns_re): + continue + + for idx in output: + if not isinstance(idx, dict): + continue + + yield { + "database": db, + "collection": coll, + "index_name": idx.get("name", ""), + "key": idx.get("key"), + "unique": bool(idx.get("unique", False)), + } + + +# ------------------------- +# Online extractor (MongoDB cluster) +# ------------------------- + +def iter_indexes_from_cluster( + uri: str, + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], + use_certifi_ca: bool = False, +) -> Iterable[Dict[str, Any]]: + try: + from pymongo import MongoClient + except Exception as e: + raise RuntimeError(f"PyMongo is required for --uri mode. Install with: pip install pymongo. Error: {e}") + + client_kwargs: Dict[str, Any] = {} + if use_certifi_ca: + try: + import certifi + client_kwargs["tlsCAFile"] = certifi.where() + except Exception as e: + raise RuntimeError( + f"--use-certifi-ca requested but certifi not available. Install: pip install certifi. Error: {e}" + ) + + client = MongoClient(uri, **client_kwargs) + try: + db_names = client.list_database_names() + for db_name in db_names: + # DB-level filters first + if include_dbs is not None and db_name not in include_dbs: + continue + if exclude_dbs is not None and db_name in exclude_dbs: + continue + if db_name in ("admin", "local", "config"): + continue + + db = client[db_name] + try: + coll_names = db.list_collection_names() + except Exception: + continue + + for coll_name in coll_names: + if not ns_allowed(db_name, coll_name, include_dbs, exclude_dbs, include_ns_re): + continue + + coll = db[coll_name] + try: + for idx in coll.list_indexes(): + yield { + "database": db_name, + "collection": coll_name, + "index_name": idx.get("name", ""), + "key": idx.get("key"), + "unique": bool(idx.get("unique", False)), + } + except Exception: + continue + finally: + client.close() + + +# ------------------------- +# Output helpers +# ------------------------- + +def print_report(limitations: List[Dict[str, Any]], title: str, input_label: str) -> None: + print(title) + print(f"Input: {input_label}") + print("Checking for unique and non-unique indexes on the same field/s...") + print(f"Limitations found: {len(limitations)}\n") + + if not limitations: + print("No limitations found.") + return + + for item in limitations: + ns = f"{item['database']}.{item['collection']}" + keys_dict = {k: v for k, v in item["index_keys"]} + print( + f"- {ns} | keys={keys_dict} " + f"| uniqueIndex={item['unique_index_names']} | non-uniqueIndex={item['non_unique_index_names']}" + ) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Unified mongosync limitations checker (online MongoDB cluster OR offline getMongoData JSON)." 
+ ) + + mode = parser.add_mutually_exclusive_group(required=True) + mode.add_argument("--uri", help="MongoDB connection string (online mode).") + mode.add_argument("--getmongodata", help="Path to getMongoData JSON file (offline mode).") + + # Filters + parser.add_argument("--include-dbs", default=None, help="Comma-separated DB list to include (only these DBs).") + parser.add_argument("--exclude-dbs", default=None, help="Comma-separated DB list to exclude.") + parser.add_argument("--include-ns", default=None, help=r'Regex filter on namespace "db.collection". Example: "^prod_".') + + # Output / TLS helpers + parser.add_argument("--out", default=None, help="Write limitations to a JSON file.") + parser.add_argument( + "--use-certifi-ca", + action="store_true", + help="Online mode only: use certifi CA bundle (fixes CERTIFICATE_VERIFY_FAILED on some machines).", + ) + + args = parser.parse_args() + + include_dbs = _parse_csv_set(args.include_dbs) + exclude_dbs = _parse_csv_set(args.exclude_dbs) + include_ns_re = _compile_regex(args.include_ns) + + try: + if args.uri: + rows = iter_indexes_from_cluster( + args.uri, + include_dbs=include_dbs, + exclude_dbs=exclude_dbs, + include_ns_re=include_ns_re, + use_certifi_ca=args.use_certifi_ca, + ) + limitations = find_limitations(rows) + print_report(limitations, "Starting mongosync limitations checker (ONLINE).", args.uri) + + else: + with open(args.getmongodata, "r", encoding="utf-8") as f: + docs = json.load(f) + if not isinstance(docs, list): + print("ERROR: getMongoData JSON top-level must be a list.", file=sys.stderr) + return 2 + + rows = iter_indexes_from_getmongodata( + docs, + include_dbs=include_dbs, + exclude_dbs=exclude_dbs, + include_ns_re=include_ns_re, + ) + limitations = find_limitations(rows) + print_report(limitations, "Starting mongosync limitations checker (OFFLINE getMongoData).", args.getmongodata) + + if args.out: + with open(args.out, "w", encoding="utf-8") as f: + json.dump(limitations, f, indent=2) + print(f"\nWrote JSON report to: {args.out}") + + print("\nFinishing mongosync limitations checker.") + return 0 + + except Exception as e: + print(f"An error occurred: {e}", file=sys.stderr) + return 2 + + +if __name__ == "__main__": + raise SystemExit(main()) From 1a5547fbf8c4a0d5f33899cc4a804780b646c387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Fri, 13 Feb 2026 10:30:11 -0300 Subject: [PATCH 15/16] Update collectionSizes.js --- migration/toolbox/collectionSizes.js | 402 +++++---------------------- 1 file changed, 62 insertions(+), 340 deletions(-) diff --git a/migration/toolbox/collectionSizes.js b/migration/toolbox/collectionSizes.js index d4d674f1..f6c671e2 100644 --- a/migration/toolbox/collectionSizes.js +++ b/migration/toolbox/collectionSizes.js @@ -1,340 +1,62 @@ -#!/usr/bin/env python3 - -from __future__ import annotations - -import argparse -import json -import re -import sys -from collections import defaultdict -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple - - -# Order-preserving, hashable signature for an index key pattern -KeySig = Tuple[Tuple[str, Any], ...] 
- - - -# ------------------------- -# Filter helpers -# ------------------------- - -def _parse_csv_set(value: Optional[str]) -> Optional[Set[str]]: - if not value: - return None - items = [v.strip() for v in value.split(",") if v.strip()] - return set(items) if items else None - - -def _compile_regex(pattern: Optional[str]) -> Optional[re.Pattern]: - if not pattern: - return None - return re.compile(pattern) - - -def ns_allowed( - db: str, - coll: str, - include_dbs: Optional[Set[str]], - exclude_dbs: Optional[Set[str]], - include_ns_re: Optional[re.Pattern], -) -> bool: - # include/exclude DBs - if include_dbs is not None and db not in include_dbs: - return False - if exclude_dbs is not None and db in exclude_dbs: - return False - - # system DBs are always excluded - if db in ("admin", "local", "config"): - return False - - # include-ns regex on db.collection - if include_ns_re is not None: - ns = f"{db}.{coll}" - if not include_ns_re.search(ns): - return False - - return True - - -# ------------------------- -# Normalization + core logic -# ------------------------- - -def normalize_key_pattern(key_obj: Any) -> KeySig: - """ - Normalize index key patterns into an order-preserving, hashable representation. - - IMPORTANT: Order matters for compound indexes in MongoDB. - """ - - if isinstance(key_obj, dict): - return tuple((str(k), v) for k, v in key_obj.items()) - - if isinstance(key_obj, (list, tuple)): - pairs: List[Tuple[str, Any]] = [] - for item in key_obj: - if isinstance(item, (list, tuple)) and len(item) == 2: - pairs.append((str(item[0]), item[1])) - else: - return (("<>", str(key_obj)),) - return tuple(pairs) - - # Last resort: try .items() (dict-like) - try: - items = list(key_obj.items()) # type: ignore[attr-defined] - return tuple((str(k), v) for k, v in items) - except Exception: - return (("<>", str(key_obj)),) - - -def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]: - """ - index_rows yields dicts shaped like: - { - "database": str, - "collection": str, - "index_name": str, - "key": , - "unique": bool - } - """ - - per_collection: Dict[Tuple[str, str], Dict[KeySig, Dict[str, List[str]]]] = defaultdict( - lambda: defaultdict(lambda: {"unique": [], "non_unique": []}) - ) - - for row in index_rows: - db = row.get("database") - coll = row.get("collection") - name = row.get("index_name", "") - key = row.get("key") - unique = bool(row.get("unique", False)) - - if not db or not coll or key is None: - continue - - key_pattern = normalize_key_pattern(key) - bucket = "unique" if unique else "non_unique" - per_collection[(db, coll)][key_pattern][bucket].append(str(name)) - - limitations: List[Dict[str, Any]] = [] - - for (db, coll), by_key in per_collection.items(): - for key_pattern, buckets in by_key.items(): - if buckets["unique"] and buckets["non_unique"]: - limitations.append( - { - "database": db, - "collection": coll, - "index_keys": [list(kv) for kv in key_pattern], - "unique_index_names": sorted(set(buckets["unique"])), - "non_unique_index_names": sorted(set(buckets["non_unique"])), - } - ) - - limitations.sort(key=lambda d: (d["database"], d["collection"], str(d["index_keys"]))) - return limitations - - -# ------------------------- -# Offline extractor (getMongoData) -# ------------------------- - -def iter_indexes_from_getmongodata( - docs: List[Dict[str, Any]], - include_dbs: Optional[Set[str]], - exclude_dbs: Optional[Set[str]], - include_ns_re: Optional[re.Pattern], -) -> Iterable[Dict[str, Any]]: - for doc in docs: - if 
doc.get("section") != "data_info": - continue - if doc.get("subsection") != "indexes": - continue - if doc.get("error") is not None: - continue - - params = doc.get("commandParameters") or {} - db = params.get("db") - coll = params.get("collection") - output = doc.get("output") - - if not db or not coll or not isinstance(output, list): - continue - - if not ns_allowed(db, coll, include_dbs, exclude_dbs, include_ns_re): - continue - - for idx in output: - if not isinstance(idx, dict): - continue - - yield { - "database": db, - "collection": coll, - "index_name": idx.get("name", ""), - "key": idx.get("key"), - "unique": bool(idx.get("unique", False)), - } - - -# ------------------------- -# Online extractor (MongoDB cluster) -# ------------------------- - -def iter_indexes_from_cluster( - uri: str, - include_dbs: Optional[Set[str]], - exclude_dbs: Optional[Set[str]], - include_ns_re: Optional[re.Pattern], - use_certifi_ca: bool = False, -) -> Iterable[Dict[str, Any]]: - try: - from pymongo import MongoClient - except Exception as e: - raise RuntimeError(f"PyMongo is required for --uri mode. Install with: pip install pymongo. Error: {e}") - - client_kwargs: Dict[str, Any] = {} - if use_certifi_ca: - try: - import certifi - client_kwargs["tlsCAFile"] = certifi.where() - except Exception as e: - raise RuntimeError( - f"--use-certifi-ca requested but certifi not available. Install: pip install certifi. Error: {e}" - ) - - client = MongoClient(uri, **client_kwargs) - try: - db_names = client.list_database_names() - for db_name in db_names: - # DB-level filters first - if include_dbs is not None and db_name not in include_dbs: - continue - if exclude_dbs is not None and db_name in exclude_dbs: - continue - if db_name in ("admin", "local", "config"): - continue - - db = client[db_name] - try: - coll_names = db.list_collection_names() - except Exception: - continue - - for coll_name in coll_names: - if not ns_allowed(db_name, coll_name, include_dbs, exclude_dbs, include_ns_re): - continue - - coll = db[coll_name] - try: - for idx in coll.list_indexes(): - yield { - "database": db_name, - "collection": coll_name, - "index_name": idx.get("name", ""), - "key": idx.get("key"), - "unique": bool(idx.get("unique", False)), - } - except Exception: - continue - finally: - client.close() - - -# ------------------------- -# Output helpers -# ------------------------- - -def print_report(limitations: List[Dict[str, Any]], title: str, input_label: str) -> None: - print(title) - print(f"Input: {input_label}") - print("Checking for unique and non-unique indexes on the same field/s...") - print(f"Limitations found: {len(limitations)}\n") - - if not limitations: - print("No limitations found.") - return - - for item in limitations: - ns = f"{item['database']}.{item['collection']}" - keys_dict = {k: v for k, v in item["index_keys"]} - print( - f"- {ns} | keys={keys_dict} " - f"| uniqueIndex={item['unique_index_names']} | non-uniqueIndex={item['non_unique_index_names']}" - ) - - -def main() -> int: - parser = argparse.ArgumentParser( - description="Unified mongosync limitations checker (online MongoDB cluster OR offline getMongoData JSON)." 
- ) - - mode = parser.add_mutually_exclusive_group(required=True) - mode.add_argument("--uri", help="MongoDB connection string (online mode).") - mode.add_argument("--getmongodata", help="Path to getMongoData JSON file (offline mode).") - - # Filters - parser.add_argument("--include-dbs", default=None, help="Comma-separated DB list to include (only these DBs).") - parser.add_argument("--exclude-dbs", default=None, help="Comma-separated DB list to exclude.") - parser.add_argument("--include-ns", default=None, help=r'Regex filter on namespace "db.collection". Example: "^prod_".') - - # Output / TLS helpers - parser.add_argument("--out", default=None, help="Write limitations to a JSON file.") - parser.add_argument( - "--use-certifi-ca", - action="store_true", - help="Online mode only: use certifi CA bundle (fixes CERTIFICATE_VERIFY_FAILED on some machines).", - ) - - args = parser.parse_args() - - include_dbs = _parse_csv_set(args.include_dbs) - exclude_dbs = _parse_csv_set(args.exclude_dbs) - include_ns_re = _compile_regex(args.include_ns) - - try: - if args.uri: - rows = iter_indexes_from_cluster( - args.uri, - include_dbs=include_dbs, - exclude_dbs=exclude_dbs, - include_ns_re=include_ns_re, - use_certifi_ca=args.use_certifi_ca, - ) - limitations = find_limitations(rows) - print_report(limitations, "Starting mongosync limitations checker (ONLINE).", args.uri) - - else: - with open(args.getmongodata, "r", encoding="utf-8") as f: - docs = json.load(f) - if not isinstance(docs, list): - print("ERROR: getMongoData JSON top-level must be a list.", file=sys.stderr) - return 2 - - rows = iter_indexes_from_getmongodata( - docs, - include_dbs=include_dbs, - exclude_dbs=exclude_dbs, - include_ns_re=include_ns_re, - ) - limitations = find_limitations(rows) - print_report(limitations, "Starting mongosync limitations checker (OFFLINE getMongoData).", args.getmongodata) - - if args.out: - with open(args.out, "w", encoding="utf-8") as f: - json.dump(limitations, f, indent=2) - print(f"\nWrote JSON report to: {args.out}") - - print("\nFinishing mongosync limitations checker.") - return 0 - - except Exception as e: - print(f"An error occurred: {e}", file=sys.stderr) - return 2 - - -if __name__ == "__main__": - raise SystemExit(main()) +// List of system databases to exclude +const excludeDatabases = ['admin', 'config', 'local']; +const byteToMB = (byte) => ((byte / 1024) / 1024).toFixed(2); +const databaseInfo = []; + +// Function to check if an array contains a value +const arrayContains = function(arr, val) { + return arr.indexOf(val) !== -1; +}; + +// Get all databases and exclude system ones +const databases = db.adminCommand('listDatabases').databases.filter(function(database) { + return !arrayContains(excludeDatabases, database.name); +}); + +// Debugging: Log the databases found +//print("Databases found (excluding system databases):"); +//databases.forEach(function(database) { +// print(" - " + database.name); +//}); + +for (var i = 0; i < databases.length; i++) { + const database = databases[i]; + const currentDb = db.getSiblingDB(database.name); + + // Debugging: Log the current database being processed + //print("Processing database: " + database.name); + + // Use getCollectionNames() + const collections = currentDb.getCollectionNames(); + + // Debugging: Log collections found in the database + //print("Collections found in " + database.name + ":"); + //if (collections.length === 0) { + // print(" No collections found."); + //} + collections.forEach(function(collectionName) { + //print(" - " + 
collectionName); + const currentCollection = currentDb.getCollection(collectionName); + const stats = currentCollection.stats(); // Get collection stats + + databaseInfo.push({ + db: database.name, + collection: collectionName, + size_MB: parseFloat(byteToMB(stats.size)), // Collection size in MB + size: stats.size // Size in bytes + }); + }); +} + +// Sort by size (descending order) +databaseInfo.sort(function(a, b) { + return b.size - a.size; +}); + +// Print the sorted list of collections +print("Database | Collection | Size (MB)"); +print("---------------------------------"); +for (var j = 0; j < databaseInfo.length; j++) { + const info = databaseInfo[j]; + print(info.db + " | " + info.collection + " | " + info.size_MB + " MB"); +} From 6ab892a34858ee834d25c5792330c9fbb08bfeb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Fri, 13 Feb 2026 10:31:26 -0300 Subject: [PATCH 16/16] Update mongosync_uniqueindex_limitation_checker.py --- ...ongosync_uniqueindex_limitation_checker.py | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/migration/toolbox/mongosync_uniqueindex_limitation_checker.py b/migration/toolbox/mongosync_uniqueindex_limitation_checker.py index 320fc416..d4d674f1 100644 --- a/migration/toolbox/mongosync_uniqueindex_limitation_checker.py +++ b/migration/toolbox/mongosync_uniqueindex_limitation_checker.py @@ -7,7 +7,12 @@ import re import sys from collections import defaultdict -from typing import Any, Dict, FrozenSet, Iterable, List, Optional, Set, Tuple +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple + + +# Order-preserving, hashable signature for an index key pattern +KeySig = Tuple[Tuple[str, Any], ...] + # ------------------------- @@ -57,32 +62,31 @@ def ns_allowed( # Normalization + core logic # ------------------------- -def normalize_key_pattern(key_obj: Any) -> FrozenSet[Tuple[str, Any]]: +def normalize_key_pattern(key_obj: Any) -> KeySig: """ - Normalize key patterns into a hashable representation. + Normalize index key patterns into an order-preserving, hashable representation. - NOTE: Order-insensitive comparison (frozenset), matching the behavior of your original script. + IMPORTANT: Order matters for compound indexes in MongoDB. 
""" + if isinstance(key_obj, dict): - return frozenset(key_obj.items()) + return tuple((str(k), v) for k, v in key_obj.items()) if isinstance(key_obj, (list, tuple)): pairs: List[Tuple[str, Any]] = [] - ok = True for item in key_obj: if isinstance(item, (list, tuple)) and len(item) == 2: pairs.append((str(item[0]), item[1])) else: - ok = False - break - if ok: - return frozenset(pairs) + return (("<>", str(key_obj)),) + return tuple(pairs) + # Last resort: try .items() (dict-like) try: items = list(key_obj.items()) # type: ignore[attr-defined] - return frozenset((str(k), v) for k, v in items) + return tuple((str(k), v) for k, v in items) except Exception: - return frozenset({("<>", str(key_obj))}) + return (("<>", str(key_obj)),) def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]: @@ -97,7 +101,7 @@ def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any } """ - per_collection: Dict[Tuple[str, str], Dict[FrozenSet[Tuple[str, Any]], Dict[str, List[str]]]] = defaultdict( + per_collection: Dict[Tuple[str, str], Dict[KeySig, Dict[str, List[str]]]] = defaultdict( lambda: defaultdict(lambda: {"unique": [], "non_unique": []}) ) @@ -124,7 +128,7 @@ def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any { "database": db, "collection": coll, - "index_keys": sorted([list(kv) for kv in key_pattern], key=lambda x: str(x[0])), + "index_keys": [list(kv) for kv in key_pattern], "unique_index_names": sorted(set(buckets["unique"])), "non_unique_index_names": sorted(set(buckets["non_unique"])), }