From 24fb395ff3469d35effc9b42fc48ae205110c29f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Tue, 16 Dec 2025 12:06:13 -0300 Subject: [PATCH 01/16] Update README --- migration/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/migration/README.md b/migration/README.md index c88f01e8..c7639497 100644 --- a/migration/README.md +++ b/migration/README.md @@ -4,6 +4,9 @@ This section contains scripts to help before, during, and after migrations. ## [Mongosync Insights](mongosync_insights) This project parses **mongosync** logs and reads the internal database (metadata), generating a variety of plots to assist with monitoring and troubleshooting ongoing mongosync migrations. +## [Toolbox](toolbox) +It contains scripts used by Migration Factory team during the Data Capture Request. + ### License [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0) From 8f68655e34ee05f9f592464f1f0c230b540e9f01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Tue, 16 Dec 2025 12:07:30 -0300 Subject: [PATCH 02/16] toolbox scripts --- migration/toolbox/README.md | 28 +++++++++++ migration/toolbox/collectionSizes.js | 62 ++++++++++++++++++++++++ migration/toolbox/probIndexesComplete.js | 49 +++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 migration/toolbox/README.md create mode 100644 migration/toolbox/collectionSizes.js create mode 100644 migration/toolbox/probIndexesComplete.js diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md new file mode 100644 index 00000000..c3eb1764 --- /dev/null +++ b/migration/toolbox/README.md @@ -0,0 +1,28 @@ +# Toolbox +Toolbox is a collection of scripts used by Migration Factory team to facilitate the Data Capture Request. + +## Database and Collection size + +## Index size, parameters and utilization + +### License + +[Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0) + +DISCLAIMER +---------- +Please note: all tools/ scripts in this repo are released for use "AS IS" **without any warranties of any kind**, +including, but not limited to their installation, use, or performance. We disclaim any and all warranties, either +express or implied, including but not limited to any warranty of noninfringement, merchantability, and/ or fitness +for a particular purpose. We do not warrant that the technology will meet your requirements, that the operation +thereof will be uninterrupted or error-free, or that any errors will be corrected. + +Any use of these scripts and tools is **at your own risk**. There is no guarantee that they have been through +thorough testing in a comparable environment and we are not responsible for any damage or data loss incurred with +their use. + +You are responsible for reviewing and testing any scripts you run *thoroughly* before use in any non-testing +environment. 
+ +Thanks, +The MongoDB Support Team diff --git a/migration/toolbox/collectionSizes.js b/migration/toolbox/collectionSizes.js new file mode 100644 index 00000000..ad4bae99 --- /dev/null +++ b/migration/toolbox/collectionSizes.js @@ -0,0 +1,62 @@ +// List of system databases to exclude +const excludeDatabases = ['admin', 'config', 'local']; +const byteToMB = (byte) => ((byte / 1024) / 1024).toFixed(2); +const databaseInfo = []; + +// Function to check if an array contains a value +const arrayContains = function(arr, val) { + return arr.indexOf(val) !== -1; +}; + +// Get all databases and exclude system ones +const databases = db.adminCommand('listDatabases').databases.filter(function(database) { + return !arrayContains(excludeDatabases, database.name); +}); + +// Debugging: Log the databases found +//print("Databases found (excluding system databases):"); +//databases.forEach(function(database) { +// print(" - " + database.name); +//}); + +for (var i = 0; i < databases.length; i++) { + const database = databases[i]; + const currentDb = db.getSiblingDB(database.name); + + // Debugging: Log the current database being processed + //print("Processing database: " + database.name); + + // Use getCollectionNames() + const collections = currentDb.getCollectionNames(); + + // Debugging: Log collections found in the database + //print("Collections found in " + database.name + ":"); + //if (collections.length === 0) { + // print(" No collections found."); + //} + collections.forEach(function(collectionName) { + //print(" - " + collectionName); + const currentCollection = currentDb.getCollection(collectionName); + const stats = currentCollection.stats(); // Get collection stats + + databaseInfo.push({ + db: database.name, + collection: collectionName, + size_MB: parseFloat(byteToMB(stats.size)), // Collection size in MB + size: stats.size // Size in bytes + }); + }); +} + +// Sort by size (descending order) +databaseInfo.sort(function(a, b) { + return b.size - a.size; +}); + +// Print the sorted list of collections +print("Database | Collection | Size (MB)"); +print("---------------------------------"); +for (var j = 0; j < databaseInfo.length; j++) { + const info = databaseInfo[j]; + print(info.db + " | " + info.collection + " | " + info.size_MB + " MB"); +} \ No newline at end of file diff --git a/migration/toolbox/probIndexesComplete.js b/migration/toolbox/probIndexesComplete.js new file mode 100644 index 00000000..78793e58 --- /dev/null +++ b/migration/toolbox/probIndexesComplete.js @@ -0,0 +1,49 @@ +const indexesUtilization = []; +const excludeDatabases = ['admin', 'config', 'local'] +const byteToMB = (byte) => ((byte/1024)/1024).toFixed(2); + +/* This version is used to get information on only a few DBs, add them to the following line*/ +const databases = db.adminCommand('listDatabases').databases.filter(({ name }) => !excludeDatabases.includes(name)) +const project = { $project: {'ops': "$accesses.ops", 'accesses.since': 1, 'name': 1, 'key': 1, 'spec': 1} }; + + +for (const database of databases) { + const currentDb = db.getSiblingDB(database.name) + + currentDb.getCollectionInfos({ type: "collection" }).forEach(function(collection){ + const currentCollection = currentDb.getCollection(collection.name); + + const indexes = currentCollection.getIndexes(); + const indexesSize = currentCollection.stats().indexSizes; + + currentCollection.aggregate( [ { $indexStats: { } }, project ] ).forEach(function(index){ + + const indexDetail = indexes.filter(i => i.name === index.name)[0]; + const idxValues = 
Object.values(Object.assign({}, index.key)); + + let indexType = "commom"; + if(index.name === '_id_') indexType = '[INTERNAL]'; + else if(idxValues.includes('2dsphere')) indexType = '2dsphere'; + else if(idxValues.includes("geoHaystack")) indexType = 'geoHaystack'; + else if(indexDetail.textIndexVersion !== undefined) indexType = 'text'; + else if(indexDetail.expireAfterSeconds !== undefined) indexType = 'TTL'; + else if(indexDetail.partialFilterExpression !== undefined) indexType = 'Partial'; + + indexesUtilization.push({ + db: database.name, + collection: collection.name, + name: index.name, + type: indexType, + unique: index.spec.unique, + accesses: index.ops, + 'size (MB)': parseFloat(byteToMB(indexesSize[index.name])), + size: indexesSize[index.name], + accesses_since: index.accesses.since, + }) + }); + }) +} + +//const indexesProblematic = indexesUtilization.filter(index => {return index.type === 'TTL'}) +console.table(indexesUtilization); +//console.table(indexesProblematic); \ No newline at end of file From 459d2800df739a1cc957cd356c9c9389a201a279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Tue, 16 Dec 2025 12:17:50 -0300 Subject: [PATCH 03/16] Update README --- migration/toolbox/README.md | 60 +++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md index c3eb1764..432c2e22 100644 --- a/migration/toolbox/README.md +++ b/migration/toolbox/README.md @@ -3,8 +3,68 @@ Toolbox is a collection of scripts used by Migration Factory team to facilitate ## Database and Collection size +**Script:** `collectionSizes.js` + +Lists all databases and collections (excluding system databases: `admin`, `config`, `local`) with their sizes in MB, sorted from largest to smallest. + +### Usage + +```bash +mongosh "mongodb://localhost:27017" --quiet collectionSizes.js +``` + +Or with authentication: + +```bash +mongosh "mongodb://user:password@localhost:27017" --quiet collectionSizes.js +``` + +### Example Output + +``` +Database | Collection | Size (MB) +--------------------------------- +mydb | largeCollection | 1024.50 MB +mydb | mediumCollection | 256.25 MB +otherdb | smallCollection | 12.00 MB +``` + ## Index size, parameters and utilization +**Script:** `probIndexesComplete.js` + +Collects index statistics across all user databases (excluding `admin`, `config`, `local`). 
For each index, it reports:
+- Database and collection name
+- Index name and type (common, TTL, Partial, text, 2dsphere, geoHaystack, or `[INTERNAL]` for `_id_`)
+- Whether the index is unique
+- Access count (ops) and when tracking started
+- Index size in MB and bytes
+
+### Usage
+
+```bash
+mongosh "mongodb://localhost:27017" --quiet probIndexesComplete.js
+```
+
+Or with authentication:
+
+```bash
+mongosh "mongodb://user:password@localhost:27017" --quiet probIndexesComplete.js
+```
+
+### Example Output
+
+```
+┌─────────┬────────┬────────────────┬──────────────┬────────────┬────────┬──────────┬──────────┬─────────┬─────────────────────────┐
+│ (index) │ db     │ collection     │ name         │ type       │ unique │ accesses │ size (MB)│ size    │ accesses_since          │
+├─────────┼────────┼────────────────┼──────────────┼────────────┼────────┼──────────┼──────────┼─────────┼─────────────────────────┤
+│ 0       │ mydb   │ users          │ _id_         │ [INTERNAL] │        │ 150      │ 0.25     │ 262144  │ 2024-01-15T10:30:00.000Z│
+│ 1       │ mydb   │ users          │ email_1      │ common     │ true   │ 1200     │ 0.12     │ 126976  │ 2024-01-15T10:30:00.000Z│
+│ 2       │ mydb   │ sessions       │ _id_         │ [INTERNAL] │        │ 50       │ 0.08     │ 81920   │ 2024-01-15T10:30:00.000Z│
+│ 3       │ mydb   │ sessions       │ expireAt_1   │ TTL        │        │ 0        │ 0.04     │ 40960   │ 2024-01-15T10:30:00.000Z│
+└─────────┴────────┴────────────────┴──────────────┴────────────┴────────┴──────────┴──────────┴─────────┴─────────────────────────┘
+```
+
 ### License
 
 [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0)

From e8e96a71f8f65ddfa26145e326808dc2aafa9d3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Wed, 4 Feb 2026 15:53:09 -0300
Subject: [PATCH 04/16] Add mongosync limitations

---
 migration/toolbox/README.md                   |  24 ++
 .../toolbox/README_limitations_checker.md     | 260 ++++++++++++++
 .../mongosync_limitations_checker_unified.py  | 334 ++++++++++++++++++
 3 files changed, 618 insertions(+)
 create mode 100644 migration/toolbox/README_limitations_checker.md
 create mode 100644 migration/toolbox/mongosync_limitations_checker_unified.py

diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md
index 432c2e22..96d4833b 100644
--- a/migration/toolbox/README.md
+++ b/migration/toolbox/README.md
@@ -65,6 +65,30 @@ mongosh "mongodb://user:password@localhost:27017" --quiet probIndexesComplete.js
 └─────────┴────────┴────────────────┴──────────────┴────────────┴────────┴──────────┴──────────┴─────────┴─────────────────────────┘
 ```
 
+## Mongosync Limitations Checker
+
+**Script:** `mongosync_limitations_checker_unified.py`
+
+Detects a known mongosync limitation where a collection has two indexes with the exact same key pattern—one unique and one non-unique. This condition can cause mongosync to fail during migrations.
+
+The script supports two modes:
+- **Online mode:** Connects directly to a MongoDB cluster via connection string
+- **Offline mode:** Parses a `getMongoData` JSON file (no cluster access required)
+
+### Quick Usage
+
+**Offline (getMongoData):**
+```bash
+python3 mongosync_limitations_checker_unified.py --getmongodata <file>.json
+```
+
+**Online (MongoDB cluster):**
+```bash
+python3 mongosync_limitations_checker_unified.py --uri "mongodb+srv://USER:PASS@host"
+```
+
+For full documentation, filtering options, and examples, see [README_limitations_checker.md](README_limitations_checker.md).
+
 ### License
 
 [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0)
diff --git a/migration/toolbox/README_limitations_checker.md b/migration/toolbox/README_limitations_checker.md
new file mode 100644
index 00000000..6fc28827
--- /dev/null
+++ b/migration/toolbox/README_limitations_checker.md
@@ -0,0 +1,260 @@
+# Mongosync Limitations Checker (Unified)
+
+This script detects a known **mongosync limitation**:
+
+> A collection that has two indexes with the exact same key pattern where one is **unique** and the other is **non-unique**.
+
+This condition can cause mongosync to fail or behave unexpectedly during migrations.
+The script is intended as a **pre-check** for MRAs and migration readiness reviews.
+
+---
+
+## What the script does
+
+For every collection it scans, the script:
+
+1. Retrieves all index definitions.
+2. Separates them into:
+   - **unique** indexes
+   - **non-unique** indexes
+3. Compares index key patterns.
+4. Flags a limitation when it finds the *same key pattern* in both groups.
+
+### Output
+
+- Prints a clean terminal report.
+- Optionally writes a JSON report using `--out`.
+
+Each finding includes:
+- `database`
+- `collection`
+- `index_keys`
+- `unique_index_names`
+- `non_unique_index_names`
+
+**Sample terminal output:**
+
+```
+Starting mongosync limitations checker (ONLINE).
+Input: mongodb+srv://...
+Limitations found: 1
+
+- mydb.users | keys=[['email', 1]] | unique=['email_unique_idx'] | non-unique=['email_idx']
+
+Finishing mongosync limitations checker.
+```
+
+**Sample JSON output** (when using `--out`):
+
+```json
+[
+  {
+    "database": "mydb",
+    "collection": "users",
+    "index_keys": [["email", 1]],
+    "unique_index_names": ["email_unique_idx"],
+    "non_unique_index_names": ["email_idx"]
+  }
+]
+```
+
+---
+
+## What it runs against
+
+The script supports **two modes**.
+
+### Online mode (MongoDB cluster)
+
+Reads indexes directly from a MongoDB deployment using a connection string.
+
+Supported:
+- MongoDB Atlas clusters
+- Self-managed replica sets / Sharded clusters
+
+### Offline mode (getMongoData JSON)
+
+Runs without cluster access by parsing a `getMongoData` output JSON.
+
+---
+
+## Requirements
+
+### Offline mode
+- Python 3.7+
+- No external dependencies
+
+### Online mode
+- Python 3.7+
+- PyMongo:
+```bash
+python3 -m pip install pymongo
+```
+
+---
+
+## Atlas / SRV TLS note
+
+PyMongo uses the Python/OS trust store. On some machines you may need `certifi`:
+```bash
+python3 -m pip install certifi
+```
+Run the script with `--use-certifi-ca` when connecting to Atlas.
+
+---
+
+## Usage
+
+Exactly one mode flag is required.
+```bash
+python3 mongosync_limitations_checker_unified.py \
+(--uri "<connection string>" | --getmongodata <path>) \
+[flags...]
+```
+
+---
+
+## Flags
+
+**Mode selection (required)**
+
+| Flag             | Description                                |
+| ---------------- | ------------------------------------------ |
+| `--uri`          | Online mode. Connect to a MongoDB cluster  |
+| `--getmongodata` | Offline mode. Parse getMongoData JSON      |
+
+---
+
+**Filters (apply to both modes)**
+
+| Flag            | Description                      |
+| --------------- | -------------------------------- |
+| `--include-dbs` | Comma-separated DB allow-list    |
+| `--exclude-dbs` | Comma-separated DB block-list    |
+| `--include-ns`  | Regex applied to `db.collection` |
+
+---
+
+**Output**
+
+| Flag    | Description                   |
+| ------- | ----------------------------- |
+| `--out` | Write findings to a JSON file |
+
+---
+
+**TLS helper (online only)**
+
+| Flag               | Description                                    |
+| ------------------ | ---------------------------------------------- |
+| `--use-certifi-ca` | Use certifi CA bundle (fixes Atlas TLS issues) |
+
+---
+
+## How to use the filters
+
+**Include / exclude DBs**
+
+```bash
+--include-dbs prod_01,prod_02
+--exclude-dbs test,staging
+```
+- System DBs (`admin`, `local`, `config`) are always skipped.
+
+**Namespace regex filter**
+
+The `--include-ns` flag accepts a regex pattern that is searched against the full namespace (`db.collection`):
+
+```bash
+--include-ns "^prod_"    # Namespaces starting with "prod_"
+--include-ns "\.users$"  # Collections ending with "users"
+--include-ns "orders"    # Namespaces containing "orders"
+```
+
+---
+
+## Examples
+
+### Offline (getMongoData)
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--getmongodata <file>.json
+```
+
+With JSON output:
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--getmongodata <file>.json \
+--out <report>.json
+```
+
+Offline + DB filter:
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--getmongodata <file>.json \
+--include-dbs <db1>,<db2> \
+--out <report>.json
+```
+
+---
+
+### Online (non-SRV)
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--uri "mongodb://<user>:<password>@<host>:<port>/admin?appName=<appName>" \
+--out <report>.json
+```
+
+---
+
+### Online (Atlas SRV)
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--uri "mongodb+srv://USER:PASS@<cluster>/admin?appName=checker" \
+--out <report>.json
+```
+
+If you see TLS errors:
+
+```bash
+python3 -m pip install certifi
+```
+
+Then:
+
+```bash
+python3 mongosync_limitations_checker_unified.py \
+--uri "mongodb+srv://USER:PASS@<cluster>/admin?appName=checker" \
+--use-certifi-ca \
+--out <report>.json
+```
+
+---
+
+## Notes
+
+- The script is read-only.
+- Permission errors on specific collections are skipped.
+
+DISCLAIMER
+----------
+Please note: all tools/ scripts in this repo are released for use "AS IS" **without any warranties of any kind**,
+including, but not limited to their installation, use, or performance. We disclaim any and all warranties, either
+express or implied, including but not limited to any warranty of noninfringement, merchantability, and/ or fitness
+for a particular purpose. We do not warrant that the technology will meet your requirements, that the operation
+thereof will be uninterrupted or error-free, or that any errors will be corrected.
+
+Any use of these scripts and tools is **at your own risk**. There is no guarantee that they have been through
+thorough testing in a comparable environment and we are not responsible for any damage or data loss incurred with
+their use.
+
+You are responsible for reviewing and testing any scripts you run *thoroughly* before use in any non-testing
+environment.
+ +Thanks, +The MongoDB Support Team \ No newline at end of file diff --git a/migration/toolbox/mongosync_limitations_checker_unified.py b/migration/toolbox/mongosync_limitations_checker_unified.py new file mode 100644 index 00000000..e8035c5d --- /dev/null +++ b/migration/toolbox/mongosync_limitations_checker_unified.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import json +import re +import sys +from collections import defaultdict +from typing import Any, Dict, FrozenSet, Iterable, List, Optional, Set, Tuple + + +# ------------------------- +# Filter helpers +# ------------------------- + +def _parse_csv_set(value: Optional[str]) -> Optional[Set[str]]: + if not value: + return None + items = [v.strip() for v in value.split(",") if v.strip()] + return set(items) if items else None + + +def _compile_regex(pattern: Optional[str]) -> Optional[re.Pattern]: + if not pattern: + return None + return re.compile(pattern) + + +def ns_allowed( + db: str, + coll: str, + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], +) -> bool: + # include/exclude DBs + if include_dbs is not None and db not in include_dbs: + return False + if exclude_dbs is not None and db in exclude_dbs: + return False + + # system DBs are always excluded + if db in ("admin", "local", "config"): + return False + + # include-ns regex on db.collection + if include_ns_re is not None: + ns = f"{db}.{coll}" + if not include_ns_re.search(ns): + return False + + return True + + +# ------------------------- +# Normalization + core logic +# ------------------------- + +def normalize_key_pattern(key_obj: Any) -> FrozenSet[Tuple[str, Any]]: + """ + Normalize key patterns into a hashable representation. + + NOTE: Order-insensitive comparison (frozenset), matching the behavior of your original script. 
+ """ + if isinstance(key_obj, dict): + return frozenset(key_obj.items()) + + if isinstance(key_obj, (list, tuple)): + pairs: List[Tuple[str, Any]] = [] + ok = True + for item in key_obj: + if isinstance(item, (list, tuple)) and len(item) == 2: + pairs.append((str(item[0]), item[1])) + else: + ok = False + break + if ok: + return frozenset(pairs) + + try: + items = list(key_obj.items()) # type: ignore[attr-defined] + return frozenset((str(k), v) for k, v in items) + except Exception: + return frozenset({("<>", str(key_obj))}) + + +def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + index_rows yields dicts shaped like: + { + "database": str, + "collection": str, + "index_name": str, + "key": , + "unique": bool + } + """ + + per_collection: Dict[Tuple[str, str], Dict[FrozenSet[Tuple[str, Any]], Dict[str, List[str]]]] = defaultdict( + lambda: defaultdict(lambda: {"unique": [], "non_unique": []}) + ) + + for row in index_rows: + db = row.get("database") + coll = row.get("collection") + name = row.get("index_name", "") + key = row.get("key") + unique = bool(row.get("unique", False)) + + if not db or not coll or key is None: + continue + + key_pattern = normalize_key_pattern(key) + bucket = "unique" if unique else "non_unique" + per_collection[(db, coll)][key_pattern][bucket].append(str(name)) + + limitations: List[Dict[str, Any]] = [] + + for (db, coll), by_key in per_collection.items(): + for key_pattern, buckets in by_key.items(): + if buckets["unique"] and buckets["non_unique"]: + limitations.append( + { + "database": db, + "collection": coll, + "index_keys": sorted([list(kv) for kv in key_pattern], key=lambda x: str(x[0])), + "unique_index_names": sorted(set(buckets["unique"])), + "non_unique_index_names": sorted(set(buckets["non_unique"])), + } + ) + + limitations.sort(key=lambda d: (d["database"], d["collection"], str(d["index_keys"]))) + return limitations + + +# ------------------------- +# Offline extractor (getMongoData) +# ------------------------- + +def iter_indexes_from_getmongodata( + docs: List[Dict[str, Any]], + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], +) -> Iterable[Dict[str, Any]]: + for doc in docs: + if doc.get("section") != "data_info": + continue + if doc.get("subsection") != "indexes": + continue + if doc.get("error") is not None: + continue + + params = doc.get("commandParameters") or {} + db = params.get("db") + coll = params.get("collection") + output = doc.get("output") + + if not db or not coll or not isinstance(output, list): + continue + + if not ns_allowed(db, coll, include_dbs, exclude_dbs, include_ns_re): + continue + + for idx in output: + if not isinstance(idx, dict): + continue + + yield { + "database": db, + "collection": coll, + "index_name": idx.get("name", ""), + "key": idx.get("key"), + "unique": bool(idx.get("unique", False)), + } + + +# ------------------------- +# Online extractor (MongoDB cluster) +# ------------------------- + +def iter_indexes_from_cluster( + uri: str, + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], + use_certifi_ca: bool = False, +) -> Iterable[Dict[str, Any]]: + try: + from pymongo import MongoClient + except Exception as e: + raise RuntimeError(f"PyMongo is required for --uri mode. Install with: pip install pymongo. 
Error: {e}") + + client_kwargs: Dict[str, Any] = {} + if use_certifi_ca: + try: + import certifi + client_kwargs["tlsCAFile"] = certifi.where() + except Exception as e: + raise RuntimeError( + f"--use-certifi-ca requested but certifi not available. Install: pip install certifi. Error: {e}" + ) + + client = MongoClient(uri, **client_kwargs) + try: + db_names = client.list_database_names() + for db_name in db_names: + # DB-level filters first + if include_dbs is not None and db_name not in include_dbs: + continue + if exclude_dbs is not None and db_name in exclude_dbs: + continue + if db_name in ("admin", "local", "config"): + continue + + db = client[db_name] + try: + coll_names = db.list_collection_names() + except Exception: + continue + + for coll_name in coll_names: + if not ns_allowed(db_name, coll_name, include_dbs, exclude_dbs, include_ns_re): + continue + + coll = db[coll_name] + try: + for idx in coll.list_indexes(): + yield { + "database": db_name, + "collection": coll_name, + "index_name": idx.get("name", ""), + "key": idx.get("key"), + "unique": bool(idx.get("unique", False)), + } + except Exception: + continue + finally: + client.close() + + +# ------------------------- +# Output helpers +# ------------------------- + +def print_report(limitations: List[Dict[str, Any]], title: str, input_label: str) -> None: + print(title) + print(f"Input: {input_label}") + print(f"Limitations found: {len(limitations)}\n") + + if not limitations: + print("No limitations found.") + return + + for item in limitations: + ns = f"{item['database']}.{item['collection']}" + print( + f"- {ns} | keys={item['index_keys']} " + f"| unique={item['unique_index_names']} | non-unique={item['non_unique_index_names']}" + ) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Unified mongosync limitations checker (online MongoDB cluster OR offline getMongoData JSON)." + ) + + mode = parser.add_mutually_exclusive_group(required=True) + mode.add_argument("--uri", help="MongoDB connection string (online mode).") + mode.add_argument("--getmongodata", help="Path to getMongoData JSON file (offline mode).") + + # Filters + parser.add_argument("--include-dbs", default=None, help="Comma-separated DB list to include (only these DBs).") + parser.add_argument("--exclude-dbs", default=None, help="Comma-separated DB list to exclude.") + parser.add_argument("--include-ns", default=None, help=r'Regex filter on namespace "db.collection". 
Example: "^prod_".')
+
+    # Output / TLS helpers
+    parser.add_argument("--out", default=None, help="Write limitations to a JSON file.")
+    parser.add_argument(
+        "--use-certifi-ca",
+        action="store_true",
+        help="Online mode only: use certifi CA bundle (fixes CERTIFICATE_VERIFY_FAILED on some machines).",
+    )
+
+    args = parser.parse_args()
+
+    include_dbs = _parse_csv_set(args.include_dbs)
+    exclude_dbs = _parse_csv_set(args.exclude_dbs)
+    include_ns_re = _compile_regex(args.include_ns)
+
+    try:
+        if args.uri:
+            rows = iter_indexes_from_cluster(
+                args.uri,
+                include_dbs=include_dbs,
+                exclude_dbs=exclude_dbs,
+                include_ns_re=include_ns_re,
+                use_certifi_ca=args.use_certifi_ca,
+            )
+            limitations = find_limitations(rows)
+            print_report(limitations, "Starting mongosync limitations checker (ONLINE).", args.uri)
+
+        else:
+            with open(args.getmongodata, "r", encoding="utf-8") as f:
+                docs = json.load(f)
+            if not isinstance(docs, list):
+                print("ERROR: getMongoData JSON top-level must be a list.", file=sys.stderr)
+                return 2
+
+            rows = iter_indexes_from_getmongodata(
+                docs,
+                include_dbs=include_dbs,
+                exclude_dbs=exclude_dbs,
+                include_ns_re=include_ns_re,
+            )
+            limitations = find_limitations(rows)
+            print_report(limitations, "Starting mongosync limitations checker (OFFLINE getMongoData).", args.getmongodata)
+
+        if args.out:
+            with open(args.out, "w", encoding="utf-8") as f:
+                json.dump(limitations, f, indent=2)
+            print(f"\nWrote JSON report to: {args.out}")
+
+        print("\nFinishing mongosync limitations checker.")
+        return 0
+
+    except Exception as e:
+        print(f"An error occurred: {e}", file=sys.stderr)
+        return 2
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From dab00f553ada9b4793117f4a0bca7cb082a0b672 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Mon, 9 Feb 2026 15:40:01 -0300
Subject: [PATCH 05/16] Change Limitation chacker name

---
 migration/toolbox/README_limitations_checker.md    | 14 +++++++-------
 ...=> mongosync_uniqueindex_limitation_checker.py} |  6 ++++--
 2 files changed, 11 insertions(+), 9 deletions(-)
 rename migration/toolbox/{mongosync_limitations_checker_unified.py => mongosync_uniqueindex_limitation_checker.py} (97%)

diff --git a/migration/toolbox/README_limitations_checker.md b/migration/toolbox/README_limitations_checker.md
index 6fc28827..4712c9a4 100644
--- a/migration/toolbox/README_limitations_checker.md
+++ b/migration/toolbox/README_limitations_checker.md
@@ -107,7 +107,7 @@ Exactly one mode flag is required.
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 (--uri "<connection string>" | --getmongodata <path>) \
 [flags...]
 ```
@@ -178,14 +178,14 @@ The `--include-ns` flag accepts a regex pattern that is searched against the ful
 ### Offline (getMongoData)
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --getmongodata <file>.json
 ```
 
 With JSON output:
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --getmongodata <file>.json \
 --out <report>.json
 ```
 
 Offline + DB filter:
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --getmongodata <file>.json \
 --include-dbs <db1>,<db2> \
 --out <report>.json
 ```
 
@@ -204,7 +204,7 @@ python3 mongosync_limitations_checker_unified.py \
 ### Online (non-SRV)
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --uri "mongodb://<user>:<password>@<host>:<port>/admin?appName=<appName>" \
 --out <report>.json
 ```
 
@@ -214,7 +214,7 @@ python3 mongosync_limitations_checker_unified.py \
 ### Online (Atlas SRV)
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --uri "mongodb+srv://USER:PASS@<cluster>/admin?appName=checker" \
 --out <report>.json
 ```
 
@@ -228,7 +228,7 @@ python3 -m pip install certifi
 Then:
 
 ```bash
-python3 mongosync_limitations_checker_unified.py \
+python3 mongosync_uniqueindex_limitation_checker.py \
 --uri "mongodb+srv://USER:PASS@<cluster>/admin?appName=checker" \
 --use-certifi-ca \
 --out <report>.json
 ```

diff --git a/migration/toolbox/mongosync_limitations_checker_unified.py b/migration/toolbox/mongosync_uniqueindex_limitation_checker.py
similarity index 97%
rename from migration/toolbox/mongosync_limitations_checker_unified.py
rename to migration/toolbox/mongosync_uniqueindex_limitation_checker.py
index e8035c5d..320fc416 100644
--- a/migration/toolbox/mongosync_limitations_checker_unified.py
+++ b/migration/toolbox/mongosync_uniqueindex_limitation_checker.py
@@ -247,6 +247,7 @@ def iter_indexes_from_cluster(
 def print_report(limitations: List[Dict[str, Any]], title: str, input_label: str) -> None:
     print(title)
     print(f"Input: {input_label}")
+    print("Checking for unique and non-unique indexes on the same field/s...")
     print(f"Limitations found: {len(limitations)}\n")
 
     if not limitations:
@@ -255,9 +256,10 @@ def print_report(limitations: List[Dict[str, Any]], title: str, input_label: str
 
     for item in limitations:
         ns = f"{item['database']}.{item['collection']}"
+        keys_dict = {k: v for k, v in item["index_keys"]}
         print(
-            f"- {ns} | keys={item['index_keys']} "
-            f"| unique={item['unique_index_names']} | non-unique={item['non_unique_index_names']}"
+            f"- {ns} | keys={keys_dict} "
+            f"| uniqueIndex={item['unique_index_names']} | non-uniqueIndex={item['non_unique_index_names']}"
         )

From fa12416481709810e17eb719e2e8da67969522de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Thu, 12 Feb 2026 10:49:27 -0300
Subject: [PATCH 06/16] Update migration/toolbox/probIndexesComplete.js

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 migration/toolbox/probIndexesComplete.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/toolbox/probIndexesComplete.js b/migration/toolbox/probIndexesComplete.js
index 78793e58..6c593b3a 100644
--- a/migration/toolbox/probIndexesComplete.js
+++ b/migration/toolbox/probIndexesComplete.js
@@ -21,7 +21,7 @@ for (const database of databases) {
 
     const indexDetail = indexes.filter(i => i.name === index.name)[0];
     const idxValues = 
Object.values(Object.assign({}, index.key)); - let indexType = "commom"; + let indexType = "common"; if(index.name === '_id_') indexType = '[INTERNAL]'; else if(idxValues.includes('2dsphere')) indexType = '2dsphere'; else if(idxValues.includes("geoHaystack")) indexType = 'geoHaystack'; From d291b6827ec9d5a7f9130c5a1af3e97d3e44f488 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Thu, 12 Feb 2026 10:50:12 -0300 Subject: [PATCH 07/16] Update migration/toolbox/probIndexesComplete.js Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- migration/toolbox/probIndexesComplete.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/migration/toolbox/probIndexesComplete.js b/migration/toolbox/probIndexesComplete.js index 6c593b3a..979561bb 100644 --- a/migration/toolbox/probIndexesComplete.js +++ b/migration/toolbox/probIndexesComplete.js @@ -18,16 +18,16 @@ for (const database of databases) { currentCollection.aggregate( [ { $indexStats: { } }, project ] ).forEach(function(index){ - const indexDetail = indexes.filter(i => i.name === index.name)[0]; + const indexDetail = indexes.find(i => i.name === index.name); const idxValues = Object.values(Object.assign({}, index.key)); - let indexType = "common"; + let indexType = "commom"; if(index.name === '_id_') indexType = '[INTERNAL]'; else if(idxValues.includes('2dsphere')) indexType = '2dsphere'; else if(idxValues.includes("geoHaystack")) indexType = 'geoHaystack'; - else if(indexDetail.textIndexVersion !== undefined) indexType = 'text'; - else if(indexDetail.expireAfterSeconds !== undefined) indexType = 'TTL'; - else if(indexDetail.partialFilterExpression !== undefined) indexType = 'Partial'; + else if(indexDetail?.textIndexVersion !== undefined) indexType = 'text'; + else if(indexDetail?.expireAfterSeconds !== undefined) indexType = 'TTL'; + else if(indexDetail?.partialFilterExpression !== undefined) indexType = 'Partial'; indexesUtilization.push({ db: database.name, From 029d335fdc09036666a54af0de30d1eeea393dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Thu, 12 Feb 2026 10:51:18 -0300 Subject: [PATCH 08/16] Update migration/toolbox/README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- migration/toolbox/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md index 96d4833b..881a6a59 100644 --- a/migration/toolbox/README.md +++ b/migration/toolbox/README.md @@ -67,7 +67,7 @@ mongosh "mongodb://user:password@localhost:27017" --quiet probIndexesComplete.js ## Mongosync Limitations Checker -**Script:** `mongosync_limitations_checker_unified.py` +**Script:** `mongosync_uniqueindex_limitation_checker.py` Detects a known mongosync limitation where a collection has two indexes with the exact same key pattern—one unique and one non-unique. This condition can cause mongosync to fail during migrations. 
From 8b6ea88115c139f93abed47b0f8581657633b820 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Thu, 12 Feb 2026 10:53:02 -0300
Subject: [PATCH 09/16] Update migration/toolbox/probIndexesComplete.js

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 migration/toolbox/probIndexesComplete.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/toolbox/probIndexesComplete.js b/migration/toolbox/probIndexesComplete.js
index 979561bb..dd2be959 100644
--- a/migration/toolbox/probIndexesComplete.js
+++ b/migration/toolbox/probIndexesComplete.js
@@ -2,7 +2,7 @@ const indexesUtilization = [];
 const excludeDatabases = ['admin', 'config', 'local']
 const byteToMB = (byte) => ((byte/1024)/1024).toFixed(2);
 
-/* This version is used to get information on only a few DBs, add them to the following line*/
+/* This version gets information for all non-system DBs. To limit it to specific DBs, edit the filter in the next line (e.g., by adding an explicit include list). */
 const databases = db.adminCommand('listDatabases').databases.filter(({ name }) => !excludeDatabases.includes(name))
 const project = { $project: {'ops': "$accesses.ops", 'accesses.since': 1, 'name': 1, 'key': 1, 'spec': 1} };
 

From 9406e77c361d21d1b76689b1f91bbba25d52266b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Thu, 12 Feb 2026 11:07:01 -0300
Subject: [PATCH 10/16] Update README.md

---
 migration/toolbox/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md
index 881a6a59..9f6a3b73 100644
--- a/migration/toolbox/README.md
+++ b/migration/toolbox/README.md
@@ -79,12 +79,12 @@ The script supports two modes:
 
 **Offline (getMongoData):**
 ```bash
-python3 mongosync_limitations_checker_unified.py --getmongodata <file>.json
+python3 mongosync_uniqueindex_limitation_checker.py --getmongodata <file>.json
 ```
 
 **Online (MongoDB cluster):**
 ```bash
-python3 mongosync_limitations_checker_unified.py --uri "mongodb+srv://USER:PASS@host"
+python3 mongosync_uniqueindex_limitation_checker.py --uri "mongodb+srv://USER:PASS@host"
 ```
 
 For full documentation, filtering options, and examples, see [README_limitations_checker.md](README_limitations_checker.md).

From 58e279e785cb9aa577f73712de82ab99d344ad04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rcio=20Ribeiro?=
Date: Thu, 12 Feb 2026 11:53:02 -0300
Subject: [PATCH 11/16] Update README.md

---
 migration/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/README.md b/migration/README.md
index c7639497..daa77ecf 100644
--- a/migration/README.md
+++ b/migration/README.md
@@ -5,7 +5,7 @@ This section contains scripts to help before, during, and after migrations.
 This project parses **mongosync** logs and reads the internal database (metadata), generating a variety of plots to assist with monitoring and troubleshooting ongoing mongosync migrations.
 
 ## [Toolbox](toolbox)
-It contains scripts used by Migration Factory team during the Data Capture Request.
+Toolbox is a collection of helper scripts created by the Migration Factory team for data capture and analysis ### License From a93e464228fd87ef438fc4e76daa69c79d4487bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Thu, 12 Feb 2026 11:54:39 -0300 Subject: [PATCH 12/16] Update README.md --- migration/toolbox/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/toolbox/README.md b/migration/toolbox/README.md index 9f6a3b73..c8b8b820 100644 --- a/migration/toolbox/README.md +++ b/migration/toolbox/README.md @@ -1,5 +1,5 @@ # Toolbox -Toolbox is a collection of scripts used by Migration Factory team to facilitate the Data Capture Request. +Toolbox is a collection of helper scripts created by the Migration Factory team for data capture and analysis. ## Database and Collection size From e91dcd5984717f2635a81d3253cf6dd465859bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Thu, 12 Feb 2026 11:55:13 -0300 Subject: [PATCH 13/16] Update migration/toolbox/README_limitations_checker.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- migration/toolbox/README_limitations_checker.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/toolbox/README_limitations_checker.md b/migration/toolbox/README_limitations_checker.md index 4712c9a4..6204709c 100644 --- a/migration/toolbox/README_limitations_checker.md +++ b/migration/toolbox/README_limitations_checker.md @@ -39,7 +39,7 @@ Starting mongosync limitations checker (ONLINE). Input: mongodb+srv://... Limitations found: 1 -- mydb.users | keys=[['email', 1]] | unique=['email_unique_idx'] | non-unique=['email_idx'] +- mydb.users | keys={['email', 1]} | uniqueIndex=['email_unique_idx'] | non-uniqueIndex=['email_idx'] Finishing mongosync limitations checker. 
``` From 6058eca184f924d9fadb99f3a6c98f494961a1ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Thu, 12 Feb 2026 12:19:34 -0300 Subject: [PATCH 14/16] Update collectionSizes.js --- migration/toolbox/collectionSizes.js | 402 ++++++++++++++++++++++----- 1 file changed, 340 insertions(+), 62 deletions(-) diff --git a/migration/toolbox/collectionSizes.js b/migration/toolbox/collectionSizes.js index ad4bae99..d4d674f1 100644 --- a/migration/toolbox/collectionSizes.js +++ b/migration/toolbox/collectionSizes.js @@ -1,62 +1,340 @@ -// List of system databases to exclude -const excludeDatabases = ['admin', 'config', 'local']; -const byteToMB = (byte) => ((byte / 1024) / 1024).toFixed(2); -const databaseInfo = []; - -// Function to check if an array contains a value -const arrayContains = function(arr, val) { - return arr.indexOf(val) !== -1; -}; - -// Get all databases and exclude system ones -const databases = db.adminCommand('listDatabases').databases.filter(function(database) { - return !arrayContains(excludeDatabases, database.name); -}); - -// Debugging: Log the databases found -//print("Databases found (excluding system databases):"); -//databases.forEach(function(database) { -// print(" - " + database.name); -//}); - -for (var i = 0; i < databases.length; i++) { - const database = databases[i]; - const currentDb = db.getSiblingDB(database.name); - - // Debugging: Log the current database being processed - //print("Processing database: " + database.name); - - // Use getCollectionNames() - const collections = currentDb.getCollectionNames(); - - // Debugging: Log collections found in the database - //print("Collections found in " + database.name + ":"); - //if (collections.length === 0) { - // print(" No collections found."); - //} - collections.forEach(function(collectionName) { - //print(" - " + collectionName); - const currentCollection = currentDb.getCollection(collectionName); - const stats = currentCollection.stats(); // Get collection stats - - databaseInfo.push({ - db: database.name, - collection: collectionName, - size_MB: parseFloat(byteToMB(stats.size)), // Collection size in MB - size: stats.size // Size in bytes - }); - }); -} - -// Sort by size (descending order) -databaseInfo.sort(function(a, b) { - return b.size - a.size; -}); - -// Print the sorted list of collections -print("Database | Collection | Size (MB)"); -print("---------------------------------"); -for (var j = 0; j < databaseInfo.length; j++) { - const info = databaseInfo[j]; - print(info.db + " | " + info.collection + " | " + info.size_MB + " MB"); -} \ No newline at end of file +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import json +import re +import sys +from collections import defaultdict +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple + + +# Order-preserving, hashable signature for an index key pattern +KeySig = Tuple[Tuple[str, Any], ...] 
+ + + +# ------------------------- +# Filter helpers +# ------------------------- + +def _parse_csv_set(value: Optional[str]) -> Optional[Set[str]]: + if not value: + return None + items = [v.strip() for v in value.split(",") if v.strip()] + return set(items) if items else None + + +def _compile_regex(pattern: Optional[str]) -> Optional[re.Pattern]: + if not pattern: + return None + return re.compile(pattern) + + +def ns_allowed( + db: str, + coll: str, + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], +) -> bool: + # include/exclude DBs + if include_dbs is not None and db not in include_dbs: + return False + if exclude_dbs is not None and db in exclude_dbs: + return False + + # system DBs are always excluded + if db in ("admin", "local", "config"): + return False + + # include-ns regex on db.collection + if include_ns_re is not None: + ns = f"{db}.{coll}" + if not include_ns_re.search(ns): + return False + + return True + + +# ------------------------- +# Normalization + core logic +# ------------------------- + +def normalize_key_pattern(key_obj: Any) -> KeySig: + """ + Normalize index key patterns into an order-preserving, hashable representation. + + IMPORTANT: Order matters for compound indexes in MongoDB. + """ + + if isinstance(key_obj, dict): + return tuple((str(k), v) for k, v in key_obj.items()) + + if isinstance(key_obj, (list, tuple)): + pairs: List[Tuple[str, Any]] = [] + for item in key_obj: + if isinstance(item, (list, tuple)) and len(item) == 2: + pairs.append((str(item[0]), item[1])) + else: + return (("<>", str(key_obj)),) + return tuple(pairs) + + # Last resort: try .items() (dict-like) + try: + items = list(key_obj.items()) # type: ignore[attr-defined] + return tuple((str(k), v) for k, v in items) + except Exception: + return (("<>", str(key_obj)),) + + +def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + index_rows yields dicts shaped like: + { + "database": str, + "collection": str, + "index_name": str, + "key": , + "unique": bool + } + """ + + per_collection: Dict[Tuple[str, str], Dict[KeySig, Dict[str, List[str]]]] = defaultdict( + lambda: defaultdict(lambda: {"unique": [], "non_unique": []}) + ) + + for row in index_rows: + db = row.get("database") + coll = row.get("collection") + name = row.get("index_name", "") + key = row.get("key") + unique = bool(row.get("unique", False)) + + if not db or not coll or key is None: + continue + + key_pattern = normalize_key_pattern(key) + bucket = "unique" if unique else "non_unique" + per_collection[(db, coll)][key_pattern][bucket].append(str(name)) + + limitations: List[Dict[str, Any]] = [] + + for (db, coll), by_key in per_collection.items(): + for key_pattern, buckets in by_key.items(): + if buckets["unique"] and buckets["non_unique"]: + limitations.append( + { + "database": db, + "collection": coll, + "index_keys": [list(kv) for kv in key_pattern], + "unique_index_names": sorted(set(buckets["unique"])), + "non_unique_index_names": sorted(set(buckets["non_unique"])), + } + ) + + limitations.sort(key=lambda d: (d["database"], d["collection"], str(d["index_keys"]))) + return limitations + + +# ------------------------- +# Offline extractor (getMongoData) +# ------------------------- + +def iter_indexes_from_getmongodata( + docs: List[Dict[str, Any]], + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], +) -> Iterable[Dict[str, Any]]: + for doc in docs: + if 
doc.get("section") != "data_info": + continue + if doc.get("subsection") != "indexes": + continue + if doc.get("error") is not None: + continue + + params = doc.get("commandParameters") or {} + db = params.get("db") + coll = params.get("collection") + output = doc.get("output") + + if not db or not coll or not isinstance(output, list): + continue + + if not ns_allowed(db, coll, include_dbs, exclude_dbs, include_ns_re): + continue + + for idx in output: + if not isinstance(idx, dict): + continue + + yield { + "database": db, + "collection": coll, + "index_name": idx.get("name", ""), + "key": idx.get("key"), + "unique": bool(idx.get("unique", False)), + } + + +# ------------------------- +# Online extractor (MongoDB cluster) +# ------------------------- + +def iter_indexes_from_cluster( + uri: str, + include_dbs: Optional[Set[str]], + exclude_dbs: Optional[Set[str]], + include_ns_re: Optional[re.Pattern], + use_certifi_ca: bool = False, +) -> Iterable[Dict[str, Any]]: + try: + from pymongo import MongoClient + except Exception as e: + raise RuntimeError(f"PyMongo is required for --uri mode. Install with: pip install pymongo. Error: {e}") + + client_kwargs: Dict[str, Any] = {} + if use_certifi_ca: + try: + import certifi + client_kwargs["tlsCAFile"] = certifi.where() + except Exception as e: + raise RuntimeError( + f"--use-certifi-ca requested but certifi not available. Install: pip install certifi. Error: {e}" + ) + + client = MongoClient(uri, **client_kwargs) + try: + db_names = client.list_database_names() + for db_name in db_names: + # DB-level filters first + if include_dbs is not None and db_name not in include_dbs: + continue + if exclude_dbs is not None and db_name in exclude_dbs: + continue + if db_name in ("admin", "local", "config"): + continue + + db = client[db_name] + try: + coll_names = db.list_collection_names() + except Exception: + continue + + for coll_name in coll_names: + if not ns_allowed(db_name, coll_name, include_dbs, exclude_dbs, include_ns_re): + continue + + coll = db[coll_name] + try: + for idx in coll.list_indexes(): + yield { + "database": db_name, + "collection": coll_name, + "index_name": idx.get("name", ""), + "key": idx.get("key"), + "unique": bool(idx.get("unique", False)), + } + except Exception: + continue + finally: + client.close() + + +# ------------------------- +# Output helpers +# ------------------------- + +def print_report(limitations: List[Dict[str, Any]], title: str, input_label: str) -> None: + print(title) + print(f"Input: {input_label}") + print("Checking for unique and non-unique indexes on the same field/s...") + print(f"Limitations found: {len(limitations)}\n") + + if not limitations: + print("No limitations found.") + return + + for item in limitations: + ns = f"{item['database']}.{item['collection']}" + keys_dict = {k: v for k, v in item["index_keys"]} + print( + f"- {ns} | keys={keys_dict} " + f"| uniqueIndex={item['unique_index_names']} | non-uniqueIndex={item['non_unique_index_names']}" + ) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Unified mongosync limitations checker (online MongoDB cluster OR offline getMongoData JSON)." 
+ ) + + mode = parser.add_mutually_exclusive_group(required=True) + mode.add_argument("--uri", help="MongoDB connection string (online mode).") + mode.add_argument("--getmongodata", help="Path to getMongoData JSON file (offline mode).") + + # Filters + parser.add_argument("--include-dbs", default=None, help="Comma-separated DB list to include (only these DBs).") + parser.add_argument("--exclude-dbs", default=None, help="Comma-separated DB list to exclude.") + parser.add_argument("--include-ns", default=None, help=r'Regex filter on namespace "db.collection". Example: "^prod_".') + + # Output / TLS helpers + parser.add_argument("--out", default=None, help="Write limitations to a JSON file.") + parser.add_argument( + "--use-certifi-ca", + action="store_true", + help="Online mode only: use certifi CA bundle (fixes CERTIFICATE_VERIFY_FAILED on some machines).", + ) + + args = parser.parse_args() + + include_dbs = _parse_csv_set(args.include_dbs) + exclude_dbs = _parse_csv_set(args.exclude_dbs) + include_ns_re = _compile_regex(args.include_ns) + + try: + if args.uri: + rows = iter_indexes_from_cluster( + args.uri, + include_dbs=include_dbs, + exclude_dbs=exclude_dbs, + include_ns_re=include_ns_re, + use_certifi_ca=args.use_certifi_ca, + ) + limitations = find_limitations(rows) + print_report(limitations, "Starting mongosync limitations checker (ONLINE).", args.uri) + + else: + with open(args.getmongodata, "r", encoding="utf-8") as f: + docs = json.load(f) + if not isinstance(docs, list): + print("ERROR: getMongoData JSON top-level must be a list.", file=sys.stderr) + return 2 + + rows = iter_indexes_from_getmongodata( + docs, + include_dbs=include_dbs, + exclude_dbs=exclude_dbs, + include_ns_re=include_ns_re, + ) + limitations = find_limitations(rows) + print_report(limitations, "Starting mongosync limitations checker (OFFLINE getMongoData).", args.getmongodata) + + if args.out: + with open(args.out, "w", encoding="utf-8") as f: + json.dump(limitations, f, indent=2) + print(f"\nWrote JSON report to: {args.out}") + + print("\nFinishing mongosync limitations checker.") + return 0 + + except Exception as e: + print(f"An error occurred: {e}", file=sys.stderr) + return 2 + + +if __name__ == "__main__": + raise SystemExit(main()) From 1a5547fbf8c4a0d5f33899cc4a804780b646c387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Fri, 13 Feb 2026 10:30:11 -0300 Subject: [PATCH 15/16] Update collectionSizes.js --- migration/toolbox/collectionSizes.js | 402 +++++---------------------- 1 file changed, 62 insertions(+), 340 deletions(-) diff --git a/migration/toolbox/collectionSizes.js b/migration/toolbox/collectionSizes.js index d4d674f1..f6c671e2 100644 --- a/migration/toolbox/collectionSizes.js +++ b/migration/toolbox/collectionSizes.js @@ -1,340 +1,62 @@ -#!/usr/bin/env python3 - -from __future__ import annotations - -import argparse -import json -import re -import sys -from collections import defaultdict -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple - - -# Order-preserving, hashable signature for an index key pattern -KeySig = Tuple[Tuple[str, Any], ...] 
- - - -# ------------------------- -# Filter helpers -# ------------------------- - -def _parse_csv_set(value: Optional[str]) -> Optional[Set[str]]: - if not value: - return None - items = [v.strip() for v in value.split(",") if v.strip()] - return set(items) if items else None - - -def _compile_regex(pattern: Optional[str]) -> Optional[re.Pattern]: - if not pattern: - return None - return re.compile(pattern) - - -def ns_allowed( - db: str, - coll: str, - include_dbs: Optional[Set[str]], - exclude_dbs: Optional[Set[str]], - include_ns_re: Optional[re.Pattern], -) -> bool: - # include/exclude DBs - if include_dbs is not None and db not in include_dbs: - return False - if exclude_dbs is not None and db in exclude_dbs: - return False - - # system DBs are always excluded - if db in ("admin", "local", "config"): - return False - - # include-ns regex on db.collection - if include_ns_re is not None: - ns = f"{db}.{coll}" - if not include_ns_re.search(ns): - return False - - return True - - -# ------------------------- -# Normalization + core logic -# ------------------------- - -def normalize_key_pattern(key_obj: Any) -> KeySig: - """ - Normalize index key patterns into an order-preserving, hashable representation. - - IMPORTANT: Order matters for compound indexes in MongoDB. - """ - - if isinstance(key_obj, dict): - return tuple((str(k), v) for k, v in key_obj.items()) - - if isinstance(key_obj, (list, tuple)): - pairs: List[Tuple[str, Any]] = [] - for item in key_obj: - if isinstance(item, (list, tuple)) and len(item) == 2: - pairs.append((str(item[0]), item[1])) - else: - return (("<>", str(key_obj)),) - return tuple(pairs) - - # Last resort: try .items() (dict-like) - try: - items = list(key_obj.items()) # type: ignore[attr-defined] - return tuple((str(k), v) for k, v in items) - except Exception: - return (("<>", str(key_obj)),) - - -def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]: - """ - index_rows yields dicts shaped like: - { - "database": str, - "collection": str, - "index_name": str, - "key": , - "unique": bool - } - """ - - per_collection: Dict[Tuple[str, str], Dict[KeySig, Dict[str, List[str]]]] = defaultdict( - lambda: defaultdict(lambda: {"unique": [], "non_unique": []}) - ) - - for row in index_rows: - db = row.get("database") - coll = row.get("collection") - name = row.get("index_name", "") - key = row.get("key") - unique = bool(row.get("unique", False)) - - if not db or not coll or key is None: - continue - - key_pattern = normalize_key_pattern(key) - bucket = "unique" if unique else "non_unique" - per_collection[(db, coll)][key_pattern][bucket].append(str(name)) - - limitations: List[Dict[str, Any]] = [] - - for (db, coll), by_key in per_collection.items(): - for key_pattern, buckets in by_key.items(): - if buckets["unique"] and buckets["non_unique"]: - limitations.append( - { - "database": db, - "collection": coll, - "index_keys": [list(kv) for kv in key_pattern], - "unique_index_names": sorted(set(buckets["unique"])), - "non_unique_index_names": sorted(set(buckets["non_unique"])), - } - ) - - limitations.sort(key=lambda d: (d["database"], d["collection"], str(d["index_keys"]))) - return limitations - - -# ------------------------- -# Offline extractor (getMongoData) -# ------------------------- - -def iter_indexes_from_getmongodata( - docs: List[Dict[str, Any]], - include_dbs: Optional[Set[str]], - exclude_dbs: Optional[Set[str]], - include_ns_re: Optional[re.Pattern], -) -> Iterable[Dict[str, Any]]: - for doc in docs: - if 
doc.get("section") != "data_info": - continue - if doc.get("subsection") != "indexes": - continue - if doc.get("error") is not None: - continue - - params = doc.get("commandParameters") or {} - db = params.get("db") - coll = params.get("collection") - output = doc.get("output") - - if not db or not coll or not isinstance(output, list): - continue - - if not ns_allowed(db, coll, include_dbs, exclude_dbs, include_ns_re): - continue - - for idx in output: - if not isinstance(idx, dict): - continue - - yield { - "database": db, - "collection": coll, - "index_name": idx.get("name", ""), - "key": idx.get("key"), - "unique": bool(idx.get("unique", False)), - } - - -# ------------------------- -# Online extractor (MongoDB cluster) -# ------------------------- - -def iter_indexes_from_cluster( - uri: str, - include_dbs: Optional[Set[str]], - exclude_dbs: Optional[Set[str]], - include_ns_re: Optional[re.Pattern], - use_certifi_ca: bool = False, -) -> Iterable[Dict[str, Any]]: - try: - from pymongo import MongoClient - except Exception as e: - raise RuntimeError(f"PyMongo is required for --uri mode. Install with: pip install pymongo. Error: {e}") - - client_kwargs: Dict[str, Any] = {} - if use_certifi_ca: - try: - import certifi - client_kwargs["tlsCAFile"] = certifi.where() - except Exception as e: - raise RuntimeError( - f"--use-certifi-ca requested but certifi not available. Install: pip install certifi. Error: {e}" - ) - - client = MongoClient(uri, **client_kwargs) - try: - db_names = client.list_database_names() - for db_name in db_names: - # DB-level filters first - if include_dbs is not None and db_name not in include_dbs: - continue - if exclude_dbs is not None and db_name in exclude_dbs: - continue - if db_name in ("admin", "local", "config"): - continue - - db = client[db_name] - try: - coll_names = db.list_collection_names() - except Exception: - continue - - for coll_name in coll_names: - if not ns_allowed(db_name, coll_name, include_dbs, exclude_dbs, include_ns_re): - continue - - coll = db[coll_name] - try: - for idx in coll.list_indexes(): - yield { - "database": db_name, - "collection": coll_name, - "index_name": idx.get("name", ""), - "key": idx.get("key"), - "unique": bool(idx.get("unique", False)), - } - except Exception: - continue - finally: - client.close() - - -# ------------------------- -# Output helpers -# ------------------------- - -def print_report(limitations: List[Dict[str, Any]], title: str, input_label: str) -> None: - print(title) - print(f"Input: {input_label}") - print("Checking for unique and non-unique indexes on the same field/s...") - print(f"Limitations found: {len(limitations)}\n") - - if not limitations: - print("No limitations found.") - return - - for item in limitations: - ns = f"{item['database']}.{item['collection']}" - keys_dict = {k: v for k, v in item["index_keys"]} - print( - f"- {ns} | keys={keys_dict} " - f"| uniqueIndex={item['unique_index_names']} | non-uniqueIndex={item['non_unique_index_names']}" - ) - - -def main() -> int: - parser = argparse.ArgumentParser( - description="Unified mongosync limitations checker (online MongoDB cluster OR offline getMongoData JSON)." 
- ) - - mode = parser.add_mutually_exclusive_group(required=True) - mode.add_argument("--uri", help="MongoDB connection string (online mode).") - mode.add_argument("--getmongodata", help="Path to getMongoData JSON file (offline mode).") - - # Filters - parser.add_argument("--include-dbs", default=None, help="Comma-separated DB list to include (only these DBs).") - parser.add_argument("--exclude-dbs", default=None, help="Comma-separated DB list to exclude.") - parser.add_argument("--include-ns", default=None, help=r'Regex filter on namespace "db.collection". Example: "^prod_".') - - # Output / TLS helpers - parser.add_argument("--out", default=None, help="Write limitations to a JSON file.") - parser.add_argument( - "--use-certifi-ca", - action="store_true", - help="Online mode only: use certifi CA bundle (fixes CERTIFICATE_VERIFY_FAILED on some machines).", - ) - - args = parser.parse_args() - - include_dbs = _parse_csv_set(args.include_dbs) - exclude_dbs = _parse_csv_set(args.exclude_dbs) - include_ns_re = _compile_regex(args.include_ns) - - try: - if args.uri: - rows = iter_indexes_from_cluster( - args.uri, - include_dbs=include_dbs, - exclude_dbs=exclude_dbs, - include_ns_re=include_ns_re, - use_certifi_ca=args.use_certifi_ca, - ) - limitations = find_limitations(rows) - print_report(limitations, "Starting mongosync limitations checker (ONLINE).", args.uri) - - else: - with open(args.getmongodata, "r", encoding="utf-8") as f: - docs = json.load(f) - if not isinstance(docs, list): - print("ERROR: getMongoData JSON top-level must be a list.", file=sys.stderr) - return 2 - - rows = iter_indexes_from_getmongodata( - docs, - include_dbs=include_dbs, - exclude_dbs=exclude_dbs, - include_ns_re=include_ns_re, - ) - limitations = find_limitations(rows) - print_report(limitations, "Starting mongosync limitations checker (OFFLINE getMongoData).", args.getmongodata) - - if args.out: - with open(args.out, "w", encoding="utf-8") as f: - json.dump(limitations, f, indent=2) - print(f"\nWrote JSON report to: {args.out}") - - print("\nFinishing mongosync limitations checker.") - return 0 - - except Exception as e: - print(f"An error occurred: {e}", file=sys.stderr) - return 2 - - -if __name__ == "__main__": - raise SystemExit(main()) +// List of system databases to exclude +const excludeDatabases = ['admin', 'config', 'local']; +const byteToMB = (byte) => ((byte / 1024) / 1024).toFixed(2); +const databaseInfo = []; + +// Function to check if an array contains a value +const arrayContains = function(arr, val) { + return arr.indexOf(val) !== -1; +}; + +// Get all databases and exclude system ones +const databases = db.adminCommand('listDatabases').databases.filter(function(database) { + return !arrayContains(excludeDatabases, database.name); +}); + +// Debugging: Log the databases found +//print("Databases found (excluding system databases):"); +//databases.forEach(function(database) { +// print(" - " + database.name); +//}); + +for (var i = 0; i < databases.length; i++) { + const database = databases[i]; + const currentDb = db.getSiblingDB(database.name); + + // Debugging: Log the current database being processed + //print("Processing database: " + database.name); + + // Use getCollectionNames() + const collections = currentDb.getCollectionNames(); + + // Debugging: Log collections found in the database + //print("Collections found in " + database.name + ":"); + //if (collections.length === 0) { + // print(" No collections found."); + //} + collections.forEach(function(collectionName) { + //print(" - " + 
collectionName); + const currentCollection = currentDb.getCollection(collectionName); + const stats = currentCollection.stats(); // Get collection stats + + databaseInfo.push({ + db: database.name, + collection: collectionName, + size_MB: parseFloat(byteToMB(stats.size)), // Collection size in MB + size: stats.size // Size in bytes + }); + }); +} + +// Sort by size (descending order) +databaseInfo.sort(function(a, b) { + return b.size - a.size; +}); + +// Print the sorted list of collections +print("Database | Collection | Size (MB)"); +print("---------------------------------"); +for (var j = 0; j < databaseInfo.length; j++) { + const info = databaseInfo[j]; + print(info.db + " | " + info.collection + " | " + info.size_MB + " MB"); +} From 6ab892a34858ee834d25c5792330c9fbb08bfeb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Ribeiro?= Date: Fri, 13 Feb 2026 10:31:26 -0300 Subject: [PATCH 16/16] Update mongosync_uniqueindex_limitation_checker.py --- ...ongosync_uniqueindex_limitation_checker.py | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/migration/toolbox/mongosync_uniqueindex_limitation_checker.py b/migration/toolbox/mongosync_uniqueindex_limitation_checker.py index 320fc416..d4d674f1 100644 --- a/migration/toolbox/mongosync_uniqueindex_limitation_checker.py +++ b/migration/toolbox/mongosync_uniqueindex_limitation_checker.py @@ -7,7 +7,12 @@ import re import sys from collections import defaultdict -from typing import Any, Dict, FrozenSet, Iterable, List, Optional, Set, Tuple +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple + + +# Order-preserving, hashable signature for an index key pattern +KeySig = Tuple[Tuple[str, Any], ...] + # ------------------------- @@ -57,32 +62,31 @@ def ns_allowed( # Normalization + core logic # ------------------------- -def normalize_key_pattern(key_obj: Any) -> FrozenSet[Tuple[str, Any]]: +def normalize_key_pattern(key_obj: Any) -> KeySig: """ - Normalize key patterns into a hashable representation. + Normalize index key patterns into an order-preserving, hashable representation. - NOTE: Order-insensitive comparison (frozenset), matching the behavior of your original script. + IMPORTANT: Order matters for compound indexes in MongoDB. 
""" + if isinstance(key_obj, dict): - return frozenset(key_obj.items()) + return tuple((str(k), v) for k, v in key_obj.items()) if isinstance(key_obj, (list, tuple)): pairs: List[Tuple[str, Any]] = [] - ok = True for item in key_obj: if isinstance(item, (list, tuple)) and len(item) == 2: pairs.append((str(item[0]), item[1])) else: - ok = False - break - if ok: - return frozenset(pairs) + return (("<>", str(key_obj)),) + return tuple(pairs) + # Last resort: try .items() (dict-like) try: items = list(key_obj.items()) # type: ignore[attr-defined] - return frozenset((str(k), v) for k, v in items) + return tuple((str(k), v) for k, v in items) except Exception: - return frozenset({("<>", str(key_obj))}) + return (("<>", str(key_obj)),) def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]: @@ -97,7 +101,7 @@ def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any } """ - per_collection: Dict[Tuple[str, str], Dict[FrozenSet[Tuple[str, Any]], Dict[str, List[str]]]] = defaultdict( + per_collection: Dict[Tuple[str, str], Dict[KeySig, Dict[str, List[str]]]] = defaultdict( lambda: defaultdict(lambda: {"unique": [], "non_unique": []}) ) @@ -124,7 +128,7 @@ def find_limitations(index_rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any { "database": db, "collection": coll, - "index_keys": sorted([list(kv) for kv in key_pattern], key=lambda x: str(x[0])), + "index_keys": [list(kv) for kv in key_pattern], "unique_index_names": sorted(set(buckets["unique"])), "non_unique_index_names": sorted(set(buckets["non_unique"])), }