From 82f740bb7091ed9d49c1735928b5c5162d55208f Mon Sep 17 00:00:00 2001 From: Nathan Dahlberg Date: Mon, 19 Jan 2026 10:32:37 -0500 Subject: [PATCH 1/7] feat(docketbert): update runpod entrypoint --- runpod_init.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/runpod_init.sh b/runpod_init.sh index d093b31..e31c2bb 100755 --- a/runpod_init.sh +++ b/runpod_init.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash -git fetch origin -git reset --hard origin/main -pip install -e . +apt-get update && apt-get install -y rsync +git pull +pip install -e '.[dev]' pip install flash-attn --no-build-isolation clx config --autoload-env on -export CLX_HOME=/workspace/clx/home +export CLX_HOME=/workspace/clx export HF_HOME=/workspace/hf From 09c7df577f69a4bcec7692a700a8f42f99820beb Mon Sep 17 00:00:00 2001 From: Nathan Dahlberg Date: Mon, 19 Jan 2026 10:33:28 -0500 Subject: [PATCH 2/7] feat(docketbert): add script to pull runpod logs --- projects/docketbert/pull_runpod_logs.py | 35 +++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 projects/docketbert/pull_runpod_logs.py diff --git a/projects/docketbert/pull_runpod_logs.py b/projects/docketbert/pull_runpod_logs.py new file mode 100644 index 0000000..ee323a9 --- /dev/null +++ b/projects/docketbert/pull_runpod_logs.py @@ -0,0 +1,35 @@ +import os +import subprocess + +from clx.settings import CLX_HOME + +RUNPOD_POD_IP = os.getenv("RUNPOD_POD_IP") +RUNPOD_POD_PORT = os.getenv("RUNPOD_POD_PORT") +RUNPOD_SSH_KEY = os.getenv("RUNPOD_SSH_KEY") + + +if __name__ == "__main__": + remote = f"root@{RUNPOD_POD_IP}:/workspace/clx/projects/docketbert/runs" + local = CLX_HOME / "projects" / "docketbert" + exclude_patterns = [ + "*.safetensors", + ] + + cmd = [ + "rsync", + "-avz", + "--progress", + "-e", + f"ssh -i {RUNPOD_SSH_KEY} -p {RUNPOD_POD_PORT}", + ] + + for pattern in exclude_patterns: + cmd.append(f"--exclude={pattern}") + + cmd += [ + remote, + str(local), + ] + + print("Running:", " ".join(cmd)) + subprocess.run(cmd, check=True) From 8d9b3c9f9d8175bb13a96891250fd29986f0d33f Mon Sep 17 00:00:00 2001 From: Nathan Dahlberg Date: Mon, 19 Jan 2026 10:33:59 -0500 Subject: [PATCH 3/7] feat(docketbert): add full model training configs --- projects/docketbert/train.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/projects/docketbert/train.py b/projects/docketbert/train.py index ae0dcda..1c7bfa5 100644 --- a/projects/docketbert/train.py +++ b/projects/docketbert/train.py @@ -9,6 +9,8 @@ from clx.settings import CLX_HOME PROJECT_DIR = CLX_HOME / "projects" / "docketbert" +EXP_DATA_PATH = CLX_HOME / "app_projects" / "docket-entry" / "docs.csv" +FULL_DATA_PATH = PROJECT_DIR / "data" / "train.csv" def create_sliced_model( @@ -35,6 +37,7 @@ def create_sliced_model( def get_experiment_config(experiment, batch_size=None): config = { + "data_path": EXP_DATA_PATH, "task": "mlm", "run_dir_parent": PROJECT_DIR / "runs", "base_model_name": "answerdotai/ModernBERT-base", @@ -234,6 +237,23 @@ def get_experiment_config(experiment, batch_size=None): "global_attn_every_n_layers": 2, } default_batch_size = 8 + elif experiment == "final-base-150M": + default_batch_size = 16 + config["data_path"] = FULL_DATA_PATH + elif experiment == "final-large-395M": + config["base_model_name"] = "answerdotai/ModernBERT-large" + default_batch_size = 8 + config["data_path"] = FULL_DATA_PATH + elif experiment == "final-sliced-175M": + base_model_name = ( + PROJECT_DIR / "runs" / "docketbert-final-large-395M" / "model" + ) + config["base_model_name"] = create_sliced_model( + "final-sliced-large-ft-interleaved-10l", + [0, 3, 6, 9, 12, 15, 18, 21, 24, 27], + base_model_name, + ) + config["data_path"] = FULL_DATA_PATH else: raise ValueError(f"Invalid experiment: {experiment}") @@ -308,7 +328,6 @@ def train_docketbert( overwrite, resume, check_params, mem_test, experiment, batch_size, exit ): """Train a docket language model.""" - from clx.models import DocketEntry try: if resume and overwrite: @@ -316,8 +335,12 @@ def train_docketbert( "Cannot use --resume and --overwrite together." ) + config = get_experiment_config(experiment, batch_size) + + data_path = config.pop("data_path") + data = pd.read_csv( - DocketEntry.get_project().cached_documents_path, + data_path, usecols=["text"], nrows=200000 if mem_test else None, ) @@ -325,8 +348,6 @@ def train_docketbert( train_data = data.head(-100000) eval_data = data.tail(100000) - config = get_experiment_config(experiment, batch_size) - if mem_test: config["tokenize_args"]["padding"] = "max_length" From 86cc615c0af8b3c0979a5fd8006ef810274fed57 Mon Sep 17 00:00:00 2001 From: Nathan Dahlberg Date: Mon, 19 Jan 2026 10:37:21 -0500 Subject: [PATCH 4/7] feat(docketbert): assemble large training set Fixes: #96 --- projects/docketbert/prepare_train_data.py | 95 +++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 projects/docketbert/prepare_train_data.py diff --git a/projects/docketbert/prepare_train_data.py b/projects/docketbert/prepare_train_data.py new file mode 100644 index 0000000..26de397 --- /dev/null +++ b/projects/docketbert/prepare_train_data.py @@ -0,0 +1,95 @@ +import os +from pathlib import Path + +import pandas as pd +import psycopg2 +from tqdm import tqdm + +from clx import pd_save_or_append +from clx.settings import CLX_HOME + +PROJECT_DIR = CLX_HOME / "projects" / "docketbert" + +DB_CONFIG = { + "host": os.getenv("DEV_DB_HOST"), + "port": int(os.getenv("DEV_DB_PORT", "5432")), + "dbname": os.getenv("DEV_DB_NAME"), + "user": os.getenv("DEV_DB_USER"), + "password": os.getenv("DEV_DB_PASSWORD"), +} + + +def pull_dev_data(table_name, nrows=5_000_000, batch_size=1_000_000) -> None: + data_path = Path(PROJECT_DIR / "data" / f"{table_name}_descriptions.csv") + data_path.parent.mkdir(parents=True, exist_ok=True) + + conn = psycopg2.connect(**DB_CONFIG) + + last_id = None + current_rows = 0 + if data_path.exists(): + chunks = pd.read_csv(data_path, chunksize=batch_size) + for chunk in chunks: + current_rows += len(chunk) + min_id = chunk["id"].min() + last_id = min_id if last_id is None else min(last_id, min_id) + + progress = tqdm(total=nrows, desc="Downloading") + progress.update(current_rows) + + try: + while current_rows < nrows: + last_id_condition = ( + "" if last_id is None else f"AND id < {last_id}" + ) + with conn.cursor() as cur: + cur.execute(f""" + SELECT id, description FROM {table_name} + WHERE description IS NOT NULL + AND description <> '' + {last_id_condition} + ORDER BY id DESC + LIMIT {batch_size} + """) + rows = cur.fetchall() + if not rows: + break + col_names = [desc[0] for desc in cur.description] + data = pd.DataFrame(rows, columns=col_names) + last_id = data["id"].min() + current_rows += len(data) + progress.update(len(data)) + pd_save_or_append(data, data_path) + finally: + conn.close() + + +def consolidate_data(): + d1 = pd.read_csv(CLX_HOME / "app_projects" / "docket-entry" / "docs.csv")[ + "text" + ] + d2 = pd.read_csv( + CLX_HOME / "app_projects" / "docket-entry-short" / "docs.csv" + )["text"] + d3 = pd.read_csv( + PROJECT_DIR / "data" / "search_recapdocument_descriptions.csv", + usecols=["description"], + ) + d3 = d3.rename(columns={"description": "text"}) + d4 = pd.read_csv( + PROJECT_DIR / "data" / "search_docketentry_descriptions.csv", + usecols=["description"], + ) + d4 = d4.rename(columns={"description": "text"}) + data = pd.concat([d1, d2, d3, d4]) + data = data.drop_duplicates("text") + data = data.sample(frac=1) + return data + + +if __name__ == "__main__": + pull_dev_data("search_docketentry", nrows=40_000_000) + pull_dev_data("search_recapdocument", nrows=20_000_000) + data = consolidate_data() + data.to_csv(PROJECT_DIR / "data" / "train.csv", index=False) + print(data) From db514039946daff5412ca1285c535f0ba14bef6b Mon Sep 17 00:00:00 2001 From: Nathan Dahlberg Date: Mon, 19 Jan 2026 17:17:07 -0500 Subject: [PATCH 5/7] feat(ml): pass data path to training run for streaming --- clx/ml/training_run.py | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/clx/ml/training_run.py b/clx/ml/training_run.py index 8edaae2..1f5efdf 100644 --- a/clx/ml/training_run.py +++ b/clx/ml/training_run.py @@ -11,7 +11,7 @@ import requests import simplejson as json import torch -from datasets import Dataset +from datasets import Dataset, IterableDataset, load_dataset from tqdm import tqdm from transformers import ( AutoModel, @@ -185,8 +185,8 @@ def pipe(self) -> Pipeline: def train( self, - train_data: pd.DataFrame, - eval_data: pd.DataFrame | None = None, + train_data: pd.DataFrame | Path | str, + eval_data: pd.DataFrame | Path | str | None = None, overwrite: bool = False, resume_from_checkpoint: str | bool | None = None, lazy_tokenize: bool = False, @@ -223,13 +223,25 @@ def train( "Please delete it or set `overwrite=True` to overwrite it.", ) - # Validate the input data format - self.validate_data_format(train_data) - if eval_data is not None: - self.validate_data_format(eval_data) - # Prepare the datasets - def prepare_dataset(data: pd.DataFrame) -> Dataset: + def prepare_dataset( + data: pd.DataFrame | Path | str, + ) -> Dataset | IterableDataset: + if isinstance(data, str | Path): + dataset = load_dataset( + "csv", + data_files=str(data), + streaming=True, + )["train"] + dataset = dataset.map(self.tokenize, batched=True) + dataset = dataset.select_columns(self.dataset_cols) + dataset = dataset.shuffle(buffer_size=10_000) + count = 0 + for chunk in pd.read_csv(data, chunksize=1_000_000): + count += len(chunk) + return dataset, count + + self.validate_data_format(data) dataset = Dataset.from_pandas(data) if lazy_tokenize: @@ -238,11 +250,11 @@ def prepare_dataset(data: pd.DataFrame) -> Dataset: dataset = dataset.map(self.tokenize, batched=True) dataset = dataset.select_columns(self.dataset_cols) dataset.set_format(type="torch") - return dataset + return dataset, len(data) - train_dataset = prepare_dataset(train_data) - eval_dataset = ( - None if eval_data is None else prepare_dataset(eval_data) + train_dataset, train_count = prepare_dataset(train_data) + eval_dataset, eval_count = ( + (None, 0) if eval_data is None else prepare_dataset(eval_data) ) if callbacks is None: @@ -284,8 +296,8 @@ def prepare_dataset(data: pd.DataFrame) -> Dataset: # Evaluate the model if eval_dataset is not None: eval_results = trainer.evaluate(eval_dataset) - eval_results["num_train_examples"] = len(train_data) - eval_results["num_eval_examples"] = len(eval_data) + eval_results["num_train_examples"] = train_count + eval_results["num_eval_examples"] = eval_count self.results_path.write_text(json.dumps(eval_results, indent=4)) # Save the model From 274410e2319cadf4fdb25fafe4a444dc1a197ae0 Mon Sep 17 00:00:00 2001 From: Nathan Dahlberg Date: Mon, 19 Jan 2026 17:17:35 -0500 Subject: [PATCH 6/7] feat(docketbert): update docketbert train config --- projects/docketbert/prepare_train_data.py | 10 +++---- projects/docketbert/train.py | 36 ++++++++++++++--------- runpod_init.sh | 6 ++-- 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/projects/docketbert/prepare_train_data.py b/projects/docketbert/prepare_train_data.py index 26de397..daba9f2 100644 --- a/projects/docketbert/prepare_train_data.py +++ b/projects/docketbert/prepare_train_data.py @@ -74,13 +74,11 @@ def consolidate_data(): d3 = pd.read_csv( PROJECT_DIR / "data" / "search_recapdocument_descriptions.csv", usecols=["description"], - ) - d3 = d3.rename(columns={"description": "text"}) + ).rename(columns={"description": "text"}) d4 = pd.read_csv( PROJECT_DIR / "data" / "search_docketentry_descriptions.csv", usecols=["description"], - ) - d4 = d4.rename(columns={"description": "text"}) + ).rename(columns={"description": "text"}) data = pd.concat([d1, d2, d3, d4]) data = data.drop_duplicates("text") data = data.sample(frac=1) @@ -91,5 +89,7 @@ def consolidate_data(): pull_dev_data("search_docketentry", nrows=40_000_000) pull_dev_data("search_recapdocument", nrows=20_000_000) data = consolidate_data() + eval_data = data.tail(100000) + data = data.head(-100000) data.to_csv(PROJECT_DIR / "data" / "train.csv", index=False) - print(data) + eval_data.to_csv(PROJECT_DIR / "data" / "eval.csv", index=False) diff --git a/projects/docketbert/train.py b/projects/docketbert/train.py index 1c7bfa5..214081e 100644 --- a/projects/docketbert/train.py +++ b/projects/docketbert/train.py @@ -10,7 +10,8 @@ PROJECT_DIR = CLX_HOME / "projects" / "docketbert" EXP_DATA_PATH = CLX_HOME / "app_projects" / "docket-entry" / "docs.csv" -FULL_DATA_PATH = PROJECT_DIR / "data" / "train.csv" +FULL_DATA_TRAIN_PATH = PROJECT_DIR / "data" / "train.csv" +FULL_DATA_EVAL_PATH = PROJECT_DIR / "data" / "eval.csv" def create_sliced_model( @@ -37,7 +38,7 @@ def create_sliced_model( def get_experiment_config(experiment, batch_size=None): config = { - "data_path": EXP_DATA_PATH, + "use_full_data": False, "task": "mlm", "run_dir_parent": PROJECT_DIR / "runs", "base_model_name": "answerdotai/ModernBERT-base", @@ -238,12 +239,14 @@ def get_experiment_config(experiment, batch_size=None): } default_batch_size = 8 elif experiment == "final-base-150M": + config["training_args"]["max_steps"] = 40761591 + config["use_full_data"] = True default_batch_size = 16 - config["data_path"] = FULL_DATA_PATH elif experiment == "final-large-395M": config["base_model_name"] = "answerdotai/ModernBERT-large" + config["training_args"]["max_steps"] = 40761591 + config["use_full_data"] = True default_batch_size = 8 - config["data_path"] = FULL_DATA_PATH elif experiment == "final-sliced-175M": base_model_name = ( PROJECT_DIR / "runs" / "docketbert-final-large-395M" / "model" @@ -253,7 +256,8 @@ def get_experiment_config(experiment, batch_size=None): [0, 3, 6, 9, 12, 15, 18, 21, 24, 27], base_model_name, ) - config["data_path"] = FULL_DATA_PATH + config["training_args"]["max_steps"] = 40761591 + config["use_full_data"] = True else: raise ValueError(f"Invalid experiment: {experiment}") @@ -337,16 +341,20 @@ def train_docketbert( config = get_experiment_config(experiment, batch_size) - data_path = config.pop("data_path") + use_full_data = config.pop("use_full_data") - data = pd.read_csv( - data_path, - usecols=["text"], - nrows=200000 if mem_test else None, - ) - data = data.sample(frac=1, random_state=42) - train_data = data.head(-100000) - eval_data = data.tail(100000) + if use_full_data: + train_data = FULL_DATA_TRAIN_PATH + eval_data = FULL_DATA_EVAL_PATH + else: + data = pd.read_csv( + EXP_DATA_PATH, + usecols=["text"], + nrows=200000 if mem_test else None, + ) + data = data.sample(frac=1, random_state=42) + train_data = data.head(-100000) + eval_data = data.tail(100000) if mem_test: config["tokenize_args"]["padding"] = "max_length" diff --git a/runpod_init.sh b/runpod_init.sh index e31c2bb..de1b903 100755 --- a/runpod_init.sh +++ b/runpod_init.sh @@ -5,5 +5,7 @@ git pull pip install -e '.[dev]' pip install flash-attn --no-build-isolation clx config --autoload-env on -export CLX_HOME=/workspace/clx -export HF_HOME=/workspace/hf +cat > .env << 'EOF' +CLX_HOME=/workspace/clx +HF_HOME=/workspace/hf +EOF From 187a8cab53e07008af647b357acf88b88a0640da Mon Sep 17 00:00:00 2001 From: Nathan Dahlberg Date: Mon, 19 Jan 2026 17:43:36 -0500 Subject: [PATCH 7/7] feat(docketbert): update max_steps --- projects/docketbert/pull_runpod_logs.py | 2 ++ projects/docketbert/train.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/projects/docketbert/pull_runpod_logs.py b/projects/docketbert/pull_runpod_logs.py index ee323a9..f701559 100644 --- a/projects/docketbert/pull_runpod_logs.py +++ b/projects/docketbert/pull_runpod_logs.py @@ -13,6 +13,8 @@ local = CLX_HOME / "projects" / "docketbert" exclude_patterns = [ "*.safetensors", + "*.pt", + "*.csv", ] cmd = [ diff --git a/projects/docketbert/train.py b/projects/docketbert/train.py index 214081e..3036098 100644 --- a/projects/docketbert/train.py +++ b/projects/docketbert/train.py @@ -239,12 +239,12 @@ def get_experiment_config(experiment, batch_size=None): } default_batch_size = 8 elif experiment == "final-base-150M": - config["training_args"]["max_steps"] = 40761591 + config["training_args"]["max_steps"] = 40761591 // 256 config["use_full_data"] = True default_batch_size = 16 elif experiment == "final-large-395M": config["base_model_name"] = "answerdotai/ModernBERT-large" - config["training_args"]["max_steps"] = 40761591 + config["training_args"]["max_steps"] = 40761591 // 256 config["use_full_data"] = True default_batch_size = 8 elif experiment == "final-sliced-175M": @@ -256,7 +256,7 @@ def get_experiment_config(experiment, batch_size=None): [0, 3, 6, 9, 12, 15, 18, 21, 24, 27], base_model_name, ) - config["training_args"]["max_steps"] = 40761591 + config["training_args"]["max_steps"] = 40761591 // 256 config["use_full_data"] = True else: raise ValueError(f"Invalid experiment: {experiment}")