Closed
50 commits
098fdfe
#42 add some splits for HuggingFaceTB/finemath dataset
tyoc213 Feb 26, 2025
d7afb93
#42 generate train and validation from single train split
tyoc213 Mar 1, 2025
d416b7b
don't shuffle
tyoc213 Mar 1, 2025
aaa61de
Create test and validation for finemath-4plus and generate sets of 1M…
tyoc213 Mar 5, 2025
ff706b3
revert changes to convert_dataset_hf.py to generate finemath splits
tyoc213 Mar 5, 2025
4a0a213
Add some logs and file checks
tyoc213 Mar 7, 2025
2a5820e
fix missing import and path check
tyoc213 Mar 7, 2025
c29df23
Generate train/val for tulu-3-sft-olmo-2-mixture without aya source a…
tyoc213 Mar 8, 2025
d00bf8f
starting to work on split_hf_datasets.py
matdmiller Mar 9, 2025
5d4d2f7
Refactor to single file and multiple calls for datasets: finemath, tu…
tyoc213 Mar 14, 2025
367393d
Adding configurations to allow the script to tokenize new datasets
tyoc213 Mar 15, 2025
5ef92f9
OK Splitting into train/test and starting exploring dataset tokenization
tyoc213 Mar 22, 2025
3afadc7
little cleanup
tyoc213 Mar 22, 2025
d750861
numina can now be tokenized, tulu is still failing
tyoc213 Mar 23, 2025
1230588
split q/a messages column into prompt/response and remove unnecessary…
tyoc213 Mar 26, 2025
9db7f4b
`data_prep/split_hf_datasets.py -h` for help and `data_prep/split_hf_…
tyoc213 Mar 28, 2025
96d1bd0
Making use of convert_finetuning_dataset_from_args to tokenize tulu a…
tyoc213 Mar 29, 2025
e76f1ee
feat `download_repo.py` and `modal_script#pull_hf_to_folder`
tyoc213 Mar 30, 2025
c8c0a9c
fix modal DATASETS_VOLUME_MOUNT_PATH and args
tyoc213 Mar 30, 2025
fadae9e
fix autoscaling parameter https://modal.com/docs/guide/modal-1-0-migr…
tyoc213 Mar 30, 2025
dd74ed2
add pull_hf_to_folder which downloads: tulu, numina and finemath hf s…
tyoc213 Mar 31, 2025
d976060
add glaiveai/glaive-code-assistant-v3
tyoc213 Apr 1, 2025
32b5c75
adding avelina python edu
tyoc213 Apr 3, 2025
f0d8012
extra commas and path changes
tyoc213 Apr 7, 2025
947c6ca
fix glaive -1M option, added some extra logs
tyoc213 Apr 8, 2025
b4270d5
revert back to `concurrency_limit` for modal < 0.73.76
tyoc213 Apr 9, 2025
f608b4d
added missing glaive, avelinapythonedu configs
tyoc213 Apr 10, 2025
98705be
missing command-line options in previous commit
tyoc213 Apr 10, 2025
bfea302
adding concat tokens for pretrain datasets that indicates use the toke…
tyoc213 Apr 10, 2025
308ee90
quickfix `"` by `'`
tyoc213 Apr 11, 2025
3cb3109
fix concat_tokens value
tyoc213 Apr 12, 2025
9c40d49
adding kind to make clear which datasets are instruct and which pret…
tyoc213 Apr 24, 2025
3010ece
Override ablations internal configuration with `--one-k` to do only …
tyoc213 Apr 24, 2025
b4645ec
missed 8192 tokens
tyoc213 Apr 24, 2025
6cdf2ca
add constants directly in convert_dataset_hf so that it can be called…
tyoc213 May 2, 2025
8932988
tokenize each row and pad it up to max_length
tyoc213 May 2, 2025
1bc9d97
add glaive preprocessing after_pull on original dataset
tyoc213 May 27, 2025
5ae1dd5
use tokenizer if tokenizer is specified without concat tokens
tyoc213 May 29, 2025
46eb8a3
pretrain set to use tokenizer with concat tokens = None
tyoc213 May 30, 2025
21b52a4
adding system prompt chat template
tyoc213 May 31, 2025
798e838
numina and tulu preproc after_pull
tyoc213 May 31, 2025
7a12c49
`built_tokenizer` when tokenizer exists for pretraining dataset
tyoc213 Jun 5, 2025
bbbeddb
set correct max_seq_len for instruct and pretrain datasets
tyoc213 Jun 7, 2025
a1fed85
preprocs methods need to return the columns that have the template app…
tyoc213 Jun 7, 2025
feb9513
`concat_tokens=max_seq_len` for pretrain datasets
tyoc213 Jun 8, 2025
33bda53
remove dead code
tyoc213 Jun 9, 2025
041433b
extend timeout to process datasets, fix instruct tokenizer, fix paddi…
tyoc213 Jun 12, 2025
b3e86e6
`no_wrap=True` to trim samples for pretrain data
tyoc213 Jun 17, 2025
d768c4f
force enter loop if any token present and break; make batchsize=1 so …
tyoc213 Aug 26, 2025
4dccc9d
add little more info on how to use
tyoc213 Aug 30, 2025
8 changes: 8 additions & 0 deletions llmfoundry/command_utils/__init__.py
@@ -3,6 +3,10 @@
from llmfoundry.command_utils.data_prep.convert_dataset_hf import (
convert_dataset_hf,
convert_dataset_hf_from_args,
DataSplitConstants,
DatasetConstants,
add_dataset_config,
CONSTS,
)
from llmfoundry.command_utils.data_prep.convert_dataset_json import (
convert_dataset_json,
@@ -45,6 +49,10 @@
'eval_from_yaml',
'convert_dataset_hf',
'convert_dataset_hf_from_args',
'add_dataset_config',
'DataSplitConstants',
'DatasetConstants',
'CONSTS',
'convert_dataset_json',
'convert_dataset_json_from_args',
'convert_delta_to_contrastive_mds',
7 changes: 6 additions & 1 deletion llmfoundry/command_utils/data_prep/convert_dataset_hf.py
@@ -160,6 +160,9 @@ def __init__(

CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants}

def add_dataset_config(name, splits):
global CONSTS
CONSTS[name] = splits

def build_hf_dataset(
dataset_name: str,
@@ -348,6 +351,8 @@ def convert_dataset_hf(
else:
mode = ConcatMode.NO_CONCAT
built_tokenizer = None
if tokenizer:
built_tokenizer = build_tokenizer(tokenizer, tokenizer_kwargs)
columns = {'text': 'str'}

for split_name in splits:
@@ -377,7 +382,7 @@
)
loader = build_dataloader(
dataset=hf_dataset,
batch_size=512,
batch_size=1,
num_workers=num_workers,
)
samples = generate_samples(
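
A hedged usage sketch of the `add_dataset_config` hook added in this file: the dataset name and the per-sample numbers below are illustrative, and the keyword arguments mirror how the built-in constants are constructed elsewhere in this PR.

from llmfoundry.command_utils import (
    CONSTS,
    DataSplitConstants,
    DatasetConstants,
    add_dataset_config,
)

# Illustrative numbers; real values would be measured on the dataset itself.
my_consts = DatasetConstants(chars_per_sample=2000, chars_per_token=4)
my_consts.splits['train'] = DataSplitConstants(
    hf_split='train',
    folder_split='train',
    raw_samples=None,
    truncated_samples=None,
)
add_dataset_config('my-org/my-dataset', my_consts)  # hypothetical dataset name
assert 'my-org/my-dataset' in CONSTS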
3 changes: 2 additions & 1 deletion llmfoundry/data/data.py
@@ -161,13 +161,14 @@ def __iter__(self) -> Iterable[dict[str, NDArray]]:
)
iids = encoded['input_ids']
buffer = buffer + self.bos_tokens + iids + self.eos_tokens
while len(buffer) >= self.max_length:
while len(buffer) >= self.max_length or len(buffer) > 0:
concat_sample = buffer[:self.max_length]
buffer = buffer[self.max_length:] if self.should_wrap else []
yield {
# convert to ndarray to store in MDS format
'tokens': np.asarray(concat_sample, dtype=np.int32),
}
break


def stream_remote_local_validate(
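
A standalone sketch of the behaviour this change to `__iter__` introduces (it mirrors the diff above; it is not the library class): with `batch_size=1` set in convert_dataset_hf.py, each tokenized sample now yields exactly one chunk of at most `max_length` tokens, and with `no_wrap` any overflow is trimmed.

def emit_one_chunk(buffer: list, max_length: int, should_wrap: bool):
    # Mirrors the modified while-loop: enter whenever the buffer is non-empty,
    # yield a single chunk, then break so the next sample starts fresh.
    chunks = []
    while len(buffer) >= max_length or len(buffer) > 0:
        concat_sample = buffer[:max_length]
        buffer = buffer[max_length:] if should_wrap else []
        chunks.append(concat_sample)
        break
    return chunks, buffer

# 10 tokens, max_length=8, no_wrap: one 8-token chunk, the last 2 tokens dropped.
print(emit_one_chunk(list(range(10)), max_length=8, should_wrap=False))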
2 changes: 1 addition & 1 deletion scripts/data_prep/convert_dataset_hf.py
@@ -4,7 +4,7 @@
"""Streaming dataset conversion scripts for C4 and The Pile."""
from argparse import ArgumentParser, Namespace

from llmfoundry.command_utils import convert_dataset_hf_from_args
from llmfoundry.command_utils import convert_dataset_hf_from_args, DatasetConstants, DataSplitConstants, add_dataset_config, CONSTS


def parse_args() -> Namespace:
36 changes: 36 additions & 0 deletions scripts/data_prep/dataset_constants_split_config.py
@@ -0,0 +1,36 @@
from llmfoundry.command_utils import DatasetConstants, DataSplitConstants, add_dataset_config

def generate_constants(chars_per_sample, chars_per_token, label=None, splits=("full", 1, 10, 100, 1000)):
ds_const = DatasetConstants(
chars_per_sample=chars_per_sample, # Computed over validation set
chars_per_token=chars_per_token, # OpenAI estimate
)
total_rows = None
# we generate only train and test; use --data_subset <xyzk> --out_root <defj>
ds_const.splits[f"train"] = DataSplitConstants(
hf_split="train",
folder_split=f"train",
raw_samples=total_rows,
truncated_samples=total_rows,
)

ds_const.splits[f"test"] = DataSplitConstants(
hf_split="test",
folder_split=f"test",
raw_samples=total_rows,
truncated_samples=total_rows,
)
return ds_const


def register_new_datasets(target = "LocalResearchGroup"):
_finemath = generate_constants(12163, 4)
add_dataset_config(f"{target}/split-finemath", _finemath)
_tulu = generate_constants(12163, 4)
add_dataset_config(f"{target}/split-tulu-3-sft-olmo-2-mixture", _tulu)
_numina = generate_constants(12163, 4)
add_dataset_config(f"{target}/split-NuminaMath-CoT", _numina)
_pythonedu = generate_constants(12163, 4)
add_dataset_config(f"{target}/split-avelina-python-edu", _pythonedu)
_glaive = generate_constants(12163, 4)
add_dataset_config(f"{target}/split-glaive-code-assistant-v3", _glaive)
70 changes: 70 additions & 0 deletions scripts/data_prep/download_repo.py
@@ -0,0 +1,70 @@
from argparse import ArgumentParser, Namespace, BooleanOptionalAction
from huggingface_hub import HfApi, login
import os


def main(args):
api = HfApi()
datasets = {
"tulu": {
"target": f"{args.repo}/split-tulu-3-sft-olmo-2-mixture",
},
"numina": {
"target": f"{args.repo}/split-NuminaMath-CoT",
},
"finemath" :{
"target": f"{args.repo}/split-finemath",
},
"glaive" : {
"target": f"{args.repo}/split-glaive-code-assistant-v3",
},
"avelinapythonedu": {
"target": f"{args.repo}/split-avelina-python-edu",
},
}

for ds in args.dataset:
ld = f"{args.out}/{ds}"
datadown = datasets[ds]["target"]
print(f"downloading {datadown=} to {ld=}\n")
local_dir = api.snapshot_download(
repo_id=datadown,
repo_type="dataset",
local_dir=ld,
)

def parse_args() -> Namespace:
"""Parse commandline arguments."""
parser = ArgumentParser(
description=
"Downloads tokenized versions of train/test 1M, 100k, 10k, 1k",
)
parser.add_argument(
"--dataset",
nargs="+",
choices=["tulu", "numina", "finemath", "glaive", "avelinapythonedu"],
default=["tulu", "numina", "finemath", "glaive", "avelinapythonedu"],
)

parser.add_argument(
"--repo",
default="LocalResearchGroup",
help="repo containing tokenizations",
)

parser.add_argument(
"--out",
default=".",
help="local download folder",
)

parsed = parser.parse_args()
return parsed


if __name__ == "__main__":
args = parse_args()
if not os.environ.get("HUGGING_FACE_HUB_TOKEN"):
print("No Hugging Face token found. Please login.")
login()
main(args)
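
A hedged sketch of driving the same download programmatically rather than via the CLI; it assumes a Hugging Face token is already configured, and the output folder is illustrative.

from argparse import Namespace

import download_repo

download_repo.main(Namespace(
    dataset=["finemath"],        # any subset of the --dataset choices
    repo="LocalResearchGroup",   # repo/org holding the tokenized splits
    out="./downloads",           # files land under ./downloads/finemath
))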
9 changes: 9 additions & 0 deletions scripts/data_prep/preproc/__init__.py
@@ -0,0 +1,9 @@
from preproc.preprocs import pre_ml_glaive, pre_ml_tulu, pre_ml_numina
__all__ = [
"pre_ml_glaive",
"pre_ml_tulu",
"pre_ml_numina",
# "pre_glaive",
# "pre_tulu",
# "pre_numina",
]
21 changes: 21 additions & 0 deletions scripts/data_prep/preproc/preprocs.py
@@ -0,0 +1,21 @@
from llmfoundry.data.finetuning.tasks import (
DatasetConstructor,
)

dataset_constructor = DatasetConstructor()

@dataset_constructor.register(f"LocalResearchGroup/split-tulu-3-sft-olmo-2-mixture")
def pre_ml_tulu(inp: dict):
return {"prompt": inp["prompt"], "response": inp["response"]}


@dataset_constructor.register(f"LocalResearchGroup/split-NuminaMath-CoT")
def pre_ml_numina(inp: dict):
return {"prompt": inp["prompt"], "response": inp["response"]}


@dataset_constructor.register(f"LocalResearchGroup/split-glaive-code-assistant-v3")
def pre_ml_glaive(inp: dict):
return {"prompt": inp["prompt"], "response": inp["response"]}

