This repository was archived by the owner on Mar 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 102
105 lines (103 loc) · 5.53 KB
/
test.yml
File metadata and controls
105 lines (103 loc) · 5.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Manually-triggered CI workflow: builds Colossal-AI from source inside a
# CUDA-enabled container on a self-hosted GPU runner, then runs this
# repository's example scripts as end-to-end smoke tests.
name: Test Examples

# Manual trigger only — no push/PR triggers, since the job needs up to
# 8 GPUs on a self-hosted machine.
on: workflow_dispatch

jobs:
  test:
    # Guard: run only on the main branch of the upstream repository, and only
    # when dispatched by one of the listed maintainers, so the self-hosted
    # GPU runner cannot be driven from forks or by arbitrary users.
    if: github.ref_name == 'main' && github.repository == 'hpcaitech/ColossalAI-Examples' && contains(fromJson('["FrankLeeeee", "ver217", "feifeibear", "kurisusnowdeng"]'), github.actor)
    name: Build ColossalAI and test examples
    runs-on: [self-hosted, gpu]
    timeout-minutes: 60
    container:
      # PyTorch 1.10.1 + CUDA 11.3 image; host /data/scratch (datasets) is
      # mounted at /data inside the container.
      image: frankleeeee/pytorch-cuda:1.10.1-11.3.0
      options: --gpus all --rm -v /data/scratch:/data
    steps:
      - name: Install dependencies
        run: |
          pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
          pip install -U pip setuptools==59.5.0 wheel --user
          pip install -U tensorboard transformers timm scipy

      # Check out the Colossal-AI framework itself and install it from source.
      - uses: actions/checkout@v2
        with:
          repository: hpcaitech/ColossalAI
      - name: Install Colossal-AI
        run: |
          pip install -r requirements/requirements.txt
          pip install -v --no-cache-dir .

      # Second checkout with no `repository:` restores this examples repo
      # into the workspace (replacing the framework checkout above).
      - uses: actions/checkout@v2

      - name: Test Feature/AMP
        run: |
          cd features/amp
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_engine.py --config config/config_AMP_apex.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_engine.py --config config/config_AMP_naive.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_engine.py --config config/config_AMP_torch.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_engine.py --config config/config_fp32.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_trainer.py --config config/config_AMP_apex.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_trainer.py --config config/config_AMP_naive.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_trainer.py --config config/config_AMP_torch.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_trainer.py --config config/config_fp32.py
        env:
          DATA: /data

      - name: Test Feature/Gradient Accumulation
        run: |
          cd features/gradient_accumulation
          python -m torch.distributed.run --standalone --nproc_per_node=1 train_with_engine.py
        env:
          DATA: /data/cifar-10

      - name: Test Feature/Gradient Clipping
        run: |
          cd features/gradient_clipping
          python -m torch.distributed.run --standalone --nproc_per_node=1 train_with_engine.py
        env:
          DATA: /data/cifar-10

      - name: Test Feature/Pipeline Parallel
        run: |
          cd features/pipeline_parallel
          python -m torch.distributed.run --standalone --nproc_per_node=4 resnet.py
        env:
          DATA: /data/cifar-10

      # Tensor-parallel layouts need specific world sizes: 1D=2, 2D=4,
      # 2.5D/3D=8 processes.
      - name: Test Feature/Tensor Parallel
        run: |
          cd features/tensor_parallel
          python -m torch.distributed.run --standalone --nproc_per_node=2 tensor_parallel_1d.py --from_torch
          python -m torch.distributed.run --standalone --nproc_per_node=4 tensor_parallel_2d.py --from_torch
          python -m torch.distributed.run --standalone --nproc_per_node=8 tensor_parallel_2p5d.py --from_torch
          python -m torch.distributed.run --standalone --nproc_per_node=8 tensor_parallel_3d.py --from_torch
        env:
          DATA: /data/cifar-10

      - name: Test Feature/Zero
        run: |
          cd features/zero
          python -m torch.distributed.run --standalone --nproc_per_node=1 train.py

      - name: Test Image/Resnet
        run: |
          cd image/resnet
          python -m torch.distributed.run --standalone --nproc_per_node=4 run_resnet_cifar10_with_engine.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 run_resnet_cifar10_with_trainer.py
        env:
          DATA: /data/cifar-10

      # - name: Test Image/ViT Data Parallel
      #   run: |
      #     cd image/vision_transformer/data_parallel
      #     python -m torch.distributed.run --standalone --nproc_per_node=4 train.py --config config.py
      #   env:
      #     DATA: /data/imagenet-100
      # FIXME(ver217): CUDA OOM, too slow

      - name: Test Image/ViT Pipeline Parallel
        run: |
          cd image/vision_transformer/pipeline_parallel
          python -m torch.distributed.run --standalone --nproc_per_node=8 vit.py
        env:
          DATA: /data/cifar-10

      # Runs every GPT-2 parallelism config (1D/2D/2.5D/3D tensor parallel,
      # pipeline, vanilla, and ZeRO variants) on 8 processes.
      - name: Test Language/GPT2
        run: |
          cd language/gpt
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_1d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_2d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_2p5d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_3d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_pp.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_pp1d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_vanilla.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_zero3_pp1d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_zero3.py
        env:
          DATA: /data/gpt_data/small-gpt-dataset.json