This repository was archived by the owner on Mar 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 102
105 lines (103 loc) · 5.53 KB
/
test.yml
File metadata and controls
105 lines (103 loc) · 5.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Manually-triggered CI workflow: builds Colossal-AI from source inside a
# CUDA-enabled container on a self-hosted GPU runner, then runs this
# repository's example scripts as end-to-end smoke tests.
name: Test Examples

# Manual trigger only — no push/PR triggers, since the job needs up to
# 8 GPUs on a self-hosted machine.
on: workflow_dispatch

jobs:
  test:
    # Guard: run only on the main branch of the upstream repository, and only
    # when dispatched by one of the listed maintainers, so the self-hosted
    # GPU runner cannot be driven from forks or by arbitrary users.
    if: github.ref_name == 'main' && github.repository == 'hpcaitech/ColossalAI-Examples' && contains(fromJson('["FrankLeeeee", "ver217", "feifeibear", "kurisusnowdeng"]'), github.actor)
    name: Build ColossalAI and test examples
    runs-on: [self-hosted, gpu]
    timeout-minutes: 60
    container:
      # PyTorch 1.10.1 + CUDA 11.3 image; host /data/scratch (datasets) is
      # mounted at /data inside the container.
      image: frankleeeee/pytorch-cuda:1.10.1-11.3.0
      options: --gpus all --rm -v /data/scratch:/data
    steps:
      - name: Install dependencies
        run: |
          pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
          pip install -U pip setuptools==59.5.0 wheel --user
          pip install -U tensorboard transformers timm scipy

      # Check out the Colossal-AI framework itself and install it from source.
      - uses: actions/checkout@v2
        with:
          repository: hpcaitech/ColossalAI
      - name: Install Colossal-AI
        run: |
          pip install -r requirements/requirements.txt
          pip install -v --no-cache-dir .

      # Second checkout with no `repository:` restores this examples repo
      # into the workspace (replacing the framework checkout above).
      - uses: actions/checkout@v2

      - name: Test Feature/AMP
        run: |
          cd features/amp
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_engine.py --config config/config_AMP_apex.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_engine.py --config config/config_AMP_naive.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_engine.py --config config/config_AMP_torch.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_engine.py --config config/config_fp32.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_trainer.py --config config/config_AMP_apex.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_trainer.py --config config/config_AMP_naive.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_trainer.py --config config/config_AMP_torch.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 train_with_trainer.py --config config/config_fp32.py
        env:
          DATA: /data

      - name: Test Feature/Gradient Accumulation
        run: |
          cd features/gradient_accumulation
          python -m torch.distributed.run --standalone --nproc_per_node=1 train_with_engine.py
        env:
          DATA: /data/cifar-10

      - name: Test Feature/Gradient Clipping
        run: |
          cd features/gradient_clipping
          python -m torch.distributed.run --standalone --nproc_per_node=1 train_with_engine.py
        env:
          DATA: /data/cifar-10

      - name: Test Feature/Pipeline Parallel
        run: |
          cd features/pipeline_parallel
          python -m torch.distributed.run --standalone --nproc_per_node=4 resnet.py
        env:
          DATA: /data/cifar-10

      # Tensor-parallel layouts need specific world sizes: 1D=2, 2D=4,
      # 2.5D/3D=8 processes.
      - name: Test Feature/Tensor Parallel
        run: |
          cd features/tensor_parallel
          python -m torch.distributed.run --standalone --nproc_per_node=2 tensor_parallel_1d.py --from_torch
          python -m torch.distributed.run --standalone --nproc_per_node=4 tensor_parallel_2d.py --from_torch
          python -m torch.distributed.run --standalone --nproc_per_node=8 tensor_parallel_2p5d.py --from_torch
          python -m torch.distributed.run --standalone --nproc_per_node=8 tensor_parallel_3d.py --from_torch
        env:
          DATA: /data/cifar-10

      - name: Test Feature/Zero
        run: |
          cd features/zero
          python -m torch.distributed.run --standalone --nproc_per_node=1 train.py

      - name: Test Image/Resnet
        run: |
          cd image/resnet
          python -m torch.distributed.run --standalone --nproc_per_node=4 run_resnet_cifar10_with_engine.py
          python -m torch.distributed.run --standalone --nproc_per_node=4 run_resnet_cifar10_with_trainer.py
        env:
          DATA: /data/cifar-10

      # - name: Test Image/ViT Data Parallel
      #   run: |
      #     cd image/vision_transformer/data_parallel
      #     python -m torch.distributed.run --standalone --nproc_per_node=4 train.py --config config.py
      #   env:
      #     DATA: /data/imagenet-100
      # FIXME(ver217): CUDA OOM, too slow

      - name: Test Image/ViT Pipeline Parallel
        run: |
          cd image/vision_transformer/pipeline_parallel
          python -m torch.distributed.run --standalone --nproc_per_node=8 vit.py
        env:
          DATA: /data/cifar-10

      # Runs every GPT-2 parallelism config (1D/2D/2.5D/3D tensor parallel,
      # pipeline, vanilla, and ZeRO variants) on 8 processes.
      - name: Test Language/GPT2
        run: |
          cd language/gpt
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_1d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_2d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_2p5d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_3d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_pp.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_pp1d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_vanilla.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_zero3_pp1d.py
          python -m torch.distributed.run --standalone --nproc_per_node=8 train_gpt.py --from_torch --config gpt2_configs/gpt2_zero3.py
        env:
          DATA: /data/gpt_data/small-gpt-dataset.json