diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..76ab4d9 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,62 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 + + - name: Lint with flake8 + run: | + # Stop on syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # Warn on everything else (non-blocking) + flake8 . --count --exit-zero --max-complexity=15 --max-line-length=120 --statistics + + check-imports: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Verify imports + run: | + # Check that all Python files have valid syntax and imports resolve + python -c " + import ast, sys, pathlib + errors = [] + for p in pathlib.Path('.').rglob('*.py'): + try: + ast.parse(p.read_text()) + except SyntaxError as e: + errors.append(f'{p}: {e}') + if errors: + print('\n'.join(errors)) + sys.exit(1) + print('All Python files have valid syntax.') + " diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..fcbf10f --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,99 @@ +# Contributing to AsyncPP + +Thank you for your interest in contributing to the Asynchronous Pipeline Parallel project! + +## Getting Started + +### Prerequisites + +- Python 3.12+ +- PyTorch 2.5.1+ +- CUDA 12.6+ (for GPU experiments) +- At least 8 GPUs (for running the full pipeline) + +### Setup + +```bash +git clone https://github.com/PluralisResearch/AsyncPP.git +cd AsyncPP +pip install -r requirements.txt +``` + +### Running Experiments + +The main entry point is `run.bash`, which launches both the asynchronous method and GPipe baseline: + +```bash +bash run.bash +``` + +This script configures an 8-stage pipeline on a WikiText-103 language modeling task. Key parameters: + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `nnodes` | 8 | Number of pipeline stages | +| `batch` | 8 | Mini-batch size | +| `epochs` | 50 | Training epochs | +| `lr` | 3e-4 | Learning rate | +| `momentum` | 0.99 | Nesterov momentum (our method) | + +## Project Structure + +``` +AsyncPP/ +├── main_with_runtime.py # Async pipeline parallel training (our method) +├── sync_main.py # Synchronous baselines (GPipe, 1F1B) +├── run.bash # Launch script for experiments +├── data_utils.py # Dataset loading utilities +├── models/ +│ └── gptn/ # GPT model partitioned across pipeline stages +├── runtime/ +│ ├── runtime.py # Core pipeline runtime (scheduling, forward/backward) +│ ├── communication.py # Distributed communication primitives +│ └── runtime_utilities.py +└── optim/ + ├── adamw.py # AdamW optimizer + └── nadamw.py # NAdamW optimizer (Nesterov variant) +``` + +## How to Contribute + +### Reporting Issues + +Open a GitHub issue with: +- A clear description of the problem or suggestion +- Steps to reproduce (for bugs) +- Your environment (Python version, PyTorch version, GPU setup) + +### Pull Requests + +1. Fork the repository +2. Create a feature branch: `git checkout -b my-feature` +3. Make your changes +4. Ensure your code passes linting: `flake8 . --select=E9,F63,F7,F82` +5. Commit with a descriptive message +6. Push and open a PR against `main` + +### Code Style + +- Follow PEP 8 guidelines +- Maximum line length: 120 characters +- Use descriptive variable names +- Add comments for non-obvious logic + +## Citation + +If you use this code in your research, please cite: + +```bibtex +@article{ajanthan2025asyncpp, + title={Nesterov Method for Asynchronous Pipeline Parallel Optimization}, + author={Ajanthan, Thalaiyasingam and Ramasinghe, Sameera and Zuo, Yan and Avraham, Gil and Long, Alexander}, + journal={ICML}, + year={2025} +} +``` + +## License + +This project is licensed under the MIT License. See [LICENSE](LICENSE) for details.