diff --git a/contrib/models/gpt2/README.md b/contrib/models/gpt2/README.md index 4bedaa8..8ca1814 100644 --- a/contrib/models/gpt2/README.md +++ b/contrib/models/gpt2/README.md @@ -1,23 +1,37 @@ -# Contrib Model: gpt2 +# Contrib Model: GPT-2 -NeuronX Distributed Inference implementation of gpt2. +NeuronX Distributed Inference implementation of GPT-2. ## Model Information - **HuggingFace ID:** `openai-community/gpt2` - **Model Type:** Decoder-only transformer -- **License:** Check HuggingFace model card +- **Parameters:** ~124M +- **License:** MIT ## Architecture Details -- **Layers:** Check model config -- **Hidden Size:** Check model config -- **Attention Heads:** Check model config -- **Vocabulary:** Check model config +- **Layers:** 12 decoder layers +- **Hidden Size:** 768 +- **Attention Heads:** 12 +- **Intermediate Size:** 3072 +- **Vocabulary:** 50,257 +- **Max Position Embeddings:** 1024 + +### GPT-2-Specific Features + +| Feature | Value | Description | +|---------|-------|-------------| +| Position Embeddings | Absolute | Learned position embeddings (not RoPE) | +| Normalization | LayerNorm | Standard LayerNorm (not RMSNorm) | +| Activation | GELU | GELU activation in MLP | +| QKV Bias | True | Bias in attention projections | +| Tied Embeddings | True | lm_head shares weights with embed_tokens | +| QKV Layout | Fused | Combined QKV projection (c_attn) | ## Validation Results -**Validated:** 2026-01-29 +**Validated:** 2026-02-07 **Configuration:** TP=2, batch_size=1, seq_len=128, bfloat16 ### Test Results @@ -25,49 +39,111 @@ NeuronX Distributed Inference implementation of gpt2. | Test | Status | Result | |------|--------|--------| | Smoke Test | ✅ PASS | Model loads successfully | -| Token Matching | ⚠️ LOW | **20.3% match** | -| Cosine Similarity | ✅ PASS | **1.0000** | +| Token Matching | ✅ PASS | **100% match** | + +### Benchmark Results (LightEval) + +Full evaluation on all samples, compared against HF reference (CPU, float32). 
+ +| Task | Neuron (BF16) | HF (FP32) | Delta | Status | +|------|---------------|-----------|-------|--------| +| arc:challenge | 0.1937 | 0.1903 | +0.003 | ✅ PASS | +| arc:easy | 0.4398 | 0.4381 | +0.002 | ✅ PASS | +| hellaswag (em) | 0.0066 | 0.0050 | +0.002 | ✅ PASS | +| truthfulqa_mc1 | 0.2375 | 0.2277 | +0.010 | ✅ PASS | +| truthfulqa_mc2 | 0.4252 | 0.4069 | +0.018 | ✅ PASS | +| winogrande | 0.4862 | 0.4838 | +0.002 | ✅ PASS | + +All benchmarks pass within ±0.02 absolute (2 percentage points) of the HF reference. Largest delta is truthfulqa_mc2 at +0.018 (1.8 points). + +**Status:** ✅ PASS + +## Implementation Notes + +### Absolute Position Embeddings + +GPT-2 uses learned absolute position embeddings (not RoPE): + +```python +# Token embeddings + Position embeddings +inputs_embeds = self.embed_tokens(input_ids) +position_embeds = self.wpe(position_ids) +hidden_states = inputs_embeds + position_embeds +``` + +### Conv1D Weight Transposition + +GPT-2 uses Conv1D layers which store weights transposed: + +```python +# HuggingFace Conv1D: weight shape [in_features, out_features] +# Standard Linear: weight shape [out_features, in_features] +# Must transpose during state dict conversion +weight = state_dict[f"{layer_prefix}.attn.c_attn.weight"].t().contiguous() +``` + +### Fused QKV Projection + +GPT-2 uses a single combined QKV projection: -**Status:** VALIDATED +```python +# c_attn.weight shape: [hidden_size, 3 * hidden_size] +# Split into Q, K, V +qkv_weight = qkv_weight.t().contiguous() # Transpose Conv1D +q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=0) +``` + +### Vocab Size Padding + +GPT-2's vocab size (50257) is not divisible by common TP degrees. 
Use `pad=True`: + +```python +self.lm_head = ColumnParallelLinear( + config.hidden_size, + config.vocab_size, + bias=False, + gather_output=True, + pad=True, # Enable padding for non-divisible vocab sizes +) +``` ## Usage ```python +import torch from transformers import AutoTokenizer from neuronx_distributed_inference.models.config import NeuronConfig -from neuronx_distributed_inference.utils.hf_adapter import load_pretrained_config - -# Import model classes from src -from src.modeling_gpt2 import Model, Config +from src.modeling_gpt2 import NeuronGPT2ForCausalLM, GPT2InferenceConfig model_path = "/path/to/gpt2/" compiled_model_path = "/path/to/compiled/" -# Configure and use model -# ... (see integration test for full example) +neuron_config = NeuronConfig( + tp_degree=2, + batch_size=1, + seq_len=128, + torch_dtype=torch.bfloat16, +) + +config = GPT2InferenceConfig.from_pretrained(model_path, neuron_config=neuron_config) +model = NeuronGPT2ForCausalLM(model_path, config) +model.compile(compiled_model_path) +model.load(compiled_model_path) + +tokenizer = AutoTokenizer.from_pretrained(model_path) +inputs = tokenizer("Hello, I am a language model", return_tensors="pt") +# Use manual generation loop (see test file for example) ``` ## Compatibility Matrix | Instance/Version | 2.20+ | 2.19 and earlier | |------------------|-------|------------------| -| Trn1 | ✅ Working | Not tested | +| Trn1 | ✅ Functional | Not tested | | Inf2 | Not tested | Not tested | -## Testing - -Run integration tests: - -```bash -pytest nxdi_contrib_models/models/gpt2/test/integration/test_model.py --capture=tee-sys -``` - -## Example Checkpoints - -* openai-community/gpt2 - ## Maintainer -Neuroboros Team - Annapurna Labs +Annapurna Labs -**Last Updated:** 2026-01-30 +**Last Updated:** 2026-02-07 diff --git a/contrib/models/gpt2/src/modeling_gpt2.py b/contrib/models/gpt2/src/modeling_gpt2.py index 019407b..9130c6f 100644 --- a/contrib/models/gpt2/src/modeling_gpt2.py +++ 
b/contrib/models/gpt2/src/modeling_gpt2.py @@ -422,17 +422,19 @@ def init_model(self, config: GPT2InferenceConfig): # Language modeling head # ✅ CRITICAL: lm_head belongs HERE in base model, not in CausalLM wrapper + # Note: GPT2 vocab size (50257) may not be divisible by TP degree, so we use pad=True + # We do NOT tie weights here - the state dict conversion handles weight sharing self.lm_head = ColumnParallelLinear( config.hidden_size, config.vocab_size, bias=False, # GPT2 typically doesn't use bias in lm_head gather_output=True, dtype=config.neuron_config.torch_dtype, + pad=True, # Enable padding for non-divisible vocab sizes ) - - # Tie embeddings if specified - if getattr(config, 'tie_word_embeddings', True): - self.lm_head.weight = self.embed_tokens.weight + + # Note: We don't tie embeddings here because the framework's preshard_hook + # expects separate weights. The state dict conversion handles weight sharing. def forward( self, @@ -577,10 +579,12 @@ def convert_hf_to_neuron_state_dict(state_dict, config): neuron_state_dict["norm.bias"] = state_dict["ln_f.bias"].clone() # Language modeling head + # GPT2 ties embeddings by default, so lm_head.weight = embed_tokens.weight + # We need to provide the weight for the framework's preshard_hook if "lm_head.weight" in state_dict: neuron_state_dict["lm_head.weight"] = state_dict["lm_head.weight"].clone() - elif "wte.weight" in state_dict and getattr(config, 'tie_word_embeddings', True): - # If tied embeddings, use the same weight + elif "wte.weight" in state_dict: + # Use embedding weight for tied embeddings neuron_state_dict["lm_head.weight"] = state_dict["wte.weight"].clone() # Decoder layers (base class strips "transformer." 
prefix, so keys are h.{i}.*) diff --git a/contrib/models/gpt2/test/integration/test_model.py b/contrib/models/gpt2/test/integration/test_model.py index f33a0c8..2de14fe 100755 --- a/contrib/models/gpt2/test/integration/test_model.py +++ b/contrib/models/gpt2/test/integration/test_model.py @@ -1,6 +1,13 @@ #!/usr/bin/env python3 """ -Integration tests for gpt2 NeuronX implementation. +Integration tests for GPT-2 NeuronX implementation. + +This model uses the GPT-2 architecture with: +- Conv1D layers (require weight transposition) +- Vocab size not divisible by common TP degrees (50257, requires padding) +- Tied embeddings (handled via state dict) +- LayerNorm (not RMSNorm) +- Learned positional embeddings (not RoPE) """ import pytest @@ -15,7 +22,7 @@ # Import from src directory import sys sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) -from modeling_gpt2 import * +from modeling_gpt2 import NeuronGPT2ForCausalLM, GPT2InferenceConfig # Test configuration