diff --git a/contrib/models/gpt2/README.md b/contrib/models/gpt2/README.md
index 4bedaa8..8ca1814 100644
--- a/contrib/models/gpt2/README.md
+++ b/contrib/models/gpt2/README.md
@@ -1,23 +1,37 @@
-# Contrib Model: gpt2
+# Contrib Model: GPT-2
 
-NeuronX Distributed Inference implementation of gpt2.
+NeuronX Distributed Inference implementation of GPT-2.
 
 ## Model Information
 
 - **HuggingFace ID:** `openai-community/gpt2`
 - **Model Type:** Decoder-only transformer
-- **License:** Check HuggingFace model card
+- **Parameters:** ~124M
+- **License:** MIT
 
 ## Architecture Details
 
-- **Layers:** Check model config
-- **Hidden Size:** Check model config
-- **Attention Heads:** Check model config
-- **Vocabulary:** Check model config
+- **Layers:** 12 decoder layers
+- **Hidden Size:** 768
+- **Attention Heads:** 12
+- **Intermediate Size:** 3072
+- **Vocabulary:** 50,257
+- **Max Position Embeddings:** 1024
+
+### GPT-2-Specific Features
+
+| Feature | Value | Description |
+|---------|-------|-------------|
+| Position Embeddings | Absolute | Learned position embeddings (not RoPE) |
+| Normalization | LayerNorm | Standard LayerNorm (not RMSNorm) |
+| Activation | GELU | GELU activation in MLP |
+| QKV Bias | True | Bias in attention projections |
+| Tied Embeddings | True | lm_head shares weights with embed_tokens |
+| QKV Layout | Fused | Combined QKV projection (c_attn) |
 
 ## Validation Results
 
-**Validated:** 2026-01-29  
+**Validated:** 2026-02-07  
 **Configuration:** TP=2, batch_size=1, seq_len=128, bfloat16
 
 ### Test Results
@@ -25,49 +39,111 @@ NeuronX Distributed Inference implementation of gpt2.
 | Test | Status | Result |
 |------|--------|--------|
 | Smoke Test | ✅ PASS | Model loads successfully |
-| Token Matching | ⚠️ LOW | **20.3% match** |
-| Cosine Similarity | ✅ PASS | **1.0000** |
+| Token Matching | ✅ PASS | **100% match** |
+
+### Benchmark Results (LightEval)
+
+Full evaluation on all samples, compared against HF reference (CPU, float32).
+
+| Task | Neuron (BF16) | HF (FP32) | Delta | Status |
+|------|---------------|-----------|-------|--------|
+| arc:challenge | 0.1937 | 0.1903 | +0.003 | ✅ PASS |
+| arc:easy | 0.4398 | 0.4381 | +0.002 | ✅ PASS |
+| hellaswag (em) | 0.0066 | 0.0050 | +0.002 | ✅ PASS |
+| truthfulqa_mc1 | 0.2375 | 0.2277 | +0.010 | ✅ PASS |
+| truthfulqa_mc2 | 0.4252 | 0.4069 | +0.018 | ✅ PASS |
+| winogrande | 0.4862 | 0.4838 | +0.002 | ✅ PASS |
+
+All benchmarks are within ±0.02 absolute (2 percentage points) of the HF reference. The largest delta is truthfulqa_mc2 at +0.018 (1.8 points).
+
+**Status:** ✅ PASS
+
+## Implementation Notes
+
+### Absolute Position Embeddings
+
+GPT-2 uses learned absolute position embeddings (not RoPE):
+
+```python
+# Token embeddings + Position embeddings
+inputs_embeds = self.embed_tokens(input_ids)
+position_embeds = self.wpe(position_ids)
+hidden_states = inputs_embeds + position_embeds
+```
+
+### Conv1D Weight Transposition
+
+GPT-2 uses HuggingFace `Conv1D` layers, which store weights transposed relative to `nn.Linear`:
+
+```python
+# HuggingFace Conv1D: weight shape [in_features, out_features]
+# Standard Linear: weight shape [out_features, in_features]
+# Must transpose during state dict conversion
+weight = state_dict[f"{layer_prefix}.attn.c_attn.weight"].t().contiguous()
+```
+
+### Fused QKV Projection
+
+GPT-2 uses a single combined QKV projection:
 
-**Status:** VALIDATED
+```python
+# c_attn.weight shape: [hidden_size, 3 * hidden_size]
+# Split into Q, K, V
+qkv_weight = qkv_weight.t().contiguous()  # Transpose Conv1D
+q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=0)
+```
+
+### Vocab Size Padding
+
+GPT-2's vocab size (50257) is not divisible by common TP degrees. Use `pad=True`:
+
+```python
+self.lm_head = ColumnParallelLinear(
+    config.hidden_size,
+    config.vocab_size,
+    bias=False,
+    gather_output=True,
+    pad=True,  # Enable padding for non-divisible vocab sizes
+)
+```
 
 ## Usage
 
 ```python
+import torch
 from transformers import AutoTokenizer
 from neuronx_distributed_inference.models.config import NeuronConfig
-from neuronx_distributed_inference.utils.hf_adapter import load_pretrained_config
-
-# Import model classes from src
-from src.modeling_gpt2 import Model, Config
+from src.modeling_gpt2 import NeuronGPT2ForCausalLM, GPT2InferenceConfig
 
 model_path = "/path/to/gpt2/"
 compiled_model_path = "/path/to/compiled/"
 
-# Configure and use model
-# ... (see integration test for full example)
+neuron_config = NeuronConfig(
+    tp_degree=2,
+    batch_size=1,
+    seq_len=128,
+    torch_dtype=torch.bfloat16,
+)
+
+config = GPT2InferenceConfig.from_pretrained(model_path, neuron_config=neuron_config)
+model = NeuronGPT2ForCausalLM(model_path, config)
+model.compile(compiled_model_path)
+model.load(compiled_model_path)
+
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+inputs = tokenizer("Hello, I am a language model", return_tensors="pt")
+# Use a manual generation loop (see test/integration/test_model.py for an example)
 ```
 
 ## Compatibility Matrix
 
 | Instance/Version | 2.20+ | 2.19 and earlier |
 |------------------|-------|------------------|
-| Trn1             | ✅ Working | Not tested |
+| Trn1             | ✅ Functional | Not tested |
 | Inf2             | Not tested | Not tested |
 
-## Testing
-
-Run integration tests:
-
-```bash
-pytest nxdi_contrib_models/models/gpt2/test/integration/test_model.py --capture=tee-sys
-```
-
-## Example Checkpoints
-
-* openai-community/gpt2
-
 ## Maintainer
 
-Neuroboros Team - Annapurna Labs
+Annapurna Labs
 
-**Last Updated:** 2026-01-30
+**Last Updated:** 2026-02-07
diff --git a/contrib/models/gpt2/src/modeling_gpt2.py b/contrib/models/gpt2/src/modeling_gpt2.py
index 019407b..9130c6f 100644
--- a/contrib/models/gpt2/src/modeling_gpt2.py
+++ b/contrib/models/gpt2/src/modeling_gpt2.py
@@ -422,17 +422,19 @@ def init_model(self, config: GPT2InferenceConfig):
 
         # Language modeling head
         # ✅ CRITICAL: lm_head belongs HERE in base model, not in CausalLM wrapper
+        # Note: GPT2 vocab size (50257) may not be divisible by TP degree, so we use pad=True
+        # We do NOT tie weights here - the state dict conversion handles weight sharing
         self.lm_head = ColumnParallelLinear(
             config.hidden_size,
             config.vocab_size,
             bias=False,  # GPT2 typically doesn't use bias in lm_head
             gather_output=True,
             dtype=config.neuron_config.torch_dtype,
+            pad=True,  # Enable padding for non-divisible vocab sizes
         )
-
-        # Tie embeddings if specified
-        if getattr(config, 'tie_word_embeddings', True):
-            self.lm_head.weight = self.embed_tokens.weight
+        
+        # Note: We don't tie embeddings here because the framework's preshard_hook
+        # expects separate weights; the state dict conversion copies wte.weight into lm_head.weight.
 
     def forward(
         self,
@@ -577,10 +579,12 @@ def convert_hf_to_neuron_state_dict(state_dict, config):
             neuron_state_dict["norm.bias"] = state_dict["ln_f.bias"].clone()
 
         # Language modeling head
+        # GPT2 ties embeddings by default, so lm_head.weight = embed_tokens.weight
+        # We need to provide the weight for the framework's preshard_hook
         if "lm_head.weight" in state_dict:
             neuron_state_dict["lm_head.weight"] = state_dict["lm_head.weight"].clone()
-        elif "wte.weight" in state_dict and getattr(config, 'tie_word_embeddings', True):
-            # If tied embeddings, use the same weight
+        elif "wte.weight" in state_dict:
+            # Checkpoint has no lm_head.weight (tied embeddings): reuse a copy of wte.weight
             neuron_state_dict["lm_head.weight"] = state_dict["wte.weight"].clone()
 
         # Decoder layers (base class strips "transformer." prefix, so keys are h.{i}.*)
diff --git a/contrib/models/gpt2/test/integration/test_model.py b/contrib/models/gpt2/test/integration/test_model.py
index f33a0c8..2de14fe 100755
--- a/contrib/models/gpt2/test/integration/test_model.py
+++ b/contrib/models/gpt2/test/integration/test_model.py
@@ -1,6 +1,13 @@
 #!/usr/bin/env python3
 """
-Integration tests for gpt2 NeuronX implementation.
+Integration tests for GPT-2 NeuronX implementation.
+
+This model uses the GPT-2 architecture with:
+- Conv1D layers (require weight transposition)
+- Vocab size not divisible by common TP degrees (50257, requires padding)
+- Tied embeddings (handled via state dict)
+- LayerNorm (not RMSNorm)
+- Learned positional embeddings (not RoPE)
 """
 
 import pytest
@@ -15,7 +22,7 @@
 # Import from src directory
 import sys
 sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src"))
-from modeling_gpt2 import *
+from modeling_gpt2 import NeuronGPT2ForCausalLM, GPT2InferenceConfig
 
 
 # Test configuration