diff --git a/Cargo.toml b/Cargo.toml index 0429f82..47d0cfc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -92,3 +92,7 @@ path = "examples/batch_processing_example.rs" [[example]] name = "early_stopping_example" path = "examples/early_stopping_example.rs" + +[[example]] +name = "linear_layer_example" +path = "examples/linear_layer_example.rs" diff --git a/README.md b/README.md index dc26443..84571c6 100644 --- a/README.md +++ b/README.md @@ -34,14 +34,14 @@ graph TD ## Features - **LSTM, BiLSTM & GRU Networks** with multi-layer support +- **Linear (Dense) Layer** for classification and output projection - **Complete Training System** with backpropagation through time (BPTT) -- **Multiple Optimizers**: SGD, Adam, RMSprop with comprehensive learning rate scheduling -- **Advanced Learning Rate Scheduling**: 12 different schedulers including OneCycle, Warmup, Cyclical, and Polynomial -- **Early Stopping**: Prevent overfitting with configurable patience and metric monitoring +- **Multiple Optimizers**: SGD, Adam, RMSprop with learning rate scheduling +- **Learning Rate Scheduling**: 12 schedulers including OneCycle, Warmup, Cyclical, Polynomial +- **Early Stopping**: Configurable patience and metric monitoring - **Loss Functions**: MSE, MAE, Cross-entropy with softmax -- **Advanced Dropout**: Input, recurrent, output dropout, variational dropout, and zoneout -- **Batch Processing**: 4-5x training speedup with efficient batch operations -- **Schedule Visualization**: ASCII visualization of learning rate schedules +- **Advanced Dropout**: Input, recurrent, output, variational dropout, and zoneout +- **Batch Processing**: Efficient batch operations - **Model Persistence**: Save/load models in JSON or binary format - **Peephole LSTM variant** for enhanced performance @@ -51,7 +51,7 @@ Add to your `Cargo.toml`: ```toml [dependencies] -rust-lstm = "0.5.0" +rust-lstm = "0.6" ``` ### Basic Usage @@ -181,6 +181,24 @@ let mut gru = GRUNetwork::new(input_size, hidden_size, 
num_layers) let (output, _) = gru.forward(&input, &hidden_state); ``` +### Linear Layer + +```rust +use rust_lstm::layers::linear::LinearLayer; +use rust_lstm::optimizers::Adam; + +// Create linear layer for classification: hidden_size -> num_classes +let mut classifier = LinearLayer::new(hidden_size, num_classes); +let mut optimizer = Adam::new(0.001); + +// Forward pass +let logits = classifier.forward(&lstm_output); + +// Backward pass +let (gradients, input_grad) = classifier.backward(&grad_output); +classifier.update_parameters(&gradients, &mut optimizer, "classifier"); +``` + #### LSTM vs GRU Cell Comparison ```mermaid @@ -261,13 +279,13 @@ LRScheduleVisualizer::print_schedule(poly_scheduler, 0.01, 100, 60, 10); - **OneCycleLR**: One cycle policy for super-convergence - **ReduceLROnPlateau**: Adaptive reduction on validation plateaus - **LinearLR**: Linear interpolation between rates -- **PolynomialLR** ✨: Polynomial decay with configurable power -- **CyclicalLR** ✨: Triangular, triangular2, and exponential range modes -- **WarmupScheduler** ✨: Gradual warmup wrapper for any base scheduler +- **PolynomialLR**: Polynomial decay with configurable power +- **CyclicalLR**: Triangular, triangular2, and exponential range modes +- **WarmupScheduler**: Gradual warmup wrapper for any base scheduler ## Architecture -- **`layers`**: LSTM and GRU cells (standard, peephole, bidirectional) with dropout +- **`layers`**: LSTM cells, GRU cells, Linear (dense) layer, dropout, peephole LSTM, bidirectional LSTM - **`models`**: High-level network architectures (LSTM, BiLSTM, GRU) - **`training`**: Training utilities with automatic train/eval mode switching - **`optimizers`**: SGD, Adam, RMSprop with scheduling @@ -288,7 +306,8 @@ cargo run --example time_series_prediction # Advanced architectures cargo run --example gru_example # GRU vs LSTM comparison cargo run --example bilstm_example # Bidirectional LSTM -cargo run --example dropout_example # Dropout demo +cargo run --example 
dropout_example # Dropout regularization +cargo run --example linear_layer_example # Linear layer for classification # Learning and scheduling cargo run --example learning_rate_scheduling # Basic schedulers @@ -343,36 +362,12 @@ cargo run --example model_inspection cargo test ``` -## Performance Examples - -The library includes comprehensive examples that demonstrate its capabilities: - -### Training with Different Schedulers -Run the learning rate scheduling examples to see different scheduler behaviors: -```bash -cargo run --example learning_rate_scheduling # Compare basic schedulers -cargo run --example advanced_lr_scheduling # Advanced schedulers with ASCII visualization -``` - -### Architecture Comparison -Compare LSTM vs GRU performance: -```bash -cargo run --example gru_example -``` - -### Real-world Applications -Test the library with practical examples: -```bash -cargo run --example stock_prediction # Stock price predictions -cargo run --example weather_prediction # Weather forecasting -cargo run --example text_classification_bilstm # Classification accuracy -``` - -The examples output training metrics, loss values, and predictions that you can analyze or plot with external tools. 
- ## Version History -- **v0.4.0**: Advanced learning rate scheduling with 12 different schedulers, warmup support, cyclical learning rates, polynomial decay, and ASCII visualization +- **v0.6.1**: Fixed text generation in advanced example +- **v0.6.0**: Early stopping support with configurable patience and metric monitoring +- **v0.5.0**: Model persistence (JSON/binary), batch processing +- **v0.4.0**: Advanced learning rate scheduling (12 schedulers), warmup, cyclical LR, visualization - **v0.3.0**: Bidirectional LSTM networks with flexible combine modes - **v0.2.0**: Complete training system with BPTT and comprehensive dropout - **v0.1.0**: Initial LSTM implementation with forward pass diff --git a/examples/linear_layer_example.rs b/examples/linear_layer_example.rs new file mode 100644 index 0000000..760f18e --- /dev/null +++ b/examples/linear_layer_example.rs @@ -0,0 +1,252 @@ +use ndarray::arr2; +use rust_lstm::layers::linear::LinearLayer; +use rust_lstm::optimizers::{SGD, Adam}; +use rust_lstm::models::lstm_network::LSTMNetwork; + +/// Example 1: Basic LinearLayer usage for classification +fn basic_classification_example() { + println!("=== Basic Classification Example ==="); + + // Create a linear layer: 4 input features -> 3 classes + let mut linear = LinearLayer::new(4, 3); + let mut optimizer = SGD::new(0.1); + + // Sample input: batch of 2 samples, each with 4 features + let input = arr2(&[ + [1.0, 0.5], // feature 1 + [0.8, -0.2], // feature 2 + [1.2, 0.9], // feature 3 + [-0.1, 0.3] // feature 4 + ]); // Shape: (4, 2) + + // Target classes (one-hot encoded) + let targets = arr2(&[ + [1.0, 0.0], // class 1 for sample 1, class 2 for sample 2 + [0.0, 1.0], // + [0.0, 0.0] // + ]); // Shape: (3, 2) + + println!("Input shape: {:?}", input.shape()); + println!("Target shape: {:?}", targets.shape()); + + // Training loop + for epoch in 0..10 { + // Forward pass + let output = linear.forward(&input); + + // Simple loss: mean squared error + let loss = (&output 
- &targets).map(|x| x * x).sum() / (output.len() as f64); + + // Backward pass + let grad_output = 2.0 * (&output - &targets) / (output.len() as f64); + let (gradients, _input_grad) = linear.backward(&grad_output); + + // Update parameters + linear.update_parameters(&gradients, &mut optimizer, "classifier"); + + if epoch % 2 == 0 { + println!("Epoch {}: Loss = {:.4}", epoch, loss); + } + } + + // Final prediction + let final_output = linear.forward(&input); + println!("Final output:\n{:.3}", final_output); + println!("Target:\n{:.3}", targets); + println!(); +} + +/// Example 2: LSTM + LinearLayer for sequence classification +fn lstm_with_linear_example() { + println!("=== LSTM + LinearLayer Example ==="); + + // Create LSTM network: 5 input features -> 8 hidden units -> 3 classes + let mut lstm = LSTMNetwork::new(5, 8, 1); + let mut classifier = LinearLayer::new(8, 3); + let mut optimizer = Adam::new(0.001); + + // Sample sequence data: 4 time steps, 5 features, batch size 1 + let sequence = vec![ + arr2(&[[1.0], [0.5], [0.2], [0.8], [0.1]]), // t=0 + arr2(&[[0.9], [0.6], [0.3], [0.7], [0.2]]), // t=1 + arr2(&[[0.8], [0.7], [0.4], [0.6], [0.3]]), // t=2 + arr2(&[[0.7], [0.8], [0.5], [0.5], [0.4]]), // t=3 + ]; + + // Target: classify the entire sequence (shape: 3 classes, 1 sample) + let target = arr2(&[[0.0], [1.0], [0.0]]); // Class 2 + + println!("Sequence length: {}", sequence.len()); + println!("Input features: {}", sequence[0].nrows()); + println!("LSTM hidden size: {}", 8); + println!("Output classes: {}", target.nrows()); + + // Training loop + for epoch in 0..20 { + // LSTM forward pass + let (lstm_outputs, _) = lstm.forward_sequence_with_cache(&sequence); + + // Use the last LSTM output for classification + let last_hidden = &lstm_outputs.last().unwrap().0; + + // Linear layer forward pass + let class_logits = classifier.forward(last_hidden); + + // Loss calculation + let loss = (&class_logits - &target).map(|x| x * x).sum() / (class_logits.len() as 
f64); + + // Backward pass through linear layer + let grad_output = 2.0 * (&class_logits - &target) / (class_logits.len() as f64); + let (linear_grads, _lstm_grad) = classifier.backward(&grad_output); + + // Update linear layer + classifier.update_parameters(&linear_grads, &mut optimizer, "classifier"); + + // Note: In a complete implementation, you would also backpropagate through LSTM + // This example focuses on demonstrating LinearLayer usage + + if epoch % 5 == 0 { + println!("Epoch {}: Loss = {:.4}", epoch, loss); + } + } + + // Final prediction + let (final_lstm_outputs, _) = lstm.forward_sequence_with_cache(&sequence); + let final_hidden = &final_lstm_outputs.last().unwrap().0; + let final_prediction = classifier.forward(final_hidden); + + println!("Final prediction: [{:.3}, {:.3}, {:.3}]", + final_prediction[[0, 0]], final_prediction[[1, 0]], final_prediction[[2, 0]]); + println!("Target: [{:.3}, {:.3}, {:.3}]", + target[[0, 0]], target[[1, 0]], target[[2, 0]]); + println!(); +} + +/// Example 3: Multi-layer perceptron using multiple LinearLayers +fn multilayer_perceptron_example() { + println!("=== Multi-Layer Perceptron Example ==="); + + // Create a 3-layer MLP: 2 -> 4 -> 4 -> 1 + let mut layer1 = LinearLayer::new(2, 4); + let mut layer2 = LinearLayer::new(4, 4); + let mut layer3 = LinearLayer::new(4, 1); + let mut optimizer = Adam::new(0.01); + + // XOR problem dataset + let inputs = arr2(&[ + [0.0, 1.0, 0.0, 1.0], // input 1 + [0.0, 0.0, 1.0, 1.0] // input 2 + ]); // Shape: (2, 4) + + let targets = arr2(&[[0.0, 1.0, 1.0, 0.0]]); // XOR outputs + + println!("Training MLP on XOR problem..."); + println!("Input shape: {:?}", inputs.shape()); + println!("Target shape: {:?}", targets.shape()); + + // Training loop + for epoch in 0..100 { + // Forward pass + let h1 = layer1.forward(&inputs); + let h1_relu = h1.map(|&x| if x > 0.0 { x } else { 0.0 }); // ReLU activation + + let h2 = layer2.forward(&h1_relu); + let h2_relu = h2.map(|&x| if x > 0.0 { x } else 
{ 0.0 }); // ReLU activation + + let output = layer3.forward(&h2_relu); + + // Loss calculation + let loss = (&output - &targets).map(|x| x * x).sum() / (output.len() as f64); + + // Backward pass + let grad_output = 2.0 * (&output - &targets) / (output.len() as f64); + + // Layer 3 backward + let (grad3, grad_h2) = layer3.backward(&grad_output); + + // ReLU backward for h2 + let grad_h2_relu = &grad_h2 * &h2.map(|&x| if x > 0.0 { 1.0 } else { 0.0 }); + + // Layer 2 backward + let (grad2, grad_h1) = layer2.backward(&grad_h2_relu); + + // ReLU backward for h1 + let grad_h1_relu = &grad_h1 * &h1.map(|&x| if x > 0.0 { 1.0 } else { 0.0 }); + + // Layer 1 backward + let (grad1, _) = layer1.backward(&grad_h1_relu); + + // Update all layers + layer1.update_parameters(&grad1, &mut optimizer, "layer1"); + layer2.update_parameters(&grad2, &mut optimizer, "layer2"); + layer3.update_parameters(&grad3, &mut optimizer, "layer3"); + + if epoch % 20 == 0 { + println!("Epoch {}: Loss = {:.4}", epoch, loss); + } + } + + // Final predictions + let h1 = layer1.forward(&inputs); + let h1_relu = h1.map(|&x| if x > 0.0 { x } else { 0.0 }); + let h2 = layer2.forward(&h1_relu); + let h2_relu = h2.map(|&x| if x > 0.0 { x } else { 0.0 }); + let final_output = layer3.forward(&h2_relu); + + println!("Final predictions:"); + for i in 0..4 { + let input_vals = (inputs[[0, i]], inputs[[1, i]]); + let prediction = final_output[[0, i]]; + let target_val = targets[[0, i]]; + println!(" {:?} -> {:.3} (target: {:.1})", input_vals, prediction, target_val); + } + println!(); +} + +/// Example 4: Demonstrating different initialization methods +fn initialization_example() { + println!("=== Initialization Methods Example ==="); + + // Method 1: Default random initialization (Xavier/Glorot) + let layer_random = LinearLayer::new(3, 2); + println!("Random initialization:"); + println!(" Weight range: [{:.3}, {:.3}]", + layer_random.weight.iter().min_by(|a, b| a.partial_cmp(b).unwrap()).unwrap(), + 
layer_random.weight.iter().max_by(|a, b| a.partial_cmp(b).unwrap()).unwrap()); + + // Method 2: Zero initialization + let layer_zeros = LinearLayer::new_zeros(3, 2); + println!("Zero initialization:"); + println!(" All weights: {}", layer_zeros.weight.iter().all(|&x| x == 0.0)); + + // Method 3: Custom initialization + let custom_weights = arr2(&[[1.0, 0.5, -0.2], [0.8, -0.1, 0.3]]); + let custom_bias = arr2(&[[0.1], [-0.05]]); + let layer_custom = LinearLayer::from_weights(custom_weights.clone(), custom_bias.clone()); + println!("Custom initialization:"); + println!(" Custom weights shape: {:?}", layer_custom.weight.shape()); + println!(" Custom bias shape: {:?}", layer_custom.bias.shape()); + + // Show layer information + println!("Layer dimensions: {:?}", layer_custom.dimensions()); + println!("Number of parameters: {}", layer_custom.num_parameters()); + println!(); +} + +fn main() { + println!("LinearLayer Examples"); + println!("===================\n"); + + basic_classification_example(); + lstm_with_linear_example(); + multilayer_perceptron_example(); + initialization_example(); + + println!("All examples completed successfully! 
🎉"); + println!("\nKey takeaways:"); + println!("- LinearLayer enables standard neural network architectures"); + println!("- Works seamlessly with LSTM networks for classification"); + println!("- Supports multiple initialization methods"); + println!("- Integrates with all existing optimizers"); + println!("- Essential for text generation and classification tasks"); +} diff --git a/src/layers/linear.rs b/src/layers/linear.rs new file mode 100644 index 0000000..0af756c --- /dev/null +++ b/src/layers/linear.rs @@ -0,0 +1,255 @@ +use ndarray::Array2; +use ndarray_rand::RandomExt; +use ndarray_rand::rand_distr::Uniform; +use crate::optimizers::Optimizer; + +/// Holds gradients for linear layer parameters during backpropagation +#[derive(Clone, Debug)] +pub struct LinearGradients { + pub weight: Array2<f64>, + pub bias: Array2<f64>, +} + +/// A fully connected (linear/dense) layer for neural networks +/// +/// Performs the transformation: output = input * weight^T + bias +/// where weight has shape (output_size, input_size) and bias has shape (output_size, 1) +#[derive(Clone, Debug)] +pub struct LinearLayer { + pub weight: Array2<f64>, // (output_size, input_size) + pub bias: Array2<f64>, // (output_size, 1) + pub input_size: usize, + pub output_size: usize, + input_cache: Option<Array2<f64>>, // Cache input for backward pass +} + +impl LinearLayer { + /// Create a new linear layer with random initialization + /// + /// # Arguments + /// * `input_size` - Size of input features + /// * `output_size` - Size of output features + /// + /// # Returns + /// * New LinearLayer with Xavier/Glorot initialization + pub fn new(input_size: usize, output_size: usize) -> Self { + // Xavier/Glorot initialization: scale by sqrt(2 / (input_size + output_size)) + let scale = (2.0 / (input_size + output_size) as f64).sqrt(); + let weight_range = scale; + + let weight = Array2::random((output_size, input_size), Uniform::new(-weight_range, weight_range)); + let bias = Array2::zeros((output_size, 1)); + + Self { + weight,
+ bias, + input_size, + output_size, + input_cache: None, + } + } + + /// Create a new linear layer with zero initialization + pub fn new_zeros(input_size: usize, output_size: usize) -> Self { + let weight = Array2::zeros((output_size, input_size)); + let bias = Array2::zeros((output_size, 1)); + + Self { + weight, + bias, + input_size, + output_size, + input_cache: None, + } + } + + /// Create a new linear layer with custom initialization + pub fn from_weights(weight: Array2<f64>, bias: Array2<f64>) -> Self { + let (output_size, input_size) = weight.dim(); + assert_eq!(bias.shape(), &[output_size, 1], "Bias shape must be (output_size, 1)"); + + Self { + weight, + bias, + input_size, + output_size, + input_cache: None, + } + } + + /// Forward pass through the linear layer + /// + /// # Arguments + /// * `input` - Input tensor of shape (input_size, batch_size) + /// + /// # Returns + /// * Output tensor of shape (output_size, batch_size) + pub fn forward(&mut self, input: &Array2<f64>) -> Array2<f64> { + let (input_features, _batch_size) = input.dim(); + assert_eq!(input_features, self.input_size, + "Input size {} doesn't match layer input size {}", + input_features, self.input_size); + + // Cache input for backward pass + self.input_cache = Some(input.clone()); + + // output = weight @ input + bias (bias broadcasts automatically) + &self.weight.dot(input) + &self.bias + } + + /// Backward pass through the linear layer + /// + /// # Arguments + /// * `grad_output` - Gradient w.r.t. output of shape (output_size, batch_size) + /// + /// # Returns + /// * Tuple of (gradients, input_gradient) + /// - gradients: LinearGradients containing weight and bias gradients + /// - input_gradient: Gradient w.r.t.
input of shape (input_size, batch_size) + pub fn backward(&self, grad_output: &Array2<f64>) -> (LinearGradients, Array2<f64>) { + let input = self.input_cache.as_ref().expect("Input cache not found for backward pass"); + let (output_features, batch_size) = grad_output.dim(); + let (input_features, input_batch_size) = input.dim(); + + assert_eq!(output_features, self.output_size, "Gradient output size mismatch"); + assert_eq!(input_features, self.input_size, "Input size mismatch"); + assert_eq!(batch_size, input_batch_size, "Batch size mismatch"); + + // Gradient w.r.t. weight: grad_output @ input^T + let weight_grad = grad_output.dot(&input.t()); + + // Gradient w.r.t. bias: sum over batch dimension, keep as column vector + let bias_grad = grad_output.sum_axis(ndarray::Axis(1)).insert_axis(ndarray::Axis(1)); + + // Gradient w.r.t. input: weight^T @ grad_output + let input_grad = self.weight.t().dot(grad_output); + + let gradients = LinearGradients { + weight: weight_grad, + bias: bias_grad, + }; + + (gradients, input_grad) + } + + /// Update parameters using the provided optimizer + pub fn update_parameters<O: Optimizer>(&mut self, gradients: &LinearGradients, optimizer: &mut O, prefix: &str) { + optimizer.update(&format!("{}_weight", prefix), &mut self.weight, &gradients.weight); + optimizer.update(&format!("{}_bias", prefix), &mut self.bias, &gradients.bias); + } + + /// Initialize zero gradients for accumulation + pub fn zero_gradients(&self) -> LinearGradients { + LinearGradients { + weight: Array2::zeros(self.weight.raw_dim()), + bias: Array2::zeros(self.bias.raw_dim()), + } + } + + /// Get the number of parameters in this layer + pub fn num_parameters(&self) -> usize { + self.weight.len() + self.bias.len() + } + + /// Get layer dimensions + pub fn dimensions(&self) -> (usize, usize) { + (self.input_size, self.output_size) + } + + /// Set the layer to training mode + pub fn train(&mut self) { + // Linear layer has no specific training mode behavior like dropout + } + + /// Set the
layer to evaluation mode + pub fn eval(&mut self) { + // Linear layer has no specific evaluation mode behavior + } +} + +#[cfg(test)] +mod tests { + use super::*; + use ndarray::arr2; + use crate::optimizers::SGD; + + #[test] + fn test_linear_layer_creation() { + let layer = LinearLayer::new(10, 5); + assert_eq!(layer.input_size, 10); + assert_eq!(layer.output_size, 5); + assert_eq!(layer.weight.shape(), &[5, 10]); + assert_eq!(layer.bias.shape(), &[5, 1]); + } + + #[test] + fn test_linear_layer_forward() { + let mut layer = LinearLayer::new_zeros(3, 2); + let input = arr2(&[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]); // (3, 2) + + let output = layer.forward(&input); + assert_eq!(output.shape(), &[2, 2]); // (output_size, batch_size) + + // With zero weights and bias, output should be zero + assert!(output.iter().all(|&x| x == 0.0)); + } + + #[test] + fn test_linear_layer_backward() { + let mut layer = LinearLayer::new(3, 2); + let input = arr2(&[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]); // (3, 2) + let grad_output = arr2(&[[1.0, 1.0], [1.0, 1.0]]); // (2, 2) + + // Forward pass first to cache input + let _output = layer.forward(&input); + + let (gradients, input_grad) = layer.backward(&grad_output); + + assert_eq!(gradients.weight.shape(), &[2, 3]); + assert_eq!(gradients.bias.shape(), &[2, 1]); + assert_eq!(input_grad.shape(), &[3, 2]); + } + + #[test] + fn test_linear_layer_with_optimizer() { + let mut layer = LinearLayer::new(2, 1); + let mut optimizer = SGD::new(0.1); + + let input = arr2(&[[1.0], [2.0]]); // (2, 1) + let target = arr2(&[[3.0]]); // (1, 1) + + // Forward pass + let output = layer.forward(&input); + + // Simple loss gradient (output - target) + let grad_output = &output - &target; + + // Backward pass + let (gradients, _) = layer.backward(&grad_output); + + // Update parameters + layer.update_parameters(&gradients, &mut optimizer, "linear"); + + // Parameters should have changed + assert!(layer.weight.iter().any(|&x| x != 0.0) || layer.bias.iter().any(|&x| x !=
0.0)); + } + + #[test] + fn test_linear_layer_dimensions() { + let layer = LinearLayer::new(128, 10); + assert_eq!(layer.dimensions(), (128, 10)); + assert_eq!(layer.num_parameters(), 128 * 10 + 10); // weights + bias + } + + #[test] + fn test_from_weights() { + let weight = arr2(&[[1.0, 2.0], [3.0, 4.0]]); + let bias = arr2(&[[0.5], [-0.5]]); + + let layer = LinearLayer::from_weights(weight.clone(), bias.clone()); + assert_eq!(layer.weight, weight); + assert_eq!(layer.bias, bias); + assert_eq!(layer.input_size, 2); + assert_eq!(layer.output_size, 2); + } +} diff --git a/src/layers/mod.rs b/src/layers/mod.rs index 6e9a350..888b87a 100644 --- a/src/layers/mod.rs +++ b/src/layers/mod.rs @@ -3,3 +3,4 @@ pub mod peephole_lstm_cell; pub mod gru_cell; pub mod dropout; pub mod bilstm_network; +pub mod linear; diff --git a/src/lib.rs b/src/lib.rs index b8dbeed..58cffdd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -50,6 +50,7 @@ pub use layers::peephole_lstm_cell::PeepholeLSTMCell; pub use layers::gru_cell::{GRUCell, GRUCellGradients, GRUCellCache}; pub use layers::bilstm_network::{BiLSTMNetwork, CombineMode, BiLSTMNetworkCache}; pub use layers::dropout::{Dropout, Zoneout}; +pub use layers::linear::{LinearLayer, LinearGradients}; pub use training::{ LSTMTrainer, ScheduledLSTMTrainer, LSTMBatchTrainer, TrainingConfig, TrainingMetrics, EarlyStoppingConfig, EarlyStoppingMetric, EarlyStopper,