From 60aa2bd583d757de03bbb13900d24e3b2b3a5b46 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Wed, 4 Oct 2017 17:37:56 +0200 Subject: [PATCH 01/23] small improvements: better random initialization, use of stdlib instead of external crates, bugfixes added L2 reularization replaced sigmoid by ReLU activation --- Cargo.toml | 21 +++------------- README.md | 9 +++---- src/lib.rs | 72 +++++++++++++++++++++++++++++++++--------------------- 3 files changed, 52 insertions(+), 50 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0073463..178f8aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,21 +1,8 @@ [package] - name = "nn" -version = "0.1.6" -authors = ["Jack Montgomery "] -repository = "https://github.com/jackm321/RustNN" -documentation = "https://jackm321.github.io/RustNN/doc/nn/" -license = "Apache-2.0" -readme = "README.md" - -description = """ -A multilayer feedforward backpropagation neural network library -""" - -keywords = ["nn", "neural-network", "classifier", "backpropagation", - "machine-learning"] +version = "0.6.0" +authors = ["https://github.com/jackm321/RustNN"] [dependencies] -rand = "0.3.7" -rustc-serialize = "0.3.12" -time = "0.1.24" +rand = "0.3.*" +rustc-serialize = "0.3.*" diff --git a/README.md b/README.md index 5eb76f2..9baa924 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,8 @@ # RustNN -[![Build Status](https://travis-ci.org/jackm321/RustNN.svg?branch=master)](https://travis-ci.org/jackm321/RustNN) - An easy to use neural network library written in Rust. -[Crate](https://crates.io/crates/nn) - -[Documentation](https://jackm321.github.io/RustNN/doc/nn/) +For the documentation take a look at the original library. There is only an additional lambda factor for training. ## Description RustNN is a [feedforward neural network ](http://en.wikipedia.org/wiki/Feedforward_neural_network) @@ -15,6 +11,9 @@ generates fully connected multi-layer artificial neural networks that are trained via [backpropagation](http://en.wikipedia.org/wiki/Backpropagation). Networks are trained using an incremental training mode. +## Fork +This fork adds L2 regularization to the original crate and replaces sigmoid by the relu activation function. Additionally, there are a few minor improvements. + ## XOR example This example creates a neural network with `2` nodes in the input layer, diff --git a/src/lib.rs b/src/lib.rs index 45c2a77..2d97bb5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +//! Modified version, originally from: https://github.com/jackm321/RustNN +//! //! An easy to use neural network library written in Rust. //! //! 
# Description @@ -57,19 +59,20 @@ extern crate rand; extern crate rustc_serialize; -extern crate time; use HaltCondition::{ Epochs, MSE, Timer }; use LearningMode::{ Incremental }; use std::iter::{Zip, Enumerate}; use std::slice; +use std::time::{ Duration, Instant }; use rustc_serialize::json; -use time::{ Duration, PreciseTime }; -use rand::Rng; +//use rand::Rng; +use rand::distributions::{Normal, IndependentSample}; -static DEFAULT_LEARNING_RATE: f64 = 0.3f64; -static DEFAULT_MOMENTUM: f64 = 0f64; -static DEFAULT_EPOCHS: u32 = 1000; +const DEFAULT_LEARNING_RATE: f64 = 0.3f64; +const DEFAULT_LAMBDA: f64 = 0.0f64; +const DEFAULT_MOMENTUM: f64 = 0.0f64; +const DEFAULT_EPOCHS: u32 = 1000; /// Specifies when to stop training the network #[derive(Debug, Copy, Clone)] @@ -95,6 +98,7 @@ pub struct Trainer<'a,'b> { examples: &'b [(Vec, Vec)], rate: f64, momentum: f64, + lambda: f64, log_interval: Option, halt_condition: HaltCondition, learning_mode: LearningMode, @@ -117,10 +121,20 @@ impl<'a,'b> Trainer<'a,'b> { self.rate = rate; self } + + /// Specifies the lambda factor for L2 regularization used when training (default is 0.0) + pub fn lambda(&mut self, lambda: f64) -> &mut Trainer<'a,'b> { + if lambda <= 0f64 { + panic!("the lambda value must be a positive number"); + } + + self.lambda = lambda; + self + } /// Specifies the momentum to be used when training (default is `0.0`) pub fn momentum(&mut self, momentum: f64) -> &mut Trainer<'a,'b> { - if momentum <= 0f64 { + if momentum < 0f64 { panic!("momentum must be positive"); } @@ -175,6 +189,7 @@ impl<'a,'b> Trainer<'a,'b> { self.nn.train_details( self.examples, self.rate, + self.lambda, self.momentum, self.log_interval, self.halt_condition @@ -220,10 +235,11 @@ impl NN { let mut prev_layer_size = first_layer_size; for &layer_size in it { let mut layer: Vec> = Vec::new(); + let normal = Normal::new(0.0, (2.0/prev_layer_size as f64).sqrt()); for _ in 0..layer_size { let mut node: Vec = Vec::new(); for _ in 0..prev_layer_size+1 { - let random_weight: f64 = rng.gen_range(-0.5f64, 0.5f64); + let random_weight: f64 = normal.ind_sample(&mut rng); node.push(random_weight); } node.shrink_to_fit(); @@ -257,6 +273,7 @@ impl NN { examples: examples, rate: DEFAULT_LEARNING_RATE, momentum: DEFAULT_MOMENTUM, + lambda: DEFAULT_LAMBDA, log_interval: None, halt_condition: Epochs(DEFAULT_EPOCHS), learning_mode: Incremental, @@ -275,7 +292,7 @@ impl NN { network } - fn train_details(&mut self, examples: &[(Vec, Vec)], rate: f64, momentum: f64, log_interval: Option, + fn train_details(&mut self, examples: &[(Vec, Vec)], rate: f64, lambda: f64, momentum: f64, log_interval: Option, halt_condition: HaltCondition) -> f64 { // check that input and output sizes are correct @@ -290,16 +307,16 @@ impl NN { } } - self.train_incremental(examples, rate, momentum, log_interval, halt_condition) + self.train_incremental(examples, rate, lambda, momentum, log_interval, halt_condition) } - fn train_incremental(&mut self, examples: &[(Vec, Vec)], rate: f64, momentum: f64, log_interval: Option, + fn train_incremental(&mut self, examples: &[(Vec, Vec)], rate: f64, lambda: f64, momentum: f64, log_interval: Option, halt_condition: HaltCondition) -> f64 { let mut prev_deltas = self.make_weights_tracker(0.0f64); let mut epochs = 0u32; let mut training_error_rate = 0f64; - let start_time = PreciseTime::now(); + let start_time = Instant::now(); loop { @@ -321,8 +338,7 @@ impl NN { if training_error_rate <= target_error { break } }, Timer(duration) => { - let now = PreciseTime::now(); - 
if start_time.to(now) >= duration { break } + if start_time.elapsed() >= duration { break } } } } @@ -333,7 +349,7 @@ impl NN { let results = self.do_run(&inputs); let weight_updates = self.calculate_weight_updates(&results, &targets); training_error_rate += calculate_error(&results, &targets); - self.update_weights(&weight_updates, &mut prev_deltas, rate, momentum) + self.update_weights(&weight_updates, &mut prev_deltas, rate, lambda, momentum) } epochs += 1; @@ -348,7 +364,7 @@ impl NN { for (layer_index, layer) in self.layers.iter().enumerate() { let mut layer_results = Vec::new(); for node in layer.iter() { - layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ) + layer_results.push( relu(modified_dotprod(&node, &results[layer_index])) ) } results.push(layer_results); } @@ -356,7 +372,7 @@ impl NN { } // updates all weights in the network - fn update_weights(&mut self, network_weight_updates: &Vec>>, prev_deltas: &mut Vec>>, rate: f64, momentum: f64) { + fn update_weights(&mut self, network_weight_updates: &Vec>>, prev_deltas: &mut Vec>>, rate: f64, lambda: f64, momentum: f64) { for layer_index in 0..self.layers.len() { let mut layer = &mut self.layers[layer_index]; let layer_weight_updates = &network_weight_updates[layer_index]; @@ -367,7 +383,7 @@ impl NN { let weight_update = node_weight_updates[weight_index]; let prev_delta = prev_deltas[layer_index][node_index][weight_index]; let delta = (rate * weight_update) + (momentum * prev_delta); - node[weight_index] += delta; + node[weight_index] = (1.0 - rate * lambda) * node[weight_index] + delta; prev_deltas[layer_index][node_index][weight_index] = delta; } } @@ -382,32 +398,32 @@ impl NN { let layers = &self.layers; let network_results = &results[1..]; // skip the input layer let mut next_layer_nodes: Option<&Vec>> = None; - + for (layer_index, (layer_nodes, layer_results)) in iter_zip_enum(layers, network_results).rev() { let prev_layer_results = &results[layer_index]; let mut layer_errors = Vec::new(); let mut layer_weight_updates = Vec::new(); - - + + for (node_index, (node, &result)) in iter_zip_enum(layer_nodes, layer_results) { let mut node_weight_updates = Vec::new(); - let mut node_error; - + let node_error; + // calculate error for this node if layer_index == layers.len() - 1 { - node_error = result * (1f64 - result) * (targets[node_index] - result); + node_error = (if result > 0.0f64 { 1.0f64 } else { 0.0f64 }) * (targets[node_index] - result); //derivative of activation function appears here } else { let mut sum = 0f64; let next_layer_errors = &network_errors[network_errors.len() - 1]; for (next_node, &next_node_error_data) in next_layer_nodes.unwrap().iter().zip((next_layer_errors).iter()) { sum += next_node[node_index+1] * next_node_error_data; // +1 because the 0th weight is the threshold } - node_error = result * (1f64 - result) * sum; + node_error = (if result > 0.0f64 { 1.0f64 } else { 0.0f64 }) * sum; //derivative of activation function appears here } // calculate weight updates for this node for weight_index in 0..node.len() { - let mut prev_layer_result; + let prev_layer_result; if weight_index == 0 { prev_layer_result = 1f64; // threshold } else { @@ -459,8 +475,8 @@ fn modified_dotprod(node: &Vec, values: &Vec) -> f64 { total } -fn sigmoid(y: f64) -> f64 { - 1f64 / (1f64 + (-y).exp()) +fn relu(y: f64) -> f64 { + y.max(0.0) //below 0 the output ist 0, above it is output=input (linear) } From 28d3ff51d402c6d106ce11674672aa4e796925ee Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Wed, 4 Oct 
2017 17:50:18 +0200 Subject: [PATCH 02/23] bugfix --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 2d97bb5..bf879fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -124,7 +124,7 @@ impl<'a,'b> Trainer<'a,'b> { /// Specifies the lambda factor for L2 regularization used when training (default is 0.0) pub fn lambda(&mut self, lambda: f64) -> &mut Trainer<'a,'b> { - if lambda <= 0f64 { + if lambda < 0f64 { panic!("the lambda value must be a positive number"); } From 2e3f45d91b12dfce7d9027d9185e14eb2e2994ae Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 00:00:19 +0200 Subject: [PATCH 03/23] replace ReLU by PELU to fix dead gradients --- .travis.yml | 1 - README.md | 2 +- src/lib.rs | 33 +++++++++++++++++++++++++++------ tests/xor.rs | 3 +-- 4 files changed, 29 insertions(+), 10 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 613564f..0000000 --- a/.travis.yml +++ /dev/null @@ -1 +0,0 @@ -language: rust \ No newline at end of file diff --git a/README.md b/README.md index 9baa924..c50f546 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ are trained via [backpropagation](http://en.wikipedia.org/wiki/Backpropagation). Networks are trained using an incremental training mode. ## Fork -This fork adds L2 regularization to the original crate and replaces sigmoid by the relu activation function. Additionally, there are a few minor improvements. +This fork adds L2 regularization to the original crate and replaces sigmoid by the PELU activation function. Additionally, there are a few minor improvements. ## XOR example diff --git a/src/lib.rs b/src/lib.rs index bf879fa..0d0317f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,6 +74,10 @@ const DEFAULT_LAMBDA: f64 = 0.0f64; const DEFAULT_MOMENTUM: f64 = 0.0f64; const DEFAULT_EPOCHS: u32 = 1000; +const PELU_FACTOR_A: f64 = 1.0f64; +const PELU_FACTOR_B: f64 = 1.0f64; + + /// Specifies when to stop training the network #[derive(Debug, Copy, Clone)] pub enum HaltCondition { @@ -364,7 +368,8 @@ impl NN { for (layer_index, layer) in self.layers.iter().enumerate() { let mut layer_results = Vec::new(); for node in layer.iter() { - layer_results.push( relu(modified_dotprod(&node, &results[layer_index])) ) + layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ) //pelu + //layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ) //sigmoid } results.push(layer_results); } @@ -388,7 +393,6 @@ impl NN { } } } - } // calculates all weight updates by backpropagation @@ -411,14 +415,18 @@ impl NN { // calculate error for this node if layer_index == layers.len() - 1 { - node_error = (if result > 0.0f64 { 1.0f64 } else { 0.0f64 }) * (targets[node_index] - result); //derivative of activation function appears here + let act_deriv = if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }; //pelu + //let act_deriv = result * (1.0 - result); //sigmoid + node_error = act_deriv * (targets[node_index] - result); } else { let mut sum = 0f64; let next_layer_errors = &network_errors[network_errors.len() - 1]; for (next_node, &next_node_error_data) in next_layer_nodes.unwrap().iter().zip((next_layer_errors).iter()) { sum += next_node[node_index+1] * next_node_error_data; // +1 because the 0th weight is the threshold } - node_error = (if result > 0.0f64 { 1.0f64 } else { 0.0f64 }) * sum; //derivative of activation function appears here + let act_deriv = if 
result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }; //pelu + //let act_deriv = result * (1.0 - result); //sigmoid + node_error = act_deriv * sum; } // calculate weight updates for this node @@ -475,8 +483,21 @@ fn modified_dotprod(node: &Vec, values: &Vec) -> f64 { total } -fn relu(y: f64) -> f64 { - y.max(0.0) //below 0 the output ist 0, above it is output=input (linear) +#[allow(dead_code)] +fn sigmoid(y: f64) -> f64 { + 1f64 / (1f64 + (-y).exp()) +} + +#[allow(dead_code)] +fn pelu(y: f64) -> f64 { + if y < 0.0 //PELU activation + { + PELU_FACTOR_A * ((y / PELU_FACTOR_B).exp() - 1.0) + } + else + { + (PELU_FACTOR_A / PELU_FACTOR_B) * y + } } diff --git a/tests/xor.rs b/tests/xor.rs index 4ef51eb..c93110f 100644 --- a/tests/xor.rs +++ b/tests/xor.rs @@ -1,5 +1,4 @@ extern crate nn; -extern crate time; use nn::{NN, HaltCondition, LearningMode}; @@ -15,7 +14,7 @@ fn xor_4layers() { // create a new neural network let mut net1 = NN::new(&[2,3,3,1]); - + // train the network net1.train(&examples) .log_interval(Some(1000)) From 2616bc394fc8a872cbdaae13d95767b3c01dc39a Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 01:39:24 +0200 Subject: [PATCH 04/23] little change to initialization --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 0d0317f..55e600e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -239,7 +239,7 @@ impl NN { let mut prev_layer_size = first_layer_size; for &layer_size in it { let mut layer: Vec> = Vec::new(); - let normal = Normal::new(0.0, (2.0/prev_layer_size as f64).sqrt()); + let normal = Normal::new(0.0, (9.0/prev_layer_size as f64).sqrt()); for _ in 0..layer_size { let mut node: Vec = Vec::new(); for _ in 0..prev_layer_size+1 { From 1e5e4008c03ebaa7356fa81863e9cccb6c66a9be Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 02:00:12 +0200 Subject: [PATCH 05/23] provide both Sigmoid and PELU --- README.md | 3 ++- src/lib.rs | 38 ++++++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index c50f546..f81b244 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,8 @@ are trained via [backpropagation](http://en.wikipedia.org/wiki/Backpropagation). Networks are trained using an incremental training mode. ## Fork -This fork adds L2 regularization to the original crate and replaces sigmoid by the PELU activation function. Additionally, there are a few minor improvements. +This fork adds L2 regularization and PELU activation to the original crate. Additionally, there are a few minor improvements. +Lambda can be set just like the learning rate. The activation function gets set in NN::new as second parameter. ## XOR example diff --git a/src/lib.rs b/src/lib.rs index 55e600e..162aa68 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -78,6 +78,15 @@ const PELU_FACTOR_A: f64 = 1.0f64; const PELU_FACTOR_B: f64 = 1.0f64; +/// Specifies the activation function +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum Activation { + /// Sigmoid activation + Sigmoid, + /// PELU activation + PELU, +} + /// Specifies when to stop training the network #[derive(Debug, Copy, Clone)] pub enum HaltCondition { @@ -207,16 +216,17 @@ impl<'a,'b> Trainer<'a,'b> { pub struct NN { layers: Vec>>, num_inputs: u32, + activation: u32, } impl NN { - - /// Each number in the `layers_sizes` parameter specifies a + /// Each number in the `layers_sizes` parameter specifies a /// layer in the network. 
The number itself is the number of nodes in that /// layer. The first number is the input layer, the last /// number is the output layer, and all numbers between the first and /// last are hidden layers. There must be at least two layers in the network. - pub fn new(layers_sizes: &[u32]) -> NN { + /// The activation function can be Sigmoid or PELU. + pub fn new(layers_sizes: &[u32], activation: Activation) -> NN { let mut rng = rand::thread_rng(); if layers_sizes.len() < 2 { @@ -254,7 +264,7 @@ impl NN { prev_layer_size = layer_size; } layers.shrink_to_fit(); - NN { layers: layers, num_inputs: first_layer_size } + NN { layers: layers, num_inputs: first_layer_size, activation: if activation == Activation::Sigmoid { 0 } else { 1 } } } /// Runs the network on an input and returns a vector of the results. @@ -368,8 +378,10 @@ impl NN { for (layer_index, layer) in self.layers.iter().enumerate() { let mut layer_results = Vec::new(); for node in layer.iter() { - layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ) //pelu - //layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ) //sigmoid + match self.activation { + 0 => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), //sigmoid + _ => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu + } } results.push(layer_results); } @@ -415,8 +427,10 @@ impl NN { // calculate error for this node if layer_index == layers.len() - 1 { - let act_deriv = if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }; //pelu - //let act_deriv = result * (1.0 - result); //sigmoid + let act_deriv = match self.activation { + 0 => result * (1.0 - result), //sigmoid + _ => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu + }; node_error = act_deriv * (targets[node_index] - result); } else { let mut sum = 0f64; @@ -424,8 +438,10 @@ impl NN { for (next_node, &next_node_error_data) in next_layer_nodes.unwrap().iter().zip((next_layer_errors).iter()) { sum += next_node[node_index+1] * next_node_error_data; // +1 because the 0th weight is the threshold } - let act_deriv = if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }; //pelu - //let act_deriv = result * (1.0 - result); //sigmoid + let act_deriv = match self.activation { + 0 => result * (1.0 - result), //sigmoid + _ => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu + }; node_error = act_deriv * sum; } @@ -483,12 +499,10 @@ fn modified_dotprod(node: &Vec, values: &Vec) -> f64 { total } -#[allow(dead_code)] fn sigmoid(y: f64) -> f64 { 1f64 / (1f64 + (-y).exp()) } -#[allow(dead_code)] fn pelu(y: f64) -> f64 { if y < 0.0 //PELU activation { From ced89e1ff73956fe00a018c670456839d77bdb90 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 02:53:01 +0200 Subject: [PATCH 06/23] replace PELU by SELU --- README.md | 2 +- src/lib.rs | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index f81b244..079e2ff 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ are trained via [backpropagation](http://en.wikipedia.org/wiki/Backpropagation). Networks are trained using an incremental training mode. ## Fork -This fork adds L2 regularization and PELU activation to the original crate. Additionally, there are a few minor improvements. 
+This fork adds L2 regularization and SELU activation to the original crate. Additionally, there are a few minor improvements. Lambda can be set just like the learning rate. The activation function gets set in NN::new as second parameter. ## XOR example diff --git a/src/lib.rs b/src/lib.rs index 162aa68..f1a3e51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,8 +74,9 @@ const DEFAULT_LAMBDA: f64 = 0.0f64; const DEFAULT_MOMENTUM: f64 = 0.0f64; const DEFAULT_EPOCHS: u32 = 1000; -const PELU_FACTOR_A: f64 = 1.0f64; -const PELU_FACTOR_B: f64 = 1.0f64; +//values for a (0,1) distribution (so (-1, 1) interval) +const SELU_FACTOR_A: f64 = 1.0507f64; //greater than 1, lambda in https://arxiv.org/pdf/1706.02515.pdf +const SELU_FACTOR_B: f64 = 1.6733f64; //alpha in https://arxiv.org/pdf/1706.02515.pdf /// Specifies the activation function @@ -83,8 +84,8 @@ const PELU_FACTOR_B: f64 = 1.0f64; pub enum Activation { /// Sigmoid activation Sigmoid, - /// PELU activation - PELU, + /// SELU activation + SELU, } /// Specifies when to stop training the network @@ -225,7 +226,7 @@ impl NN { /// layer. The first number is the input layer, the last /// number is the output layer, and all numbers between the first and /// last are hidden layers. There must be at least two layers in the network. - /// The activation function can be Sigmoid or PELU. + /// The activation function can be Sigmoid or SELU. Important: SELU optimized for (-1,1) interval pub fn new(layers_sizes: &[u32], activation: Activation) -> NN { let mut rng = rand::thread_rng(); @@ -380,7 +381,7 @@ impl NN { for node in layer.iter() { match self.activation { 0 => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), //sigmoid - _ => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu + _ => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu } } results.push(layer_results); @@ -429,7 +430,7 @@ impl NN { if layer_index == layers.len() - 1 { let act_deriv = match self.activation { 0 => result * (1.0 - result), //sigmoid - _ => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu + _ => if result > 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_B }, //selu }; node_error = act_deriv * (targets[node_index] - result); } else { @@ -440,7 +441,7 @@ impl NN { } let act_deriv = match self.activation { 0 => result * (1.0 - result), //sigmoid - _ => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu + _ => if result > 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_B }, //selu }; node_error = act_deriv * sum; } @@ -503,14 +504,14 @@ fn sigmoid(y: f64) -> f64 { 1f64 / (1f64 + (-y).exp()) } -fn pelu(y: f64) -> f64 { - if y < 0.0 //PELU activation +fn selu(y: f64) -> f64 { + SELU_FACTOR_A * if y <= 0.0 //SELU activation { - PELU_FACTOR_A * ((y / PELU_FACTOR_B).exp() - 1.0) + SELU_FACTOR_B * y.exp() - SELU_FACTOR_B } else { - (PELU_FACTOR_A / PELU_FACTOR_B) * y + y } } From 83ce74fe3bde83dc59a04ec17b1f73d2536623f0 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 10:59:54 +0200 Subject: [PATCH 07/23] MSRA initialization --- src/lib.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f1a3e51..e07a3d0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -250,12 +250,19 @@ impl NN { let mut prev_layer_size = first_layer_size; for &layer_size in it { let mut layer: Vec> = Vec::new(); - let 
normal = Normal::new(0.0, (9.0/prev_layer_size as f64).sqrt()); + let normal = Normal::new(0.0, (1.0/prev_layer_size as f64).sqrt()); for _ in 0..layer_size { let mut node: Vec = Vec::new(); - for _ in 0..prev_layer_size+1 { - let random_weight: f64 = normal.ind_sample(&mut rng); - node.push(random_weight); + for i in 0..prev_layer_size+1 { + if i == 0 //threshold aka bias + { + node.push(0.0); + } + else + { + let random_weight: f64 = normal.ind_sample(&mut rng); + node.push(random_weight); + } } node.shrink_to_fit(); layer.push(node) From 694dc42747f5e6df83c995ad92adf0b0cb317c7b Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 11:04:04 +0200 Subject: [PATCH 08/23] MSRA initialization --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index e07a3d0..025caf0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -250,7 +250,7 @@ impl NN { let mut prev_layer_size = first_layer_size; for &layer_size in it { let mut layer: Vec> = Vec::new(); - let normal = Normal::new(0.0, (1.0/prev_layer_size as f64).sqrt()); + let normal = Normal::new(0.0, (2.0/prev_layer_size as f64).sqrt()); for _ in 0..layer_size { let mut node: Vec = Vec::new(); for i in 0..prev_layer_size+1 { From 90df92351c9e3c4e542cb24cf2471df013a08673 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 11:10:06 +0200 Subject: [PATCH 09/23] SELU deviation fix --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 025caf0..11cbbf8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -437,7 +437,7 @@ impl NN { if layer_index == layers.len() - 1 { let act_deriv = match self.activation { 0 => result * (1.0 - result), //sigmoid - _ => if result > 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_B }, //selu + _ => if result > 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu }; node_error = act_deriv * (targets[node_index] - result); } else { @@ -448,7 +448,7 @@ impl NN { } let act_deriv = match self.activation { 0 => result * (1.0 - result), //sigmoid - _ => if result > 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_B }, //selu + _ => if result > 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu }; node_error = act_deriv * sum; } From fdfd563cddaebab4d4ea12b994f30d5964d3e337 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 12:50:39 +0200 Subject: [PATCH 10/23] fixes, PELU and LReLU added --- src/lib.rs | 77 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 11cbbf8..ce7877c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -69,14 +69,22 @@ use rustc_serialize::json; //use rand::Rng; use rand::distributions::{Normal, IndependentSample}; -const DEFAULT_LEARNING_RATE: f64 = 0.3f64; -const DEFAULT_LAMBDA: f64 = 0.0f64; -const DEFAULT_MOMENTUM: f64 = 0.0f64; -const DEFAULT_EPOCHS: u32 = 1000; +const DEFAULT_LEARNING_RATE:f64 = 0.3; +const DEFAULT_LAMBDA:f64 = 0.0; +const DEFAULT_MOMENTUM:f64 = 0.0; +const DEFAULT_EPOCHS:u32 = 1000; -//values for a (0,1) distribution (so (-1, 1) interval) -const SELU_FACTOR_A: f64 = 1.0507f64; //greater than 1, lambda in https://arxiv.org/pdf/1706.02515.pdf -const SELU_FACTOR_B: f64 = 1.6733f64; //alpha in https://arxiv.org/pdf/1706.02515.pdf +//values for a (0,1) distribution (so (-1, 1) interval in standard deviation) +//const SELU_FACTOR_A:f64 = 1.0507; //greater than 1, lambda in https://arxiv.org/pdf/1706.02515.pdf 
+//const SELU_FACTOR_B:f64 = 1.6733; //alpha in https://arxiv.org/pdf/1706.02515.pdf +//values for a (0,2) distribution (so (-2, 2) interval in standard deviation) +const SELU_FACTOR_A:f64 = 1.06071; //greater than 1, lambda in https://arxiv.org/pdf/1706.02515.pdf +const SELU_FACTOR_B:f64 = 1.97126; //alpha in https://arxiv.org/pdf/1706.02515.pdf + +const PELU_FACTOR_A:f64 = 2.0; +const PELU_FACTOR_B:f64 = 10.0; + +const LRELU_FACTOR:f64 = 0.33; /// Specifies the activation function @@ -86,6 +94,10 @@ pub enum Activation { Sigmoid, /// SELU activation SELU, + /// PELU activation + PELU, + /// Leaky ReLU activation + LRELU, } /// Specifies when to stop training the network @@ -226,7 +238,8 @@ impl NN { /// layer. The first number is the input layer, the last /// number is the output layer, and all numbers between the first and /// last are hidden layers. There must be at least two layers in the network. - /// The activation function can be Sigmoid or SELU. Important: SELU optimized for (-1,1) interval + /// The activation function can be Sigmoid, SELU, PELU or LRELU. + /// Important: Take care of inputs/outputs for the individual activation functions! pub fn new(layers_sizes: &[u32], activation: Activation) -> NN { let mut rng = rand::thread_rng(); @@ -250,7 +263,7 @@ impl NN { let mut prev_layer_size = first_layer_size; for &layer_size in it { let mut layer: Vec> = Vec::new(); - let normal = Normal::new(0.0, (2.0/prev_layer_size as f64).sqrt()); + let normal = Normal::new(0.0, (1.0 / prev_layer_size as f64).sqrt()); //2.0 / prev for _ in 0..layer_size { let mut node: Vec = Vec::new(); for i in 0..prev_layer_size+1 { @@ -272,7 +285,13 @@ impl NN { prev_layer_size = layer_size; } layers.shrink_to_fit(); - NN { layers: layers, num_inputs: first_layer_size, activation: if activation == Activation::Sigmoid { 0 } else { 1 } } + let act = match activation { + Activation::Sigmoid => 0, + Activation::SELU => 1, + Activation::PELU => 2, + Activation::LRELU => 3, + }; + NN { layers: layers, num_inputs: first_layer_size, activation: act } } /// Runs the network on an input and returns a vector of the results. 
@@ -388,7 +407,9 @@ impl NN { for node in layer.iter() { match self.activation { 0 => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), //sigmoid - _ => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu + 1 => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu + 2 => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu + _ => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu } } results.push(layer_results); @@ -437,7 +458,9 @@ impl NN { if layer_index == layers.len() - 1 { let act_deriv = match self.activation { 0 => result * (1.0 - result), //sigmoid - _ => if result > 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu + 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu + 2 => if result >= 0.0f64 { SELU_FACTOR_A / SELU_FACTOR_B } else { (result + SELU_FACTOR_A) * SELU_FACTOR_B }, //pelu + _ => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu }; node_error = act_deriv * (targets[node_index] - result); } else { @@ -448,7 +471,9 @@ impl NN { } let act_deriv = match self.activation { 0 => result * (1.0 - result), //sigmoid - _ => if result > 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu + 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu + 2 => if result >= 0.0f64 { SELU_FACTOR_A / SELU_FACTOR_B } else { (result + SELU_FACTOR_A) * SELU_FACTOR_B }, //pelu + _ => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu }; node_error = act_deriv * sum; } @@ -511,8 +536,8 @@ fn sigmoid(y: f64) -> f64 { 1f64 / (1f64 + (-y).exp()) } -fn selu(y: f64) -> f64 { - SELU_FACTOR_A * if y <= 0.0 //SELU activation +fn selu(y: f64) -> f64 { //SELU activation + SELU_FACTOR_A * if y < 0.0 { SELU_FACTOR_B * y.exp() - SELU_FACTOR_B } @@ -522,6 +547,28 @@ fn selu(y: f64) -> f64 { } } +fn pelu(y: f64) -> f64 { //PELU activation + if y < 0.0 + { + SELU_FACTOR_A * (y / SELU_FACTOR_B).exp() - SELU_FACTOR_A + } + else + { + (PELU_FACTOR_A / PELU_FACTOR_B) * y + } +} + +fn lrelu(y: f64) -> f64 { //LRELU activation + if y < 0.0 + { + LRELU_FACTOR * y + } + else + { + y + } +} + // takes two arrays and enumerates the iterator produced by zipping each of // their iterators together From b911eaca4e035e3f5cb9e86171c8afaf518a0e3b Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 12:56:21 +0200 Subject: [PATCH 11/23] readme change for previous commit --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 079e2ff..2b4c55e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ An easy to use neural network library written in Rust. -For the documentation take a look at the original library. There is only an additional lambda factor for training. +For the documentation take a look at the original library or generate it using "cargo doc". ## Description RustNN is a [feedforward neural network ](http://en.wikipedia.org/wiki/Feedforward_neural_network) @@ -12,7 +12,7 @@ are trained via [backpropagation](http://en.wikipedia.org/wiki/Backpropagation). Networks are trained using an incremental training mode. ## Fork -This fork adds L2 regularization and SELU activation to the original crate. Additionally, there are a few minor improvements. +This fork adds L2 regularization and several activation functions to the original crate. 
Additionally, there are a few minor improvements. Lambda can be set just like the learning rate. The activation function gets set in NN::new as second parameter. ## XOR example From ff45185bb8bf1705e9770b59949f063e823246da Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 13:51:48 +0200 Subject: [PATCH 12/23] split hidden and output activation --- README.md | 2 +- src/lib.rs | 43 ++++++++++++++++++++++++++++++++----------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 2b4c55e..4fe1694 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Networks are trained using an incremental training mode. ## Fork This fork adds L2 regularization and several activation functions to the original crate. Additionally, there are a few minor improvements. -Lambda can be set just like the learning rate. The activation function gets set in NN::new as second parameter. +Lambda can be set just like the learning rate. The activation functions for hidden and output gets set in NN::new as second and third parameter respectively. ## XOR example diff --git a/src/lib.rs b/src/lib.rs index ce7877c..f24704e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -229,7 +229,8 @@ impl<'a,'b> Trainer<'a,'b> { pub struct NN { layers: Vec>>, num_inputs: u32, - activation: u32, + hid_act: u32, + out_act: u32, } impl NN { @@ -240,7 +241,7 @@ impl NN { /// last are hidden layers. There must be at least two layers in the network. /// The activation function can be Sigmoid, SELU, PELU or LRELU. /// Important: Take care of inputs/outputs for the individual activation functions! - pub fn new(layers_sizes: &[u32], activation: Activation) -> NN { + pub fn new(layers_sizes: &[u32], hidden_activation: Activation, output_activation: Activation) -> NN { let mut rng = rand::thread_rng(); if layers_sizes.len() < 2 { @@ -285,13 +286,21 @@ impl NN { prev_layer_size = layer_size; } layers.shrink_to_fit(); - let act = match activation { + + //set activation functions + let hid_act = match hidden_activation { Activation::Sigmoid => 0, Activation::SELU => 1, Activation::PELU => 2, Activation::LRELU => 3, }; - NN { layers: layers, num_inputs: first_layer_size, activation: act } + let out_act = match output_activation { + Activation::Sigmoid => 0, + Activation::SELU => 1, + Activation::PELU => 2, + Activation::LRELU => 3, + }; + NN { layers: layers, num_inputs: first_layer_size, hid_act: hid_act, out_act: out_act } } /// Runs the network on an input and returns a vector of the results. 
@@ -405,11 +414,23 @@ impl NN { for (layer_index, layer) in self.layers.iter().enumerate() { let mut layer_results = Vec::new(); for node in layer.iter() { - match self.activation { - 0 => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), //sigmoid - 1 => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu - 2 => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu - _ => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu + if layer_index == self.layers.len()-1 //output layer + { + match self.out_act { + 0 => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), //sigmoid + 1 => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu + 2 => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu + _ => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu + } + } + else + { + match self.hid_act { + 0 => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), //sigmoid + 1 => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu + 2 => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu + _ => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu + } } } results.push(layer_results); @@ -456,7 +477,7 @@ impl NN { // calculate error for this node if layer_index == layers.len() - 1 { - let act_deriv = match self.activation { + let act_deriv = match self.out_act { //output activation 0 => result * (1.0 - result), //sigmoid 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu 2 => if result >= 0.0f64 { SELU_FACTOR_A / SELU_FACTOR_B } else { (result + SELU_FACTOR_A) * SELU_FACTOR_B }, //pelu @@ -469,7 +490,7 @@ impl NN { for (next_node, &next_node_error_data) in next_layer_nodes.unwrap().iter().zip((next_layer_errors).iter()) { sum += next_node[node_index+1] * next_node_error_data; // +1 because the 0th weight is the threshold } - let act_deriv = match self.activation { + let act_deriv = match self.hid_act { //hidden activation 0 => result * (1.0 - result), //sigmoid 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu 2 => if result >= 0.0f64 { SELU_FACTOR_A / SELU_FACTOR_B } else { (result + SELU_FACTOR_A) * SELU_FACTOR_B }, //pelu From 53704957b8ccbc9ceab9843f6d0117b77c89bf4a Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 13:53:11 +0200 Subject: [PATCH 13/23] fit test to updates NN class --- tests/xor.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/xor.rs b/tests/xor.rs index c93110f..86fbf0f 100644 --- a/tests/xor.rs +++ b/tests/xor.rs @@ -1,6 +1,6 @@ extern crate nn; -use nn::{NN, HaltCondition, LearningMode}; +use nn::{NN, HaltCondition, LearningMode, Activation}; #[test] fn xor_4layers() { @@ -13,7 +13,7 @@ fn xor_4layers() { ]; // create a new neural network - let mut net1 = NN::new(&[2,3,3,1]); + let mut net1 = NN::new(&[2,3,3,1], Activation::LRELU, Activation::Sigmoid); // train the network net1.train(&examples) From 1e01e73caa9dd285732ab65182371d89fdba4cb5 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 5 Oct 2017 15:21:55 +0200 Subject: [PATCH 14/23] PELU fix --- src/lib.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f24704e..f6653b3 100644 --- 
a/src/lib.rs +++ b/src/lib.rs @@ -81,8 +81,8 @@ const DEFAULT_EPOCHS:u32 = 1000; const SELU_FACTOR_A:f64 = 1.06071; //greater than 1, lambda in https://arxiv.org/pdf/1706.02515.pdf const SELU_FACTOR_B:f64 = 1.97126; //alpha in https://arxiv.org/pdf/1706.02515.pdf -const PELU_FACTOR_A:f64 = 2.0; -const PELU_FACTOR_B:f64 = 10.0; +const PELU_FACTOR_A:f64 = 1.5; +const PELU_FACTOR_B:f64 = 2.0; const LRELU_FACTOR:f64 = 0.33; @@ -480,7 +480,7 @@ impl NN { let act_deriv = match self.out_act { //output activation 0 => result * (1.0 - result), //sigmoid 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu - 2 => if result >= 0.0f64 { SELU_FACTOR_A / SELU_FACTOR_B } else { (result + SELU_FACTOR_A) * SELU_FACTOR_B }, //pelu + 2 => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu _ => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu }; node_error = act_deriv * (targets[node_index] - result); @@ -493,7 +493,7 @@ impl NN { let act_deriv = match self.hid_act { //hidden activation 0 => result * (1.0 - result), //sigmoid 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu - 2 => if result >= 0.0f64 { SELU_FACTOR_A / SELU_FACTOR_B } else { (result + SELU_FACTOR_A) * SELU_FACTOR_B }, //pelu + 2 => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu _ => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu }; node_error = act_deriv * sum; From 761c4099866d2605cb945aa6b4b44b205a0d4e08 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Sat, 7 Oct 2017 16:52:54 +0200 Subject: [PATCH 15/23] linear activation, possibly good for output, added --- src/lib.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f6653b3..8d9cfd5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -98,6 +98,8 @@ pub enum Activation { PELU, /// Leaky ReLU activation LRELU, + /// Linear activation + Linear, } /// Specifies when to stop training the network @@ -293,12 +295,14 @@ impl NN { Activation::SELU => 1, Activation::PELU => 2, Activation::LRELU => 3, + Activation::Linear => 4, }; let out_act = match output_activation { Activation::Sigmoid => 0, Activation::SELU => 1, Activation::PELU => 2, Activation::LRELU => 3, + Activation::Linear => 4, }; NN { layers: layers, num_inputs: first_layer_size, hid_act: hid_act, out_act: out_act } } @@ -420,7 +424,8 @@ impl NN { 0 => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), //sigmoid 1 => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu 2 => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu - _ => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu + 3 => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu + _ => layer_results.push( linear(modified_dotprod(&node, &results[layer_index])) ), //linear } } else @@ -429,7 +434,8 @@ impl NN { 0 => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), //sigmoid 1 => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu 2 => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu - _ => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu + 3 => layer_results.push( lrelu(modified_dotprod(&node, 
&results[layer_index])) ), //lrelu + _ => layer_results.push( linear(modified_dotprod(&node, &results[layer_index])) ), //linear } } } @@ -440,7 +446,7 @@ impl NN { // updates all weights in the network fn update_weights(&mut self, network_weight_updates: &Vec>>, prev_deltas: &mut Vec>>, rate: f64, lambda: f64, momentum: f64) { - for layer_index in 0..self.layers.len() { + for layer_index in 0..self.layers.len() { let mut layer = &mut self.layers[layer_index]; let layer_weight_updates = &network_weight_updates[layer_index]; for node_index in 0..layer.len() { @@ -481,7 +487,8 @@ impl NN { 0 => result * (1.0 - result), //sigmoid 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu 2 => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu - _ => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu + 3 => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu + _ => 1.0, //linear }; node_error = act_deriv * (targets[node_index] - result); } else { @@ -494,7 +501,8 @@ impl NN { 0 => result * (1.0 - result), //sigmoid 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu 2 => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu - _ => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu + 3 => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu + _ => 1.0, //linear }; node_error = act_deriv * sum; } @@ -590,6 +598,10 @@ fn lrelu(y: f64) -> f64 { //LRELU activation } } +fn linear(y: f64) -> f64 { //linear activation + y +} + // takes two arrays and enumerates the iterator produced by zipping each of // their iterators together From da8af9dec08d9afcff694c97810d54e2019245b5 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Sun, 8 Oct 2017 02:51:15 +0200 Subject: [PATCH 16/23] fix PELU function --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 8d9cfd5..51e26c7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -579,7 +579,7 @@ fn selu(y: f64) -> f64 { //SELU activation fn pelu(y: f64) -> f64 { //PELU activation if y < 0.0 { - SELU_FACTOR_A * (y / SELU_FACTOR_B).exp() - SELU_FACTOR_A + PELU_FACTOR_A * (y / PELU_FACTOR_B).exp() - PELU_FACTOR_A } else { From c5c211249b6f28f6f8d814ed35599db455dedf4f Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Tue, 10 Oct 2017 14:17:44 +0200 Subject: [PATCH 17/23] Tanh activation added, changed initialization --- src/lib.rs | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 51e26c7..708e540 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -100,6 +100,8 @@ pub enum Activation { LRELU, /// Linear activation Linear, + /// Tanh activation + Tanh, } /// Specifies when to stop training the network @@ -241,7 +243,7 @@ impl NN { /// layer. The first number is the input layer, the last /// number is the output layer, and all numbers between the first and /// last are hidden layers. There must be at least two layers in the network. - /// The activation function can be Sigmoid, SELU, PELU or LRELU. + /// The activation function can be Sigmoid, SELU, PELU, LRELU or Tanh. /// Important: Take care of inputs/outputs for the individual activation functions! 
pub fn new(layers_sizes: &[u32], hidden_activation: Activation, output_activation: Activation) -> NN { let mut rng = rand::thread_rng(); @@ -266,7 +268,9 @@ impl NN { let mut prev_layer_size = first_layer_size; for &layer_size in it { let mut layer: Vec> = Vec::new(); - let normal = Normal::new(0.0, (1.0 / prev_layer_size as f64).sqrt()); //2.0 / prev + let mut init_std_scale = 2.0; //He init + if hidden_activation == Activation::SELU { init_std_scale = 1.0; } //MSRA / Xavier init + let normal = Normal::new(0.0, (init_std_scale / prev_layer_size as f64).sqrt()); for _ in 0..layer_size { let mut node: Vec = Vec::new(); for i in 0..prev_layer_size+1 { @@ -296,6 +300,7 @@ impl NN { Activation::PELU => 2, Activation::LRELU => 3, Activation::Linear => 4, + Activation::Tanh => 5, }; let out_act = match output_activation { Activation::Sigmoid => 0, @@ -303,6 +308,7 @@ impl NN { Activation::PELU => 2, Activation::LRELU => 3, Activation::Linear => 4, + Activation::Tanh => 5, }; NN { layers: layers, num_inputs: first_layer_size, hid_act: hid_act, out_act: out_act } } @@ -425,7 +431,8 @@ impl NN { 1 => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu 2 => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu 3 => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu - _ => layer_results.push( linear(modified_dotprod(&node, &results[layer_index])) ), //linear + 4 => layer_results.push( linear(modified_dotprod(&node, &results[layer_index])) ), //linear + _ => layer_results.push( tanh(modified_dotprod(&node, &results[layer_index])) ), //tanh } } else @@ -435,7 +442,8 @@ impl NN { 1 => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu 2 => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu 3 => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu - _ => layer_results.push( linear(modified_dotprod(&node, &results[layer_index])) ), //linear + 4 => layer_results.push( linear(modified_dotprod(&node, &results[layer_index])) ), //linear + _ => layer_results.push( tanh(modified_dotprod(&node, &results[layer_index])) ), //tanh } } } @@ -488,7 +496,8 @@ impl NN { 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu 2 => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu 3 => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu - _ => 1.0, //linear + 4 => 1.0, //linear + _ => 1.0 - result * result, //tanh }; node_error = act_deriv * (targets[node_index] - result); } else { @@ -502,7 +511,8 @@ impl NN { 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu 2 => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu 3 => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu - _ => 1.0, //linear + 4 => 1.0, //linear + _ => 1.0 - result * result, //tanh }; node_error = act_deriv * sum; } @@ -602,6 +612,10 @@ fn linear(y: f64) -> f64 { //linear activation y } +fn tanh(y: f64) -> f64 { //tanh activation + y.tanh() +} + // takes two arrays and enumerates the iterator produced by zipping each of // their iterators together From 8c5064c028fd257f2b8c824253b56787e4101783 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Wed, 8 Nov 2017 13:27:38 +0100 Subject: [PATCH 18/23] cleanup, cosmetic, updated json library --- 
Cargo.toml | 27 ++++++++++++++- src/lib.rs | 97 +++++++++++++++++++++------------------------------- tests/xor.rs | 2 +- 3 files changed, 66 insertions(+), 60 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 178f8aa..d0155be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,4 +5,29 @@ authors = ["https://github.com/jackm321/RustNN"] [dependencies] rand = "0.3.*" -rustc-serialize = "0.3.*" +serde = "1.*" +serde_derive = "1.*" +serde_json = "1.*" + + + +[profile.dev] +opt-level = 3 +lto = true +panic = "unwind" +debug = true +debug-assertions = true + +[profile.test] +opt-level = 0 +lto = false +panic = "unwind" +debug = true +debug-assertions = true + +[profile.release] +opt-level = 3 +lto = true +panic = "unwind" +debug = false +debug-assertions = false diff --git a/src/lib.rs b/src/lib.rs index 708e540..8c635dd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,7 +21,7 @@ //! for more details. //! //! ```rust -//! use nn::{NN, HaltCondition}; +//! use nn::{NN, HaltCondition, Activation}; //! //! // create examples of the XOR function //! // the network is trained on tuples of vectors where the first vector @@ -37,7 +37,7 @@ //! // that specifies the number of layers and the number of nodes in each layer //! // in this case we have an input layer with 2 nodes, one hidden layer //! // with 3 nodes and the output layer has 1 node -//! let mut net = NN::new(&[2, 3, 1]); +//! let mut net = NN::new(&[2, 3, 1], Activation::PELU, Activation::Sigmoid); //! //! // train the network on the examples of the XOR function //! // all methods seen here are optional except go() which must be called to begin training @@ -57,16 +57,18 @@ //! } //! ``` +#[macro_use] +extern crate serde_derive; + +extern crate serde; +extern crate serde_json; extern crate rand; -extern crate rustc_serialize; use HaltCondition::{ Epochs, MSE, Timer }; use LearningMode::{ Incremental }; use std::iter::{Zip, Enumerate}; use std::slice; use std::time::{ Duration, Instant }; -use rustc_serialize::json; -//use rand::Rng; use rand::distributions::{Normal, IndependentSample}; const DEFAULT_LEARNING_RATE:f64 = 0.3; @@ -88,7 +90,7 @@ const LRELU_FACTOR:f64 = 0.33; /// Specifies the activation function -#[derive(Debug, Copy, Clone, PartialEq)] +#[derive(Debug, Copy, Clone, PartialEq, Deserialize, Serialize)] pub enum Activation { /// Sigmoid activation Sigmoid, @@ -229,12 +231,12 @@ impl<'a,'b> Trainer<'a,'b> { } /// Neural network -#[derive(Debug, Clone, RustcDecodable, RustcEncodable)] +#[derive(Debug, Clone, Deserialize, Serialize)] pub struct NN { layers: Vec>>, num_inputs: u32, - hid_act: u32, - out_act: u32, + hid_act: Activation, + out_act: Activation, } impl NN { @@ -243,8 +245,9 @@ impl NN { /// layer. The first number is the input layer, the last /// number is the output layer, and all numbers between the first and /// last are hidden layers. There must be at least two layers in the network. - /// The activation function can be Sigmoid, SELU, PELU, LRELU or Tanh. + /// The activation function can be Sigmoid, SELU, PELU, LRELU, Linear or Tanh. /// Important: Take care of inputs/outputs for the individual activation functions! + /// Do not use linear activation for hidden layers. 
pub fn new(layers_sizes: &[u32], hidden_activation: Activation, output_activation: Activation) -> NN { let mut rng = rand::thread_rng(); @@ -293,24 +296,7 @@ impl NN { } layers.shrink_to_fit(); - //set activation functions - let hid_act = match hidden_activation { - Activation::Sigmoid => 0, - Activation::SELU => 1, - Activation::PELU => 2, - Activation::LRELU => 3, - Activation::Linear => 4, - Activation::Tanh => 5, - }; - let out_act = match output_activation { - Activation::Sigmoid => 0, - Activation::SELU => 1, - Activation::PELU => 2, - Activation::LRELU => 3, - Activation::Linear => 4, - Activation::Tanh => 5, - }; - NN { layers: layers, num_inputs: first_layer_size, hid_act: hid_act, out_act: out_act } + NN { layers: layers, num_inputs: first_layer_size, hid_act: hidden_activation, out_act: output_activation } } /// Runs the network on an input and returns a vector of the results. @@ -343,12 +329,12 @@ impl NN { /// Encodes the network as a JSON string. pub fn to_json(&self) -> String { - json::encode(self).ok().expect("encoding JSON failed") + serde_json::to_string(self).ok().expect("encoding JSON failed") } /// Builds a new network from a JSON string. pub fn from_json(encoded: &str) -> NN { - let network: NN = json::decode(encoded).ok().expect("decoding JSON failed"); + let network:NN = serde_json::from_str(encoded).ok().expect("decoding JSON failed"); network } @@ -424,27 +410,22 @@ impl NN { for (layer_index, layer) in self.layers.iter().enumerate() { let mut layer_results = Vec::new(); for node in layer.iter() { + let activation; if layer_index == self.layers.len()-1 //output layer { - match self.out_act { - 0 => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), //sigmoid - 1 => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu - 2 => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu - 3 => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu - 4 => layer_results.push( linear(modified_dotprod(&node, &results[layer_index])) ), //linear - _ => layer_results.push( tanh(modified_dotprod(&node, &results[layer_index])) ), //tanh - } + activation = self.out_act; } else { - match self.hid_act { - 0 => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), //sigmoid - 1 => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), //selu - 2 => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), //pelu - 3 => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), //lrelu - 4 => layer_results.push( linear(modified_dotprod(&node, &results[layer_index])) ), //linear - _ => layer_results.push( tanh(modified_dotprod(&node, &results[layer_index])) ), //tanh - } + activation = self.hid_act; + } + match activation { + Activation::Sigmoid => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ), + Activation::SELU => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ), + Activation::PELU => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ), + Activation::LRELU => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ), + Activation::Linear => layer_results.push( linear(modified_dotprod(&node, &results[layer_index])) ), + Activation::Tanh => layer_results.push( tanh(modified_dotprod(&node, &results[layer_index])) ), } } results.push(layer_results); @@ -492,12 +473,12 @@ impl NN { // calculate 
error for this node if layer_index == layers.len() - 1 { let act_deriv = match self.out_act { //output activation - 0 => result * (1.0 - result), //sigmoid - 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu - 2 => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu - 3 => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu - 4 => 1.0, //linear - _ => 1.0 - result * result, //tanh + Activation::Sigmoid => result * (1.0 - result), + Activation::SELU => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, + Activation::PELU => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, + Activation::LRELU => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, + Activation::Linear => 1.0, + Activation::Tanh => 1.0 - result * result, }; node_error = act_deriv * (targets[node_index] - result); } else { @@ -507,12 +488,12 @@ impl NN { sum += next_node[node_index+1] * next_node_error_data; // +1 because the 0th weight is the threshold } let act_deriv = match self.hid_act { //hidden activation - 0 => result * (1.0 - result), //sigmoid - 1 => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, //selu - 2 => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, //pelu - 3 => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, //lrelu - 4 => 1.0, //linear - _ => 1.0 - result * result, //tanh + Activation::Sigmoid => result * (1.0 - result), + Activation::SELU => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B }, + Activation::PELU => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B }, + Activation::LRELU => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR }, + Activation::Linear => 1.0, + Activation::Tanh => 1.0 - result * result, }; node_error = act_deriv * sum; } diff --git a/tests/xor.rs b/tests/xor.rs index 86fbf0f..e6fa5f1 100644 --- a/tests/xor.rs +++ b/tests/xor.rs @@ -13,7 +13,7 @@ fn xor_4layers() { ]; // create a new neural network - let mut net1 = NN::new(&[2,3,3,1], Activation::LRELU, Activation::Sigmoid); + let mut net1 = NN::new(&[2,3,3,1], Activation::PELU, Activation::Sigmoid); // train the network net1.train(&examples) From ded6672e142645f56ece9fe16cd5c9c962747a45 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Wed, 8 Nov 2017 13:30:59 +0100 Subject: [PATCH 19/23] fit readme to NN changes --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4fe1694..395e7c3 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ given examples. See the documentation for the `NN` and `Trainer` structs for more details. 
```rust -use nn::{NN, HaltCondition}; +use nn::{NN, HaltCondition, Activation}; // create examples of the XOR function // the network is trained on tuples of vectors where the first vector @@ -43,7 +43,7 @@ let examples = [ // that specifies the number of layers and the number of nodes in each layer // in this case we have an input layer with 2 nodes, one hidden layer // with 3 nodes and the output layer has 1 node -let mut net = NN::new(&[2, 3, 1]); +let mut net = NN::new(&[2, 3, 1], Activation::PELU, Activation::Sigmoid); // train the network on the examples of the XOR function // all methods seen here are optional except go() which must be called to begin training From 78d0eacfc00580081c4da88fa9a54cd9c05b6885 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Wed, 8 Nov 2017 13:43:18 +0100 Subject: [PATCH 20/23] readd trayis --- travis.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 travis.yml diff --git a/travis.yml b/travis.yml new file mode 100644 index 0000000..613564f --- /dev/null +++ b/travis.yml @@ -0,0 +1 @@ +language: rust \ No newline at end of file From e311f1d8f149c2fd13b2765b826aa0a170c899e1 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Wed, 8 Nov 2017 13:50:17 +0100 Subject: [PATCH 21/23] trayis rename --- travis.yml => .travis.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename travis.yml => .travis.yml (100%) diff --git a/travis.yml b/.travis.yml similarity index 100% rename from travis.yml rename to .travis.yml From db7d95eefc4fa149f7adbd92e6ca2575c0788c99 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 21 Dec 2017 16:19:09 +0100 Subject: [PATCH 22/23] remove warnings, that came up after rustc update --- src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 8c635dd..4da4578 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,11 +64,11 @@ extern crate serde; extern crate serde_json; extern crate rand; -use HaltCondition::{ Epochs, MSE, Timer }; -use LearningMode::{ Incremental }; +use HaltCondition::{Epochs, MSE, Timer}; +use LearningMode::{Incremental}; use std::iter::{Zip, Enumerate}; use std::slice; -use std::time::{ Duration, Instant }; +use std::time::{Duration, Instant}; use rand::distributions::{Normal, IndependentSample}; const DEFAULT_LEARNING_RATE:f64 = 0.3; @@ -436,10 +436,10 @@ impl NN { // updates all weights in the network fn update_weights(&mut self, network_weight_updates: &Vec>>, prev_deltas: &mut Vec>>, rate: f64, lambda: f64, momentum: f64) { for layer_index in 0..self.layers.len() { - let mut layer = &mut self.layers[layer_index]; + let layer = &mut self.layers[layer_index]; let layer_weight_updates = &network_weight_updates[layer_index]; for node_index in 0..layer.len() { - let mut node = &mut layer[node_index]; + let node = &mut layer[node_index]; let node_weight_updates = &layer_weight_updates[node_index]; for weight_index in 0..node.len() { let weight_update = node_weight_updates[weight_index]; From 4af774e92e2d357cdfd400fa620e4a69a3ab8f98 Mon Sep 17 00:00:00 2001 From: FlixCoder Date: Thu, 21 Dec 2017 16:19:23 +0100 Subject: [PATCH 23/23] add example --- examples/selector.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 examples/selector.rs diff --git a/examples/selector.rs b/examples/selector.rs new file mode 100644 index 0000000..d1f8289 --- /dev/null +++ b/examples/selector.rs @@ -0,0 +1,43 @@ +extern crate nn; + +use nn::{NN, HaltCondition, Activation}; + +const ACTIONS:u32 = 10; + + +fn main() +{ + // 
create examples of the selector function: input i maps to a one-hot vector with a 1.0 at index i
+	let mut examples = Vec::new();
+	for i in 0..ACTIONS
+	{
+		let mut result = Vec::new();
+		for j in 0..ACTIONS
+		{
+			if j == i { result.push(1.0); }
+			else { result.push(0.0); }
+		}
+		let example = (vec![i as f64], result);
+		examples.push(example);
+	}
+
+	// create a new neural network
+	let mut nn = NN::new(&[1, 10, ACTIONS], Activation::PELU, Activation::Sigmoid);
+
+	// train the network
+	nn.train(&examples)
+		.log_interval(Some(1000))
+		.halt_condition( HaltCondition::MSE(0.01) )
+		.rate(0.025)
+		.momentum(0.5)
+		.lambda(0.00005)
+		.go();
+
+	// print results of the trained network
+	for &(ref input, _) in examples.iter()
+	{
+		let result = nn.run(input);
+		let print:Vec<String> = result.iter().map(|x:&f64| { format!("{:4.2}", (*x * 100.0).round() / 100.0) }).collect();
+		println!("{:1.0} -> {:?}", input[0], print);
+	}
+}
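
The selector example exercises the new two-activation constructor, the `lambda` trainer option, and the PELU/Sigmoid split between hidden and output layers. One piece of the forked API it does not touch is JSON serialization, which patch 18 moved from rustc-serialize to serde. The sketch below is not part of the patch series; it is a minimal usage illustration with untuned, illustrative hyperparameter values. It trains the XOR network from the README and round-trips it through `to_json`/`from_json` before running the restored copy:

```rust
extern crate nn;

use nn::{NN, HaltCondition, Activation};

fn main() {
    // XOR training data, as in the README example
    let examples = [
        (vec![0f64, 0f64], vec![0f64]),
        (vec![0f64, 1f64], vec![1f64]),
        (vec![1f64, 0f64], vec![1f64]),
        (vec![1f64, 1f64], vec![0f64]),
    ];

    // hidden layer uses PELU, output layer uses sigmoid since the targets lie in [0, 1]
    let mut net = NN::new(&[2, 3, 1], Activation::PELU, Activation::Sigmoid);

    // lambda is the L2 regularization factor introduced by this fork; 0.0 disables it
    net.train(&examples)
        .halt_condition(HaltCondition::Epochs(10000))
        .rate(0.1)
        .momentum(0.5)
        .lambda(0.00005)
        .go();

    // serialize the trained network to JSON and rebuild it from the string
    let encoded = net.to_json();
    let restored = NN::from_json(&encoded);

    // the restored copy produces the same outputs as the original network
    for &(ref input, ref target) in examples.iter() {
        let output = restored.run(input);
        println!("{:?} -> {:?} (expected {:?})", input, output, target);
    }
}
```

Because the `Activation` values are plain enum fields of `NN` after patch 18, they are serialized together with the weights, so `from_json` restores the activation functions as well and the loaded network needs no further configuration.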