diff --git a/Cargo.toml b/Cargo.toml
index 0073463..d0155be 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,21 +1,33 @@
 [package]
-
 name = "nn"
-version = "0.1.6"
-authors = ["Jack Montgomery "]
-repository = "https://github.com/jackm321/RustNN"
-documentation = "https://jackm321.github.io/RustNN/doc/nn/"
-license = "Apache-2.0"
-readme = "README.md"
+version = "0.6.0"
+authors = ["https://github.com/jackm321/RustNN"]

-description = """
-A multilayer feedforward backpropagation neural network library
-"""
+[dependencies]
+rand = "0.3.*"
+serde = "1.*"
+serde_derive = "1.*"
+serde_json = "1.*"

-keywords = ["nn", "neural-network", "classifier", "backpropagation", "machine-learning"]
-[dependencies]
-rand = "0.3.7"
-rustc-serialize = "0.3.12"
-time = "0.1.24"
+
+[profile.dev]
+opt-level = 3
+lto = true
+panic = "unwind"
+debug = true
+debug-assertions = true
+
+[profile.test]
+opt-level = 0
+lto = false
+panic = "unwind"
+debug = true
+debug-assertions = true
+
+[profile.release]
+opt-level = 3
+lto = true
+panic = "unwind"
+debug = false
+debug-assertions = false
diff --git a/README.md b/README.md
index 5eb76f2..395e7c3 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,8 @@
 # RustNN

-[![Build Status](https://travis-ci.org/jackm321/RustNN.svg?branch=master)](https://travis-ci.org/jackm321/RustNN)
-
 An easy to use neural network library written in Rust.

-[Crate](https://crates.io/crates/nn)
-
-[Documentation](https://jackm321.github.io/RustNN/doc/nn/)
+For the documentation, take a look at the original library or generate it locally with `cargo doc`.

 ## Description
 RustNN is a [feedforward neural network ](http://en.wikipedia.org/wiki/Feedforward_neural_network)
@@ -15,6 +11,10 @@ generates fully connected multi-layer artificial neural networks that
 are trained via [backpropagation](http://en.wikipedia.org/wiki/Backpropagation). Networks
 are trained using an incremental training mode.

+## Fork
+This fork adds L2 regularization and several activation functions to the original crate, along with a few minor improvements.
+Lambda is set just like the learning rate, and the hidden- and output-layer activation functions are passed to `NN::new` as its second and third parameters, respectively.
+
 ## XOR example

 This example creates a neural network with `2` nodes in the input layer,
@@ -27,7 +27,7 @@ given examples. See the documentation for the `NN` and `Trainer` structs
 for more details.

 ```rust
-use nn::{NN, HaltCondition};
+use nn::{NN, HaltCondition, Activation};

 // create examples of the XOR function
 // the network is trained on tuples of vectors where the first vector
@@ -43,7 +43,7 @@ let examples = [
 // that specifies the number of layers and the number of nodes in each layer
 // in this case we have an input layer with 2 nodes, one hidden layer
 // with 3 nodes and the output layer has 1 node
-let mut net = NN::new(&[2, 3, 1]);
+let mut net = NN::new(&[2, 3, 1], Activation::PELU, Activation::Sigmoid);

 // train the network on the examples of the XOR function
 // all methods seen here are optional except go() which must be called to begin training
diff --git a/examples/selector.rs b/examples/selector.rs
new file mode 100644
index 0000000..d1f8289
--- /dev/null
+++ b/examples/selector.rs
@@ -0,0 +1,43 @@
+extern crate nn;
+
+use nn::{NN, HaltCondition, Activation};
+
+const ACTIONS:u32 = 10;
+
+
+fn main()
+{
+    // create training examples: each input index maps to a one-hot output vector
+    let mut examples = Vec::new();
+    for i in 0..ACTIONS
+    {
+        let mut result = Vec::new();
+        for j in 0..ACTIONS
+        {
+            if j == i { result.push(1.0); }
+            else { result.push(0.0); }
+        }
+        let example = (vec![i as f64], result);
+        examples.push(example);
+    }
+
+    // create a new neural network
+    let mut nn = NN::new(&[1, 10, ACTIONS], Activation::PELU, Activation::Sigmoid);
+
+    // train the network
+    nn.train(&examples)
+        .log_interval(Some(1000))
+        .halt_condition( HaltCondition::MSE(0.01) )
+        .rate(0.025)
+        .momentum(0.5)
+        .lambda(0.00005)
+        .go();
+
+    // print results of the trained network
+    for &(ref input, _) in examples.iter()
+    {
+        let result = nn.run(input);
+        let print:Vec<String> = result.iter().map(|x:&f64| { format!("{:4.2}", (*x * 100.0).round() / 100.0) }).collect();
+        println!("{:1.0} -> {:?}", input[0], print);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 45c2a77..4da4578 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,5 @@
+//! Modified version, originally from: https://github.com/jackm321/RustNN
+//!
 //! An easy to use neural network library written in Rust.
 //!
 //! # Description
@@ -19,7 +21,7 @@
 //! for more details.
 //!
 //! ```rust
-//! use nn::{NN, HaltCondition};
+//! use nn::{NN, HaltCondition, Activation};
 //!
 //! // create examples of the XOR function
 //! // the network is trained on tuples of vectors where the first vector
@@ -35,7 +37,7 @@
 //! // that specifies the number of layers and the number of nodes in each layer
 //! // in this case we have an input layer with 2 nodes, one hidden layer
 //! // with 3 nodes and the output layer has 1 node
-//! let mut net = NN::new(&[2, 3, 1]);
+//! let mut net = NN::new(&[2, 3, 1], Activation::PELU, Activation::Sigmoid);
 //!
 //! // train the network on the examples of the XOR function
 //! // all methods seen here are optional except go() which must be called to begin training
@@ -55,21 +57,54 @@
 //! }
 //! ```

+#[macro_use]
+extern crate serde_derive;
+
+extern crate serde;
+extern crate serde_json;
 extern crate rand;
-extern crate rustc_serialize;
-extern crate time;

-use HaltCondition::{ Epochs, MSE, Timer };
-use LearningMode::{ Incremental };
+use HaltCondition::{Epochs, MSE, Timer};
+use LearningMode::{Incremental};
 use std::iter::{Zip, Enumerate};
 use std::slice;
-use rustc_serialize::json;
-use time::{ Duration, PreciseTime };
-use rand::Rng;
-
-static DEFAULT_LEARNING_RATE: f64 = 0.3f64;
-static DEFAULT_MOMENTUM: f64 = 0f64;
-static DEFAULT_EPOCHS: u32 = 1000;
+use std::time::{Duration, Instant};
+use rand::distributions::{Normal, IndependentSample};
+
+const DEFAULT_LEARNING_RATE:f64 = 0.3;
+const DEFAULT_LAMBDA:f64 = 0.0;
+const DEFAULT_MOMENTUM:f64 = 0.0;
+const DEFAULT_EPOCHS:u32 = 1000;
+
+//values for a (0,1) distribution (so (-1, 1) interval in standard deviation)
+//const SELU_FACTOR_A:f64 = 1.0507; //greater than 1, lambda in https://arxiv.org/pdf/1706.02515.pdf
+//const SELU_FACTOR_B:f64 = 1.6733; //alpha in https://arxiv.org/pdf/1706.02515.pdf
+//values for a (0,2) distribution (so (-2, 2) interval in standard deviation)
+const SELU_FACTOR_A:f64 = 1.06071; //greater than 1, lambda in https://arxiv.org/pdf/1706.02515.pdf
+const SELU_FACTOR_B:f64 = 1.97126; //alpha in https://arxiv.org/pdf/1706.02515.pdf
+
+const PELU_FACTOR_A:f64 = 1.5;
+const PELU_FACTOR_B:f64 = 2.0;
+
+const LRELU_FACTOR:f64 = 0.33;
+
+
+/// Specifies the activation function
+#[derive(Debug, Copy, Clone, PartialEq, Deserialize, Serialize)]
+pub enum Activation {
+    /// Sigmoid activation
+    Sigmoid,
+    /// SELU activation
+    SELU,
+    /// PELU activation
+    PELU,
+    /// Leaky ReLU activation
+    LRELU,
+    /// Linear activation
+    Linear,
+    /// Tanh activation
+    Tanh,
+}

 /// Specifies when to stop training the network
 #[derive(Debug, Copy, Clone)]
@@ -95,6 +130,7 @@ pub struct Trainer<'a,'b> {
     examples: &'b [(Vec<f64>, Vec<f64>)],
     rate: f64,
     momentum: f64,
+    lambda: f64,
     log_interval: Option<u32>,
     halt_condition: HaltCondition,
     learning_mode: LearningMode,
@@ -117,10 +153,20 @@ impl<'a,'b> Trainer<'a,'b> {
         self.rate = rate;
         self
     }
+
+    /// Specifies the lambda factor for L2 regularization used when training (default is `0.0`)
+    pub fn lambda(&mut self, lambda: f64) -> &mut Trainer<'a,'b> {
+        if lambda < 0f64 {
+            panic!("the lambda value must not be negative");
+        }
+
+        self.lambda = lambda;
+        self
+    }

     /// Specifies the momentum to be used when training (default is `0.0`)
     pub fn momentum(&mut self, momentum: f64) -> &mut Trainer<'a,'b> {
-        if momentum <= 0f64 {
+        if momentum < 0f64 {
             panic!("momentum must be positive");
         }

@@ -175,6 +221,7 @@ impl<'a,'b> Trainer<'a,'b> {
         self.nn.train_details(
             self.examples,
             self.rate,
+            self.lambda,
             self.momentum,
             self.log_interval,
             self.halt_condition
@@ -184,20 +231,24 @@
 }

 /// Neural network
-#[derive(Debug, Clone, RustcDecodable, RustcEncodable)]
+#[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct NN {
     layers: Vec<Vec<Vec<f64>>>,
     num_inputs: u32,
+    hid_act: Activation,
+    out_act: Activation,
 }

 impl NN {
-
     /// Each number in the `layers_sizes` parameter specifies a
     /// layer in the network. The number itself is the number of nodes in that
     /// layer. The first number is the input layer, the last
     /// number is the output layer, and all numbers between the first and
     /// last are hidden layers. There must be at least two layers in the network.
-    pub fn new(layers_sizes: &[u32]) -> NN {
+    /// The hidden and output activation functions can each be Sigmoid, SELU, PELU, LRELU, Linear or Tanh.
+    /// Important: make sure your inputs and target outputs fit the value range of the chosen activation functions!
+    /// Do not use the linear activation for hidden layers.
+    pub fn new(layers_sizes: &[u32], hidden_activation: Activation, output_activation: Activation) -> NN {
         let mut rng = rand::thread_rng();

         if layers_sizes.len() < 2 {
@@ -220,11 +271,21 @@ impl NN {
         let mut prev_layer_size = first_layer_size;
         for &layer_size in it {
             let mut layer: Vec<Vec<f64>> = Vec::new();
+            let mut init_std_scale = 2.0; //He init
+            if hidden_activation == Activation::SELU { init_std_scale = 1.0; } //Xavier init for SELU
+            let normal = Normal::new(0.0, (init_std_scale / prev_layer_size as f64).sqrt());
             for _ in 0..layer_size {
                 let mut node: Vec<f64> = Vec::new();
-                for _ in 0..prev_layer_size+1 {
-                    let random_weight: f64 = rng.gen_range(-0.5f64, 0.5f64);
-                    node.push(random_weight);
+                for i in 0..prev_layer_size+1 {
+                    if i == 0 //threshold aka bias
+                    {
+                        node.push(0.0);
+                    }
+                    else
+                    {
+                        let random_weight: f64 = normal.ind_sample(&mut rng);
+                        node.push(random_weight);
+                    }
                 }
                 node.shrink_to_fit();
                 layer.push(node)
@@ -234,7 +295,8 @@ impl NN {
             prev_layer_size = layer_size;
         }
         layers.shrink_to_fit();
-        NN { layers: layers, num_inputs: first_layer_size }
+
+        NN { layers: layers, num_inputs: first_layer_size, hid_act: hidden_activation, out_act: output_activation }
     }

     /// Runs the network on an input and returns a vector of the results.
@@ -257,6 +319,7 @@ impl NN {
             examples: examples,
             rate: DEFAULT_LEARNING_RATE,
             momentum: DEFAULT_MOMENTUM,
+            lambda: DEFAULT_LAMBDA,
             log_interval: None,
             halt_condition: Epochs(DEFAULT_EPOCHS),
             learning_mode: Incremental,
@@ -266,16 +329,16 @@ impl NN {

     /// Encodes the network as a JSON string.
     pub fn to_json(&self) -> String {
-        json::encode(self).ok().expect("encoding JSON failed")
+        serde_json::to_string(self).ok().expect("encoding JSON failed")
     }

     /// Builds a new network from a JSON string.
     pub fn from_json(encoded: &str) -> NN {
-        let network: NN = json::decode(encoded).ok().expect("decoding JSON failed");
+        let network:NN = serde_json::from_str(encoded).ok().expect("decoding JSON failed");
         network
     }

-    fn train_details(&mut self, examples: &[(Vec<f64>, Vec<f64>)], rate: f64, momentum: f64, log_interval: Option<u32>,
+    fn train_details(&mut self, examples: &[(Vec<f64>, Vec<f64>)], rate: f64, lambda: f64, momentum: f64, log_interval: Option<u32>,
                     halt_condition: HaltCondition) -> f64 {

         // check that input and output sizes are correct
@@ -290,16 +353,16 @@ impl NN {
             }
         }

-        self.train_incremental(examples, rate, momentum, log_interval, halt_condition)
+        self.train_incremental(examples, rate, lambda, momentum, log_interval, halt_condition)
     }

-    fn train_incremental(&mut self, examples: &[(Vec<f64>, Vec<f64>)], rate: f64, momentum: f64, log_interval: Option<u32>,
+    fn train_incremental(&mut self, examples: &[(Vec<f64>, Vec<f64>)], rate: f64, lambda: f64, momentum: f64, log_interval: Option<u32>,
                     halt_condition: HaltCondition) -> f64 {

         let mut prev_deltas = self.make_weights_tracker(0.0f64);
         let mut epochs = 0u32;
         let mut training_error_rate = 0f64;
-        let start_time = PreciseTime::now();
+        let start_time = Instant::now();

         loop {

@@ -321,8 +384,7 @@ impl NN {
                         if training_error_rate <= target_error { break }
                     },
                     Timer(duration) => {
-                        let now = PreciseTime::now();
-                        if start_time.to(now) >= duration { break }
+                        if start_time.elapsed() >= duration { break }
                     }
                 }
             }
@@ -333,7 +395,7 @@ impl NN {
                 let results = self.do_run(&inputs);
                 let weight_updates = self.calculate_weight_updates(&results, &targets);
                 training_error_rate += calculate_error(&results, &targets);
-                self.update_weights(&weight_updates, &mut prev_deltas, rate, momentum)
+                self.update_weights(&weight_updates, &mut prev_deltas, rate, lambda, momentum)
             }

             epochs += 1;
@@ -348,7 +410,23 @@ impl NN {
         for (layer_index, layer) in self.layers.iter().enumerate() {
             let mut layer_results = Vec::new();
             for node in layer.iter() {
-                layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) )
+                let activation;
+                if layer_index == self.layers.len()-1 //output layer
+                {
+                    activation = self.out_act;
+                }
+                else
+                {
+                    activation = self.hid_act;
+                }
+                match activation {
+                    Activation::Sigmoid => layer_results.push( sigmoid(modified_dotprod(&node, &results[layer_index])) ),
+                    Activation::SELU => layer_results.push( selu(modified_dotprod(&node, &results[layer_index])) ),
+                    Activation::PELU => layer_results.push( pelu(modified_dotprod(&node, &results[layer_index])) ),
+                    Activation::LRELU => layer_results.push( lrelu(modified_dotprod(&node, &results[layer_index])) ),
+                    Activation::Linear => layer_results.push( linear(modified_dotprod(&node, &results[layer_index])) ),
+                    Activation::Tanh => layer_results.push( tanh(modified_dotprod(&node, &results[layer_index])) ),
+                }
             }
             results.push(layer_results);
         }
@@ -356,23 +434,22 @@ impl NN {
     }

     // updates all weights in the network
-    fn update_weights(&mut self, network_weight_updates: &Vec<Vec<Vec<f64>>>, prev_deltas: &mut Vec<Vec<Vec<f64>>>, rate: f64, momentum: f64) {
-        for layer_index in 0..self.layers.len() {
-            let mut layer = &mut self.layers[layer_index];
+    fn update_weights(&mut self, network_weight_updates: &Vec<Vec<Vec<f64>>>, prev_deltas: &mut Vec<Vec<Vec<f64>>>, rate: f64, lambda: f64, momentum: f64) {
+        for layer_index in 0..self.layers.len() {
+            let layer = &mut self.layers[layer_index];
             let layer_weight_updates = &network_weight_updates[layer_index];
             for node_index in 0..layer.len() {
-                let mut node = &mut layer[node_index];
+                let node = &mut layer[node_index];
                 let node_weight_updates = &layer_weight_updates[node_index];
                 for weight_index in 0..node.len() {
                     let weight_update = node_weight_updates[weight_index];
                     let prev_delta = prev_deltas[layer_index][node_index][weight_index];
                     let delta = (rate * weight_update) + (momentum * prev_delta);
-                    node[weight_index] += delta;
+                    node[weight_index] = (1.0 - rate * lambda) * node[weight_index] + delta;
                     prev_deltas[layer_index][node_index][weight_index] = delta;
                 }
             }
         }
-
     }

     // calculates all weight updates by backpropagation
@@ -382,32 +459,48 @@ impl NN {
         let layers = &self.layers;
         let network_results = &results[1..]; // skip the input layer
         let mut next_layer_nodes: Option<&Vec<Vec<f64>>> = None;
-
+
         for (layer_index, (layer_nodes, layer_results)) in iter_zip_enum(layers, network_results).rev() {
             let prev_layer_results = &results[layer_index];
             let mut layer_errors = Vec::new();
             let mut layer_weight_updates = Vec::new();
-
-
+
+
             for (node_index, (node, &result)) in iter_zip_enum(layer_nodes, layer_results) {
                 let mut node_weight_updates = Vec::new();
-                let mut node_error;
+                let node_error;
-
+
                 // calculate error for this node
                 if layer_index == layers.len() - 1 {
-                    node_error = result * (1f64 - result) * (targets[node_index] - result);
+                    let act_deriv = match self.out_act { //output activation
+                        Activation::Sigmoid => result * (1.0 - result),
+                        Activation::SELU => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B },
+                        Activation::PELU => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B },
+                        Activation::LRELU => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR },
+                        Activation::Linear => 1.0,
+                        Activation::Tanh => 1.0 - result * result,
+                    };
+                    node_error = act_deriv * (targets[node_index] - result);
                 } else {
                     let mut sum = 0f64;
                     let next_layer_errors = &network_errors[network_errors.len() - 1];
                     for (next_node, &next_node_error_data) in next_layer_nodes.unwrap().iter().zip((next_layer_errors).iter()) {
                         sum += next_node[node_index+1] * next_node_error_data; // +1 because the 0th weight is the threshold
                     }
-                    node_error = result * (1f64 - result) * sum;
+                    let act_deriv = match self.hid_act { //hidden activation
+                        Activation::Sigmoid => result * (1.0 - result),
+                        Activation::SELU => if result >= 0.0f64 { SELU_FACTOR_A } else { result + SELU_FACTOR_A * SELU_FACTOR_B },
+                        Activation::PELU => if result >= 0.0f64 { PELU_FACTOR_A / PELU_FACTOR_B } else { (result + PELU_FACTOR_A) / PELU_FACTOR_B },
+                        Activation::LRELU => if result >= 0.0f64 { 1.0 } else { LRELU_FACTOR },
+                        Activation::Linear => 1.0,
+                        Activation::Tanh => 1.0 - result * result,
+                    };
+                    node_error = act_deriv * sum;
                 }

                 // calculate weight updates for this node
                 for weight_index in 0..node.len() {
-                    let mut prev_layer_result;
+                    let prev_layer_result;
                     if weight_index == 0 {
                         prev_layer_result = 1f64; // threshold
                     } else {
@@ -463,6 +556,47 @@
 fn sigmoid(y: f64) -> f64 {
     1f64 / (1f64 + (-y).exp())
 }

+fn selu(y: f64) -> f64 { //SELU activation
+    SELU_FACTOR_A * if y < 0.0
+    {
+        SELU_FACTOR_B * y.exp() - SELU_FACTOR_B
+    }
+    else
+    {
+        y
+    }
+}
+
+fn pelu(y: f64) -> f64 { //PELU activation
+    if y < 0.0
+    {
+        PELU_FACTOR_A * (y / PELU_FACTOR_B).exp() - PELU_FACTOR_A
+    }
+    else
+    {
+        (PELU_FACTOR_A / PELU_FACTOR_B) * y
+    }
+}
+
+fn lrelu(y: f64) -> f64 { //LRELU activation
+    if y < 0.0
+    {
+        LRELU_FACTOR * y
+    }
+    else
+    {
+        y
+    }
+}
+
+fn linear(y: f64) -> f64 { //linear activation
+    y
+}
+
+fn tanh(y: f64) -> f64 { //tanh activation
+    y.tanh()
+}
+
 // takes two arrays and enumerates the iterator produced by zipping each of
 // their iterators together
diff --git a/tests/xor.rs b/tests/xor.rs
index 4ef51eb..e6fa5f1 100644
--- a/tests/xor.rs
+++ b/tests/xor.rs
@@ -1,7 +1,6 @@
 extern crate nn;
-extern crate time;

-use nn::{NN, HaltCondition, LearningMode};
+use nn::{NN, HaltCondition, LearningMode, Activation};

 #[test]
 fn xor_4layers() {
@@ -14,8 +13,8 @@ fn xor_4layers() {
     ];

     // create a new neural network
-    let mut net1 = NN::new(&[2,3,3,1]);
-
+    let mut net1 = NN::new(&[2,3,3,1], Activation::PELU, Activation::Sigmoid);
+
     // train the network
     net1.train(&examples)
         .log_interval(Some(1000))
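Taken together, the patch changes the public API in three places: `NN::new` now takes the hidden and output `Activation` as its second and third arguments, the `Trainer` builder gains a `lambda` setter for L2 regularization, and JSON serialization goes through `serde_json` instead of `rustc-serialize`. The sketch below exercises those pieces end to end against this fork; the layer sizes, training settings, and the `lambda` value are illustrative choices, not values taken from the patch.

```rust
extern crate nn;

use nn::{NN, HaltCondition, Activation};

fn main() {
    // XOR training pairs, as in the README example
    let examples = [
        (vec![0f64, 0f64], vec![0f64]),
        (vec![0f64, 1f64], vec![1f64]),
        (vec![1f64, 0f64], vec![1f64]),
        (vec![1f64, 1f64], vec![0f64]),
    ];

    // new signature: layer sizes plus the hidden and output activations
    let mut net = NN::new(&[2, 3, 1], Activation::PELU, Activation::Sigmoid);

    // the builder now accepts a lambda factor for L2 regularization;
    // 0.0 (the default) disables it
    net.train(&examples)
        .halt_condition(HaltCondition::MSE(0.01))
        .rate(0.3)
        .momentum(0.1)
        .lambda(0.0001) // illustrative value
        .go();

    // round-trip through serde_json instead of rustc-serialize
    let encoded = net.to_json();
    let restored = NN::from_json(&encoded);
    for &(ref inputs, _) in examples.iter() {
        println!("{:?} -> {:?}", inputs, restored.run(inputs));
    }
}
```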