changeset 1:42b6020b2fdb

Do regularised cost function
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Fri, 11 Nov 2011 14:13:51 -0500
parents 395fc40248c3
children 55430128adcd
files nnCostFunction.m
diffstat 1 files changed, 57 insertions(+), 85 deletions(-) [+]
line wrap: on
line diff
--- a/nnCostFunction.m
+++ b/nnCostFunction.m
@@ -1,91 +1,63 @@
-function [J grad] = nnCostFunction(nn_params, ...
-                                   input_layer_size, ...
-                                   hidden_layer_size, ...
-                                   num_labels, ...
+function [J grad] = nnCostFunction(nn_params, 
+                                   input_layer_size,
+                                   hidden_layer_size,
+                                   num_labels,
                                    X, y, lambda)
-%NNCOSTFUNCTION Implements the neural network cost function for a two layer
-%neural network which performs classification
-%   [J grad] = NNCOSTFUNCTON(nn_params, hidden_layer_size, num_labels, ...
-%   X, y, lambda) computes the cost and gradient of the neural network. The
-%   parameters for the neural network are "unrolled" into the vector
-%   nn_params and need to be converted back into the weight matrices. 
-% 
-%   The returned parameter grad should be a "unrolled" vector of the
-%   partial derivatives of the neural network.
-%
+  ## NNCOSTFUNCTION Implements the neural network cost function for a
+  ## two-layer neural network which performs classification.
+  ##   [J grad] = NNCOSTFUNCTION(nn_params, input_layer_size, hidden_layer_size,
+  ##   num_labels, X, y, lambda) computes the cost and gradient of the neural
+  ##   network. The parameters for the neural network are "unrolled" into the
+  ##   vector nn_params and need to be converted back into the weight matrices.
+  ##
+  ##   The returned parameter grad should be an "unrolled" vector of the
+  ##   partial derivatives of the neural network.
+  ##
 
-% Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices
-% for our 2 layer neural network
-Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), ...
-                 hidden_layer_size, (input_layer_size + 1));
-
-Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), ...
-                 num_labels, (hidden_layer_size + 1));
+  ## Reshape nn_params back into the parameters Theta1 and Theta2, the
+  ## weight matrices for our 2 layer neural network
+  Theta1 = reshape (nn_params(1:hidden_layer_size * (input_layer_size + 1)), 
+                    hidden_layer_size, (input_layer_size + 1));
 
-% Setup some useful variables
-m = size(X, 1);
-         
-% You need to return the following variables correctly 
-J = 0;
-Theta1_grad = zeros(size(Theta1));
-Theta2_grad = zeros(size(Theta2));
+  Theta2 = reshape (nn_params((1 + (hidden_layer_size
+                                    * (input_layer_size + 1))):end),
+                    num_labels, (hidden_layer_size + 1));
+
+  ## Setup some useful variables
+  m = rows (X);
+  one_vec = ones (m, 1);
+
+  Theta1_grad = zeros(size(Theta1));
+  Theta2_grad = zeros(size(Theta2));
 
-% ====================== YOUR CODE HERE ======================
-% Instructions: You should complete the code by working through the
-%               following parts.
-%
-% Part 1: Feedforward the neural network and return the cost in the
-%         variable J. After implementing Part 1, you can verify that your
-%         cost function computation is correct by verifying the cost
-%         computed in ex4.m
-%
-% Part 2: Implement the backpropagation algorithm to compute the gradients
-%         Theta1_grad and Theta2_grad. You should return the partial derivatives of
-%         the cost function with respect to Theta1 and Theta2 in Theta1_grad and
-%         Theta2_grad, respectively. After implementing Part 2, you can check
-%         that your implementation is correct by running checkNNGradients
-%
-%         Note: The vector y passed into the function is a vector of labels
-%               containing values from 1..K. You need to map this vector into a 
-%               binary vector of 1's and 0's to be used with the neural network
-%               cost function.
-%
-%         Hint: We recommend implementing backpropagation using a for-loop
-%               over the training examples if you are implementing it for the 
-%               first time.
-%
-% Part 3: Implement regularization with the cost function and gradients.
-%
-%         Hint: You can implement this around the code for
-%               backpropagation. That is, you can compute the gradients for
-%               the regularization separately and then add them to Theta1_grad
-%               and Theta2_grad from Part 2.
-%
+  ht = sigmoid ([one_vec, sigmoid([one_vec, X]*Theta1')]*Theta2');
+  
+  ## This is a bit tricky. To avoid expanding the y entries into
+  ## those useless 0-1 vectors (why represent the same data with
+  ## more space?), we instead use bsxfun together with an indexing
+  ## trick. Recall the long form of the cost function:
+  ##
+  ##            /  -log(h_theta(x))      if y == 1
+  ##    cost = {
+  ##            \  -log(1 - h_theta(x))  if y != 1
+  ##
+  ## The logical masks formed with bsxfun thus pick out the entries of
+  ## ht that fall under the first case for their label and those that
+  ## fall under the second case. Then everything just gets added together.
+  ##
+  ## Note that although bsxfun does build the 0-1 matrix of the y's
+  ## after all, it builds it as a logical matrix, and indexing with a
+  ## logical matrix can be done faster internally. Also, logical
+  ## indexing returns vectors, so the double summation over examples
+  ## and labels gets flattened into a single summation.
+  ## The regularisation term has to exclude the first column of
+  ## Theta1 and Theta2, because we don't regularise the weights
+  ## attached to the bias nodes.
+  J = -(sum (log (ht(bsxfun (@eq, 1:num_labels, y))))          \
+        + sum (log (1 - ht(bsxfun (@ne, 1:num_labels, y)))))/m \
+      + lambda*(sum (Theta1(:, 2:end)(:).^2)                   \
+                + sum (Theta2(:, 2:end)(:).^2))/(2*m);
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+  grad = [Theta1_grad(:) ; Theta2_grad(:)];
 
-
-
-
-
-% -------------------------------------------------------------
-
-% =========================================================================
-
-% Unroll gradients
-grad = [Theta1_grad(:) ; Theta2_grad(:)];
-
-
-end
+endfunction
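
For reference, a minimal standalone sketch of how an unrolled nn_params vector round-trips through the reshape slicing used above. The layer sizes and weight values here are made up purely for illustration; they are not part of the changeset or the exercise data:

    ## Hypothetical layer sizes, for illustration only.
    input_layer_size  = 3;
    hidden_layer_size = 4;
    num_labels        = 2;

    ## Weight matrices of the shapes nnCostFunction expects:
    ## Theta1 is hidden_layer_size x (input_layer_size + 1),
    ## Theta2 is num_labels x (hidden_layer_size + 1).
    Theta1 = reshape (1:hidden_layer_size*(input_layer_size + 1),
                      hidden_layer_size, input_layer_size + 1);
    Theta2 = reshape (1:num_labels*(hidden_layer_size + 1),
                      num_labels, hidden_layer_size + 1);

    ## "Unroll" them into one parameter vector, then recover them with
    ## the same slicing nnCostFunction uses.
    nn_params = [Theta1(:); Theta2(:)];
    T1 = reshape (nn_params(1:hidden_layer_size*(input_layer_size + 1)),
                  hidden_layer_size, input_layer_size + 1);
    T2 = reshape (nn_params((1 + hidden_layer_size*(input_layer_size + 1)):end),
                  num_labels, hidden_layer_size + 1);
    assert (isequal (T1, Theta1) && isequal (T2, Theta2))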
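
The ht line computes the whole feedforward pass in one expression: a bias column of ones is prepended before each layer, so an m x n input produces an m x num_labels output. A small sketch under the assumption that the exercise's sigmoid.m is on the path; the sizes and random weights are hypothetical:

    ## Hypothetical data and weights, sized only so the shapes line up:
    ## X is m x n, Theta1 is h x (n+1), Theta2 is K x (h+1).
    m = 5; n = 3; h = 4; K = 2;
    X      = rand (m, n);
    Theta1 = rand (h, n + 1);
    Theta2 = rand (K, h + 1);
    one_vec = ones (m, 1);

    ## Prepend the bias column before each layer, as in the patch.
    a2 = sigmoid ([one_vec, X]*Theta1');     # m x h hidden activations
    ht = sigmoid ([one_vec, a2]*Theta2');    # m x K network outputs
    assert (size (ht), [m, K])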
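
The bsxfun/logical-indexing trick in the cost computation can be checked in isolation. Below is a sketch with made-up outputs ht and labels y (not the exercise's data) that compares the trick against the explicit 0-1 expansion it avoids; the regularisation term is omitted since it only adds lambda/(2*m) times the squared non-bias weights:

    num_labels = 3;
    y  = [2; 1; 3; 2];            # hypothetical labels for m = 4 examples
    ht = [0.1 0.7 0.2;            # made-up network outputs, one row per example
          0.8 0.1 0.1;
          0.2 0.2 0.6;
          0.3 0.5 0.2];
    m  = rows (ht);

    ## bsxfun (@eq, 1:num_labels, y) is an m x num_labels logical matrix
    ## with a single true entry per row (the "y == k" case); @ne gives the
    ## complement. Logical indexing flattens the selected entries into
    ## vectors, so one sum replaces the double summation.
    eq_mask = bsxfun (@eq, 1:num_labels, y);
    J_trick = -(sum (log (ht(eq_mask))) + sum (log (1 - ht(!eq_mask))))/m;

    ## The same value computed the "expanded" way, with explicit 0-1 rows.
    Y = eye (num_labels)(y, :);
    J_check = -sum (sum (Y.*log (ht) + (1 - Y).*log (1 - ht)))/m;
    assert (abs (J_trick - J_check) < 1e-12)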