view nnCostFunction.m @ 1:42b6020b2fdb

Do regularised cost function
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Fri, 11 Nov 2011 14:13:51 -0500

function [J grad] = nnCostFunction(nn_params, 
                                   input_layer_size,
                                   hidden_layer_size,
                                   num_labels,
                                   X, y, lambda)
  ##NNCOSTFUNCTION Implements the neural network cost function for a two-layer
  ##neural network which performs classification.
  ##   [J grad] = NNCOSTFUNCTION(nn_params, input_layer_size, hidden_layer_size,
  ##   num_labels, X, y, lambda) computes the cost and gradient of the neural
  ##   network. The parameters for the neural network are "unrolled" into the
  ##   vector nn_params and need to be converted back into the weight matrices.
  ##
  ##   The returned parameter grad should be an "unrolled" vector of the
  ##   partial derivatives of the neural network.
  ##
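  ##   For illustration only (the layer sizes and lambda below are arbitrary),
  ##   a typical call looks like
  ##
  ##     nn_params = [Theta1(:) ; Theta2(:)];
  ##     [J grad] = nnCostFunction (nn_params, 400, 25, 10, X, y, 1);
  ##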

  ## Reshape nn_params back into the parameters Theta1 and Theta2, the
  ## weight matrices for our 2 layer neural network
  Theta1 = reshape (nn_params(1:hidden_layer_size * (input_layer_size + 1)), 
                    hidden_layer_size, (input_layer_size + 1));

  Theta2 = reshape (nn_params((1 + (hidden_layer_size
                                    * (input_layer_size + 1))):end),
                    num_labels, (hidden_layer_size + 1));
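
  ## At this point Theta1 is hidden_layer_size x (input_layer_size + 1) and
  ## Theta2 is num_labels x (hidden_layer_size + 1); the extra first column
  ## in each holds the bias weights.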

  ## Setup some useful variables
  m = rows (X);
  one_vec = ones (m, 1);

  ## The gradient is not computed yet; keep the accumulators at zero so the
  ## returned grad has the right shape.
  Theta1_grad = zeros (size (Theta1));
  Theta2_grad = zeros (size (Theta2));

  ## Forward propagation: append the bias column, map through the hidden
  ## layer, append the bias column again, and map through the output layer.
  ## Each row of ht is the hypothesis h_theta(x) for one training example.
  ht = sigmoid ([one_vec, sigmoid([one_vec, X]*Theta1')]*Theta2');

  ## This is a bit tricky. In order to avoid expanding the y entries
  ## into those useless 0-1 vectors (why represent the same data with
  ## more space?), instead we use bsxfun together with an indexing
  ## trick. Recall the long form of the cost function
  ##
  ##            /  -log( h_theta(x)) if y == 1
  ##    cost = {
  ##            \  -log(1 - h_theta(x)) if y != 1
  ##
  ## thus the logical masks formed with bsxfun pick out the entries of ht
  ## that fall under the first case (the entry for each example's true
  ## label) and the entries that fall under the second case (every other
  ## label). Then everything just gets added together.
  ##
  ## Note that although the bsxfun does generate the 0-1 logical matrix
  ## of the y's, it's useful that it's a logical matrix because
  ## internally the indexing with a logical matrix can be done faster.
  ## Also, logical indexing returns vectors, so the double summations
  ## get flattened into a single summation.
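  ##
  ## As a small illustration (values made up for the example): with
  ## num_labels = 3 and y = [2; 1], bsxfun (@eq, 1:3, y) is the logical
  ## matrix [0 1 0; 1 0 0], so ht(bsxfun (@eq, 1:3, y)) pulls out exactly
  ## the predicted probability of each example's true label.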
  J = -(sum (log (ht(bsxfun (@eq, 1:num_labels, y)))) \
        + sum (log (1 - ht(bsxfun (@ne, 1:num_labels, y)))))/m;

  ## The regularisation term has to exclude the first column of the Thetas,
  ## because we don't regularise the bias nodes.
  J += lambda*(sum (Theta1(:, 2:end)(:).^2) \
               + sum (Theta2(:, 2:end)(:).^2))/(2*m);
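
  ## Written out, the quantity computed above is
  ##
  ##   J = (1/m) sum_i sum_k [-y_ik*log(h_ik) - (1 - y_ik)*log(1 - h_ik)]
  ##       + (lambda/(2*m)) * (sum (Theta1(:,2:end)(:).^2) + sum (Theta2(:,2:end)(:).^2))
  ##
  ## where y_ik is 1 exactly when example i has label k and h_ik = ht(i, k).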

  grad = [Theta1_grad(:) ; Theta2_grad(:)];

endfunction
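
%!demo
%! ## Minimal usage sketch; the weights and data below are arbitrary, and
%! ## sigmoid.m from the same exercise set is assumed to be on the path.
%! Theta1 = 0.1 * ones (3, 3);   # hidden_layer_size = 3, input_layer_size = 2
%! Theta2 = 0.1 * ones (2, 4);   # num_labels = 2
%! X = [1, 2; 3, 4];
%! y = [1; 2];
%! [J grad] = nnCostFunction ([Theta1(:) ; Theta2(:)], 2, 3, 2, X, y, 1)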