changeset 6:e09973b9190f
Implement unreg backprop and get rid of pompous wrong comment
| author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
| --- | --- |
| date | Fri, 11 Nov 2011 17:51:14 -0500 |
| parents | 08072e7b3e9f |
| children | 7351a5c62d4d |
| files | nnCostFunction.m |
| diffstat | 1 files changed, 24 insertions(+), 23 deletions(-) |
```diff
--- a/nnCostFunction.m
+++ b/nnCostFunction.m
@@ -30,34 +30,35 @@
 Theta1_grad = zeros(size(Theta1));
 Theta2_grad = zeros(size(Theta2));
 
-ht = sigmoid ([one_vec, sigmoid([one_vec, X]*Theta1')]*Theta2');
-
-## This is a bit tricky. In order to avoid expanding the y entries
-## into those useless 0-1 vectors (why represent the same data with
-## more space?), instead we use bsxfun together with an indexing
-## trick. Recall the long form of the cost function
-##
-##          / -log( h_theta(x))      if y == 1
-##  cost = {
-##          \ -log(1 - h_theta(x))   if y != 1
-##
-## thus the indices formed with bsxfun pick out the entries of ht that
-## are the first form for this label or not the first form for this
-## label. Then everything just gets added together.
-##
-## Note that although the bsxfun does generate the 0-1 logical matrix
-## of the y's, it's useful that it's a logical matrix because
-## internally the indexing with a logical matrix can be done faster.
-## Also, logical indexing returns vectors, so the double summations
-## get flattened into a single summation.
-J = -(sum (log (ht(bsxfun (@eq, 1:num_labels, y)))) \
-     + sum (log (1 - ht(bsxfun (@ne, 1:num_labels, y)))))/m \
+a1 = X;
+z2 = [one_vec, a1]*Theta1';
+a2 = sigmoid (z2);
+z3 = [one_vec, a2]*Theta2';
+a3 = sigmoid (z3);
+ht = a3;
+
+## Logical matrix of zeros and ones representing the labels
+y_idx = bsxfun (@eq, 1:num_labels, y);
+
+## Using long form of cost function that broke it up into cases.
+J = -(sum (log (ht(y_idx))) + sum (log (1 - ht(! y_idx))))/m \
 
 ## The regularisation term has to exclude the first column of the Thetas,
 ## because we don't regularise the bias nodes.
-     + lambda*(sum (Theta1(:, 2:end)(:).^2) \
+    + lambda*(sum (Theta1(:, 2:end)(:).^2) \
               + sum (Theta2(:, 2:end)(:).^2))/(2*m);
 
+## Backprop
+delta3 = a3 - y_idx;
+delta2 = (delta3*Theta2)(:, 2:end) .* sigmoidGradient (z2);
+
+Theta2_grad = sum (bsxfun (@times, permute (delta3, [2, 3, 1]),
+                           permute ([one_vec, a2], [3, 2, 1])),
+                   3)/m;
+Theta1_grad = sum (bsxfun (@times, permute (delta2, [2, 3, 1]),
+                           permute ([one_vec, a1], [3, 2, 1])),
+                   3)/m;
+grad = [Theta1_grad(:) ; Theta2_grad(:)];
+
 endfunction
```
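A note on the gradient accumulation introduced above: the permute/bsxfun construction sums the per-example outer products delta3(i,:)' * [1, a2(i,:)] over the training set, which is the same quantity as the single matrix product delta3' * [one_vec, a2]. The following is a minimal sketch, not part of the changeset, that checks this equivalence numerically; the sizes m, hidden, and labels are arbitrary placeholders, not values from the assignment.

```octave
## Sketch only: verify that the permute/bsxfun accumulation used for
## Theta2_grad equals a plain matrix product.  Sizes are placeholders.
m = 5;                          # number of training examples (assumed)
hidden = 4;                     # hidden layer size (assumed)
labels = 3;                     # number of output labels (assumed)

delta3  = rand (m, labels);     # stand-in for the output-layer deltas
a2      = rand (m, hidden);     # stand-in for the hidden activations
one_vec = ones (m, 1);

## Per-example outer products delta3(i,:)' * [1, a2(i,:)], summed over i
grad_bsxfun = sum (bsxfun (@times, permute (delta3, [2, 3, 1]),
                           permute ([one_vec, a2], [3, 2, 1])),
                   3)/m;

## The same accumulation written as one matrix product
grad_matmul = delta3' * [one_vec, a2] / m;

disp (max (abs (grad_bsxfun(:) - grad_matmul(:))))   # ~ machine epsilon
```

The same identity applies to Theta1_grad with delta2 and [one_vec, a1].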