Mercurial > hg > machine-learning-hw6

diff processEmail.m @ 1:e0f1290d2b43
Initial submission of work
author: Jordi Gutiérrez Hermoso <jordigh@octave.org>
date: Sun, 27 Nov 2011 23:18:00 -0500 (2011-11-28)
parents: f602dc601e9e
children: 7f92093ea77d
--- a/processEmail.m
+++ b/processEmail.m
@@ -1,125 +1,100 @@
 function word_indices = processEmail(email_contents)
-%PROCESSEMAIL preprocesses a the body of an email and
-%returns a list of word_indices 
-%   word_indices = PROCESSEMAIL(email_contents) preprocesses 
-%   the body of an email and returns a list of indices of the 
-%   words contained in the email. 
-%
+  ##PROCESSEMAIL preprocesses the body of an email and
+  ##returns a list of word_indices 
+  ##   word_indices = PROCESSEMAIL(email_contents) preprocesses 
+  ##   the body of an email and returns a list of indices of the 
+  ##   words contained in the email. 
+  ##
 
-% Load Vocabulary
-vocabList = getVocabList();
+  ## Load Vocabulary
+  vocabList = getVocabList();
 
-% Init return value
-word_indices = [];
+  ## Init return value
+  word_indices = [];
 
-% ========================== Preprocess Email ===========================
+  ## ========================== Preprocess Email ===========================
 
-% Find the Headers ( \n\n and remove )
-% Uncomment the following lines if you are working with raw emails with the
-% full headers
+  ## Find the end of the headers (the first \n\n) and drop everything before it
+  ## Uncomment the following lines if you are working with raw emails with the
+  ## full headers
 
-% hdrstart = strfind(email_contents, ([char(10) char(10)]));
-% email_contents = email_contents(hdrstart(1):end);
+  ## hdrstart = strfind(email_contents, ([char(10) char(10)]));
+  ## email_contents = email_contents(hdrstart(1):end);
 
-% Lower case
-email_contents = lower(email_contents);
+  ## Lower case
+  email_contents = lower(email_contents);
 
-% Strip all HTML
-% Looks for any expression that starts with < and ends with > and replace
-% and does not have any < or > in the tag it with a space
-email_contents = regexprep(email_contents, '<[^<>]+>', ' ');
+  ## Strip all HTML
+  ## Looks for any expression that starts with < and ends with >, and does
+  ## not have any < or > inside the tag, and replaces it with a space
+  email_contents = regexprep(email_contents, '<[^<>]+>', ' ');
 
-% Handle Numbers
-% Look for one or more characters between 0-9
-email_contents = regexprep(email_contents, '[0-9]+', 'number');
+  ## Handle Numbers
+  ## Look for one or more characters between 0-9
+  email_contents = regexprep(email_contents, '[0-9]+', 'number');
 
-% Handle URLS
-% Look for strings starting with http:// or https://
-email_contents = regexprep(email_contents, ...
-                           '(http|https)://[^\s]*', 'httpaddr');
+  ## Handle URLS
+  ## Look for strings starting with http:// or https://
+  email_contents = regexprep(email_contents, ...
+                             '(http|https)://[^\s]*', 'httpaddr');
 
-% Handle Email Addresses
-% Look for strings with @ in the middle
-email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');
+  ## Handle Email Addresses
+  ## Look for strings with @ in the middle
+  email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');
 
-% Handle $ sign
-email_contents = regexprep(email_contents, '[$]+', 'dollar');
+  ## Handle $ sign
+  email_contents = regexprep(email_contents, '[$]+', 'dollar');
 
 
-% ========================== Tokenize Email ===========================
+  ## ========================== Tokenize Email ===========================
 
-% Output the email to screen as well
-fprintf('\n==== Processed Email ====\n\n');
+  ## Output the email to screen as well
+  fprintf('\n==== Processed Email ====\n\n');
 
-% Process file
-l = 0;
+  ## Process the email; l tracks the length of the current output line
+  l = 0;
 
-while ~isempty(email_contents)
+  while ~isempty(email_contents)
 
-    % Tokenize and also get rid of any punctuation
-    [str, email_contents] = ...
-       strtok(email_contents, ...
-              [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);
-   
-    % Remove any non alphanumeric characters
+    ## Tokenize and also get rid of any punctuation
+    [str, email_contents] = \
+        strtok(email_contents, \
+               [" @$/#.-:&*+=[]?!(){},'\">_<;%" char(10) char(13)]);
+    
+    ## Remove any non alphanumeric characters
     str = regexprep(str, '[^a-zA-Z0-9]', '');
 
-    % Stem the word 
-    % (the porterStemmer sometimes has issues, so we use a try catch block)
+    ## Stem the word 
+    ## (the porterStemmer sometimes has issues, so we use a try catch block)
     try str = porterStemmer(strtrim(str)); 
     catch str = ''; continue;
-    end;
+    end_try_catch;
 
-    % Skip the word if it is too short
+    ## Skip the word if it is too short
     if length(str) < 1
-       continue;
-    end
+      continue;
+    endif
+    
+    ## Look up the stemmed word in the vocabulary; record its index if found
+    for i = 1:numel (vocabList)
+      if strcmp (vocabList{i}, str)
+        word_indices(end+1) = i;
+        break;
+      endif
+    endfor
 
-    % Look up the word in the dictionary and add to word_indices if
-    % found
-    % ====================== YOUR CODE HERE ======================
-    % Instructions: Fill in this function to add the index of str to
-    %               word_indices if it is in the vocabulary. At this point
-    %               of the code, you have a stemmed word from the email in
-    %               the variable str. You should look up str in the
-    %               vocabulary list (vocabList). If a match exists, you
-    %               should add the index of the word to the word_indices
-    %               vector. Concretely, if str = 'action', then you should
-    %               look up the vocabulary list to find where in vocabList
-    %               'action' appears. For example, if vocabList{18} =
-    %               'action', then, you should add 18 to the word_indices 
-    %               vector (e.g., word_indices = [word_indices ; 18]; ).
-    % 
-    % Note: vocabList{idx} returns a the word with index idx in the
-    %       vocabulary list.
-    % 
-    % Note: You can use strcmp(str1, str2) to compare two strings (str1 and
-    %       str2). It will return 1 only if the two strings are equivalent.
-    %
-
-
+    ## Print to screen, ensuring that the output lines are not too long
+    if (l + length(str) + 1) > 78
+      fprintf('\n');
+      l = 0;
+    endif
+    fprintf("%s ", str);
+    l = l + length(str) + 1;
+    
+  endwhile
 
-
-
-
-
-
-
-
-    % =============================================================
-
-
-    % Print to screen, ensuring that the output lines are not too long
-    if (l + length(str) + 1) > 78
-        fprintf('\n');
-        l = 0;
-    end
-    fprintf('%s ', str);
-    l = l + length(str) + 1;
-
-end
-
-% Print footer
-fprintf('\n\n=========================\n');
-
-end
+  ## Print footer
+  fprintf("\n\n=========================\n");
+  
+endfunction
+          
\ No newline at end of file
author	Jordi Gutiérrez Hermoso <jordigh@octave.org>
date	Sun, 27 Nov 2011 23:18:00 -0500 (2011-11-28)
parents	f602dc601e9e
children	7f92093ea77d