Mercurial > hg > machine-learning-hw6
diff processEmail.m @ 1:e0f1290d2b43
Initial submission of work
author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
---|---|
date | Sun, 27 Nov 2011 23:18:00 -0500 (2011-11-28) |
parents | f602dc601e9e |
children | 7f92093ea77d |
line wrap: on
line diff
--- a/processEmail.m +++ b/processEmail.m @@ -1,125 +1,100 @@ function word_indices = processEmail(email_contents) -%PROCESSEMAIL preprocesses a the body of an email and -%returns a list of word_indices -% word_indices = PROCESSEMAIL(email_contents) preprocesses -% the body of an email and returns a list of indices of the -% words contained in the email. -% + ##PROCESSEMAIL preprocesses a the body of an email and + ##returns a list of word_indices + ## word_indices = PROCESSEMAIL(email_contents) preprocesses + ## the body of an email and returns a list of indices of the + ## words contained in the email. + ## -% Load Vocabulary -vocabList = getVocabList(); + ## Load Vocabulary + vocabList = getVocabList(); -% Init return value -word_indices = []; + ## Init return value + word_indices = []; -% ========================== Preprocess Email =========================== + ## ========================== Preprocess Email =========================== -% Find the Headers ( \n\n and remove ) -% Uncomment the following lines if you are working with raw emails with the -% full headers + ## Find the Headers ( \n\n and remove ) + ## Uncomment the following lines if you are working with raw emails with the + ## full headers -% hdrstart = strfind(email_contents, ([char(10) char(10)])); -% email_contents = email_contents(hdrstart(1):end); + ## hdrstart = strfind(email_contents, ([char(10) char(10)])); + ## email_contents = email_contents(hdrstart(1):end); -% Lower case -email_contents = lower(email_contents); + ## Lower case + email_contents = lower(email_contents); -% Strip all HTML -% Looks for any expression that starts with < and ends with > and replace -% and does not have any < or > in the tag it with a space -email_contents = regexprep(email_contents, '<[^<>]+>', ' '); + ## Strip all HTML + ## Looks for any expression that starts with < and ends with > and replace + ## and does not have any < or > in the tag it with a space + email_contents = regexprep(email_contents, 
'<[^<>]+>', ' '); -% Handle Numbers -% Look for one or more characters between 0-9 -email_contents = regexprep(email_contents, '[0-9]+', 'number'); + ## Handle Numbers + ## Look for one or more characters between 0-9 + email_contents = regexprep(email_contents, '[0-9]+', 'number'); -% Handle URLS -% Look for strings starting with http:// or https:// -email_contents = regexprep(email_contents, ... - '(http|https)://[^\s]*', 'httpaddr'); + ## Handle URLS + ## Look for strings starting with http:// or https:// + email_contents = regexprep(email_contents, ... + '(http|https)://[^\s]*', 'httpaddr'); -% Handle Email Addresses -% Look for strings with @ in the middle -email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr'); + ## Handle Email Addresses + ## Look for strings with @ in the middle + email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr'); -% Handle $ sign -email_contents = regexprep(email_contents, '[$]+', 'dollar'); + ## Handle $ sign + email_contents = regexprep(email_contents, '[$]+', 'dollar'); -% ========================== Tokenize Email =========================== + ## ========================== Tokenize Email =========================== -% Output the email to screen as well -fprintf('\n==== Processed Email ====\n\n'); + ## Output the email to screen as well + fprintf('\n==== Processed Email ====\n\n'); -% Process file -l = 0; + ## Process file + l = 0; -while ~isempty(email_contents) + while ~isempty(email_contents) - % Tokenize and also get rid of any punctuation - [str, email_contents] = ... - strtok(email_contents, ... 
- [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]); - - % Remove any non alphanumeric characters + ## Tokenize and also get rid of any punctuation + [str, email_contents] = \ + strtok(email_contents, \ + [" @$/#.-:&*+=[]?!(){},'\">_<;%" char(10) char(13)]); + + ## Remove any non alphanumeric characters str = regexprep(str, '[^a-zA-Z0-9]', ''); - % Stem the word - % (the porterStemmer sometimes has issues, so we use a try catch block) + ## Stem the word + ## (the porterStemmer sometimes has issues, so we use a try catch block) try str = porterStemmer(strtrim(str)); catch str = ''; continue; - end; + end_try_catch; - % Skip the word if it is too short + ## Skip the word if it is too short if length(str) < 1 - continue; - end + continue; + endif + + ## Convert the vocabulary list + for i = 1:numel (vocabList) + if strcmp (vocabList{i}, str) + word_indices(end+1) = i; + break; + endif + endfor - % Look up the word in the dictionary and add to word_indices if - % found - % ====================== YOUR CODE HERE ====================== - % Instructions: Fill in this function to add the index of str to - % word_indices if it is in the vocabulary. At this point - % of the code, you have a stemmed word from the email in - % the variable str. You should look up str in the - % vocabulary list (vocabList). If a match exists, you - % should add the index of the word to the word_indices - % vector. Concretely, if str = 'action', then you should - % look up the vocabulary list to find where in vocabList - % 'action' appears. For example, if vocabList{18} = - % 'action', then, you should add 18 to the word_indices - % vector (e.g., word_indices = [word_indices ; 18]; ). - % - % Note: vocabList{idx} returns a the word with index idx in the - % vocabulary list. - % - % Note: You can use strcmp(str1, str2) to compare two strings (str1 and - % str2). It will return 1 only if the two strings are equivalent. 
- % - - + ## Print to screen, ensuring that the output lines are not too long + if (l + length(str) + 1) > 78 + fprintf('\n'); + l = 0; + endif + fprintf("%s ", str); + l = l + length(str) + 1; + + endwhile - - - - - - - - % ============================================================= - - - % Print to screen, ensuring that the output lines are not too long - if (l + length(str) + 1) > 78 - fprintf('\n'); - l = 0; - end - fprintf('%s ', str); - l = l + length(str) + 1; - -end - -% Print footer -fprintf('\n\n=========================\n'); - -end + ## Print footer + fprintf("\n\n=========================\n"); + +endfunction + \ No newline at end of file