changeset 12:d5ba6cf70633

Merge in Haskell
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Wed, 02 Oct 2013 08:43:44 -0400
parents 7ce081b92ff9 (diff) 39725b492f1e (current diff)
children 484f9cac6a1f
files
diffstat 4 files changed, 201 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
new file mode 100644
--- /dev/null
+++ b/breathalyzer/Makefile
@@ -0,0 +1,3 @@
+
+breathalyzer: main.c++ bk-tree.c++ levenshtein.c++
+	g++ -O2 main.c++ -o breathalyzer
\ No newline at end of file
new file mode 100644
--- /dev/null
+++ b/breathalyzer/bk-tree.c++
@@ -0,0 +1,124 @@
+#include "levenshtein.c++"
+
+#include <fstream>
+#include <sstream>
+
+// unordered_map seems to perform just  as well, cheaper op[] but more
+// expensive ctor, overall balances out, might as well use plain map.
+#include <map>
+
+struct bk_tree
+{
+  std::string word;
+  //For caching levenshtein distances
+  std::map<std::string, size_t> lev_cache;
+  std::map<size_t, bk_tree*> children;
+};
+typedef std::map<size_t, bk_tree*> branches;
+
+void insert_word(bk_tree& tree, std::string& word)
+{
+  if(tree.word == "")
+  {
+    tree.word = word;
+    return;
+  }
+
+  size_t d = levenshtein(tree.word, word);
+  bk_tree* child = tree.children[d];
+  if(not child)
+  {
+    bk_tree* subtree = new bk_tree;
+    tree.children[d] = subtree;
+    subtree->word = word;
+  }
+  else
+    insert_word(*child, word);
+}
+
+void save_tree(const bk_tree& tree, std::ofstream& ofs)
+{
+  ofs << tree.word;
+  if(tree.children.size() > 0)
+  {
+    ofs << " ( ";
+    for(branches::const_iterator i = tree.children.begin();
+        i != tree.children.end(); i++)
+    {
+      if( i-> second != 0)
+      {
+        ofs << i -> first << " ";
+        save_tree( *(i->second), ofs);
+        ofs << " ";
+      }
+    }
+    ofs << ") ";
+  }
+}
+
+std::string load_tree(bk_tree & tree, std::ifstream& ifs)
+{
+  using namespace std;
+
+  string word;
+  ifs >> word;
+
+  tree.word = word;
+
+  string token;
+  ifs >> token;
+  if(token == "(")
+  {
+    size_t weight;
+    string nexttoken;
+    while(true)
+    {
+      if(nexttoken == ")")
+      {
+        return "";
+      }
+
+      stringstream ss;
+      if(nexttoken != "")
+      {
+        ss << nexttoken;
+        ss >> weight;
+      }
+      else
+      {
+        ifs >> token;
+        if(token ==")")
+          return "";
+        ss << token;
+        ss >> weight;
+      }
+      bk_tree* subtree = new bk_tree;
+      tree.children[weight] = subtree;
+      nexttoken = load_tree(*subtree, ifs);
+    }
+  }
+  return token;
+}
+
+bool query_tree( bk_tree & tree, std::string& word, size_t m)
+{
+  size_t d;
+  if( tree.lev_cache.find(word) != tree.lev_cache.end())
+    d = tree.lev_cache[word];
+  else
+    tree.lev_cache[word] = d = levenshtein(tree.word,word);
+
+  if( d <= m)
+    return true;
+
+  bool found = false;
+  for(size_t i = d - m; i <= d+m; i++)
+  {
+    if( tree.children[i] and query_tree(*tree.children[i], word, m))
+    {
+      found = true;
+      break;
+    }
+  }
+  return found;
+}
new file mode 100644
--- /dev/null
+++ b/breathalyzer/levenshtein.c++
@@ -0,0 +1,31 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+size_t levenshtein(std::string& a,
+                   std::string& b)
+{
+  using namespace std;
+  size_t asize=a.size(), bsize = b.size();
+
+  //Just need two rows at a time
+  vector<size_t> prevrow(bsize+1), thisrow(bsize+1);
+
+  for(size_t i = 0; i <= bsize; i++)
+    prevrow[i] = i;
+
+  for(size_t i = 1; i <= asize; i ++)
+  {
+    thisrow[0] = i;
+    for(size_t j = 1; j <= bsize; j++)
+    {
+      thisrow[j] = min(prevrow[j-1] + size_t(a[i-1] != b[j-1]),
+                       1 + min(prevrow[j],thisrow[j-1]) );
+    }
+    swap(thisrow,prevrow);
+  }
+
+  return prevrow[bsize];
+}
+
new file mode 100644
--- /dev/null
+++ b/breathalyzer/main.c++
@@ -0,0 +1,43 @@
+#include "bk-tree.c++"
+
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+
+void string_to_upper(std::string& str)
+{
+  std::transform( str.begin(), str.end(), str.begin(),  &toupper);
+}
+
+int main(int argc, char** argv)
+{
+  using namespace std;
+  bk_tree tree;
+  {
+    ifstream ifs("tree");
+    load_tree(tree,ifs);
+  }
+
+  string filename = argv[1];
+  ifstream ifs(filename.c_str());
+
+  size_t total_distance = 0;
+  string word;
+  while( ifs >> word)
+  {
+    string_to_upper(word);
+    size_t m = word.size();
+    for(size_t i = 0; i < word.size(); i++)
+    {
+      if( query_tree(tree,word,i) )
+      {
+        m = i;
+        break;
+      }
+    }
+
+    total_distance += m;
+
+  }
+  cout << total_distance << endl;
+}