changeset 6:cad6633bc1fc

Preliminary final form
author Jordi Gutiérrez Hermoso <jordigh@gmail.com>
date Sat, 01 Jan 2011 03:11:04 -0600
parents 49a1f3e7eff2
children feead6f50850
files bk-tree.c++ levenshtein.c++ main.c++
diffstat 3 files changed, 134 insertions(+), 36 deletions(-) [+]
line wrap: on
line diff
new file mode 100644
--- /dev/null
+++ b/bk-tree.c++
@@ -0,0 +1,117 @@
+#include "levenshtein.c++"
+
+#include <fstream>
+#include <sstream>
+
+// unordered_map seems to perform just  as well, cheaper op[] but more
+// expensive ctor, overall balances out, might as well use plain map.
+#include <map>
+
+struct bk_tree
+{
+  std::string word;
+  std::map<size_t, bk_tree*> children;
+};
+typedef std::map<size_t, bk_tree*> branches;
+
+void insert_word(bk_tree& tree, std::string& word)
+{
+  if(tree.word == "")
+  {
+    tree.word = word;
+    return;
+  }
+
+  size_t d = levenshtein(tree.word, word);
+  bk_tree* child = tree.children[d];
+  if(not child)
+  {
+    bk_tree* subtree = new bk_tree;
+    tree.children[d] = subtree;
+    subtree->word = word;
+  }
+  else
+    insert_word(*child, word);
+}
+
+void save_tree(const bk_tree& tree, std::ofstream& ofs)
+{
+  ofs << tree.word;
+  if(tree.children.size() > 0)
+  {
+    ofs << " ( ";
+    for(branches::const_iterator i = tree.children.begin();
+        i != tree.children.end(); i++)
+    {
+      if( i-> second != 0)
+      {
+        ofs << i -> first << " ";
+        save_tree( *(i->second), ofs);
+        ofs << " ";
+      }
+    }
+    ofs << ") ";
+  }
+}
+
+std::string load_tree(bk_tree & tree, std::ifstream& ifs)
+{
+  using namespace std;
+
+  string word;
+  ifs >> word;
+
+  tree.word = word;
+
+  string token;
+  ifs >> token;
+  if(token == "(")
+  {
+    size_t weight;
+    string nexttoken;
+    while(true)
+    {
+      if(nexttoken == ")")
+      {
+        return "";
+      }
+
+      stringstream ss;
+      if(nexttoken != "")
+      {
+        ss << nexttoken;
+        ss >> weight;
+      }
+      else
+      {
+        ifs >> token;
+        if(token ==")")
+          return "";
+        ss << token;
+        ss >> weight;
+      }
+      bk_tree* subtree = new bk_tree;
+      tree.children[weight] = subtree;
+      nexttoken = load_tree(*subtree, ifs);
+    }
+  }
+  return token;
+}
+
+bool query_tree( bk_tree & tree, std::string& word, size_t m)
+{
+  size_t d;
+  if( (d = levenshtein(tree.word,word)) <= m)
+    return true;
+
+  bool found = false;
+  for(size_t i = d - m; i <= d+m; i++)
+  {
+    if( tree.children[i] and query_tree(*tree.children[i], word, m))
+    {
+      found = true;
+      break;
+    }
+  }
+  return found;
+}
--- a/levenshtein.c++
+++ b/levenshtein.c++
@@ -4,27 +4,10 @@
 #include <algorithm>
 
 size_t levenshtein(std::string& a,
-                   std::string& b,
-                   size_t m = 100) //the cutoff maximum
+                   std::string& b)
 {
   using namespace std;
   size_t asize=a.size(), bsize = b.size();
-  bool swapped = false;
-
-  if( asize > bsize)
-  {
-    swapped = true;
-    swap(a,b);
-    swap(asize,bsize);
-  }
-
-  //A bit of a dumb heuristic, but seems to help a little
-  if( long(bsize)  - long(asize) > m)
-  {
-    if(swapped)
-      swap(a,b); //unswap
-    return m;
-  }
 
   //Just need two rows at a time
   vector<size_t> prevrow(bsize+1), thisrow(bsize+1);
@@ -43,9 +26,6 @@
     swap(thisrow,prevrow);
   }
 
-  if(swapped)
-    swap(a,b); //unswap
+  return prevrow[bsize];
+}
 
-  return prevrow[bsize];
-
-}
--- a/main.c++
+++ b/main.c++
@@ -1,4 +1,4 @@
-#include "levenshtein.c++"
+#include "bk-tree.c++"
 
 #include <fstream>
 #include <sstream>
@@ -12,28 +12,29 @@
 int main()
 {
   using namespace std;
-  ifstream ifs("twl06.txt");
-  string word;
-  vector<string> wl;
-  while(ifs >> word)
+  bk_tree tree;
   {
-    wl.push_back(word);
+    ifstream ifs("tree");
+    load_tree(tree,ifs);
   }
 
-  string phrase = "tihs sententcnes iss nout varrry goud";
-  stringstream ss; ss << phrase;
   size_t total_distance = 0;
-  while( ss >> word)
+  string word;
+  while( cin >> word)
   {
     string_to_upper(word);
-    size_t m = 50;
-    for(size_t i = 0; i < wl.size(); i++)
+    size_t m = word.size();
+    for(size_t i = 0; i < word.size(); i++)
     {
-      m = min(levenshtein(wl[i],word,m), m);
-      if(m == 0)
+      if( query_tree(tree,word,i) )
+      {
+        m = i;
         break;
+      }
     }
+
     total_distance += m;
+
   }
   cout << total_distance << endl;
 }