view bk-tree.c++ @ 6:cad6633bc1fc

Preliminary final form
author Jordi Gutiérrez Hermoso <jordigh@gmail.com>
date Sat, 01 Jan 2011 03:11:04 -0600
parents
children b2c93de84b36
line wrap: on
line source

#include "levenshtein.c++"

#include <fstream>
#include <sstream>

// unordered_map seems to perform just  as well, cheaper op[] but more
// expensive ctor, overall balances out, might as well use plain map.
#include <map>

struct bk_tree
{
  std::string word;
  std::map<size_t, bk_tree*> children;
};
typedef std::map<size_t, bk_tree*> branches;

void insert_word(bk_tree& tree, std::string& word)
{
  if(tree.word == "")
  {
    tree.word = word;
    return;
  }

  size_t d = levenshtein(tree.word, word);
  bk_tree* child = tree.children[d];
  if(not child)
  {
    bk_tree* subtree = new bk_tree;
    tree.children[d] = subtree;
    subtree->word = word;
  }
  else
    insert_word(*child, word);
}

void save_tree(const bk_tree& tree, std::ofstream& ofs)
{
  ofs << tree.word;
  if(tree.children.size() > 0)
  {
    ofs << " ( ";
    for(branches::const_iterator i = tree.children.begin();
        i != tree.children.end(); i++)
    {
      if( i-> second != 0)
      {
        ofs << i -> first << " ";
        save_tree( *(i->second), ofs);
        ofs << " ";
      }
    }
    ofs << ") ";
  }
}

std::string load_tree(bk_tree & tree, std::ifstream& ifs)
{
  using namespace std;

  string word;
  ifs >> word;

  tree.word = word;

  string token;
  ifs >> token;
  if(token == "(")
  {
    size_t weight;
    string nexttoken;
    while(true)
    {
      if(nexttoken == ")")
      {
        return "";
      }

      stringstream ss;
      if(nexttoken != "")
      {
        ss << nexttoken;
        ss >> weight;
      }
      else
      {
        ifs >> token;
        if(token ==")")
          return "";
        ss << token;
        ss >> weight;
      }
      bk_tree* subtree = new bk_tree;
      tree.children[weight] = subtree;
      nexttoken = load_tree(*subtree, ifs);
    }
  }
  return token;
}

bool query_tree( bk_tree & tree, std::string& word, size_t m)
{
  size_t d;
  if( (d = levenshtein(tree.word,word)) <= m)
    return true;

  bool found = false;
  for(size_t i = d - m; i <= d+m; i++)
  {
    if( tree.children[i] and query_tree(*tree.children[i], word, m))
    {
      found = true;
      break;
    }
  }
  return found;
}