Mercurial > hg > toys
changeset 6:cad6633bc1fc
Preliminary final form
author | Jordi Gutiérrez Hermoso <jordigh@gmail.com> |
---|---|
date | Sat, 01 Jan 2011 03:11:04 -0600 |
parents | 49a1f3e7eff2 |
children | feead6f50850 |
files | bk-tree.c++ levenshtein.c++ main.c++ |
diffstat | 3 files changed, 134 insertions(+), 36 deletions(-) [+] |
line wrap: on
line diff
new file mode 100644 --- /dev/null +++ b/bk-tree.c++ @@ -0,0 +1,117 @@ +#include "levenshtein.c++" + +#include <fstream> +#include <sstream> + +// unordered_map seems to perform just as well, cheaper op[] but more +// expensive ctor, overall balances out, might as well use plain map. +#include <map> + +struct bk_tree +{ + std::string word; + std::map<size_t, bk_tree*> children; +}; +typedef std::map<size_t, bk_tree*> branches; + +void insert_word(bk_tree& tree, std::string& word) +{ + if(tree.word == "") + { + tree.word = word; + return; + } + + size_t d = levenshtein(tree.word, word); + bk_tree* child = tree.children[d]; + if(not child) + { + bk_tree* subtree = new bk_tree; + tree.children[d] = subtree; + subtree->word = word; + } + else + insert_word(*child, word); +} + +void save_tree(const bk_tree& tree, std::ofstream& ofs) +{ + ofs << tree.word; + if(tree.children.size() > 0) + { + ofs << " ( "; + for(branches::const_iterator i = tree.children.begin(); + i != tree.children.end(); i++) + { + if( i-> second != 0) + { + ofs << i -> first << " "; + save_tree( *(i->second), ofs); + ofs << " "; + } + } + ofs << ") "; + } +} + +std::string load_tree(bk_tree & tree, std::ifstream& ifs) +{ + using namespace std; + + string word; + ifs >> word; + + tree.word = word; + + string token; + ifs >> token; + if(token == "(") + { + size_t weight; + string nexttoken; + while(true) + { + if(nexttoken == ")") + { + return ""; + } + + stringstream ss; + if(nexttoken != "") + { + ss << nexttoken; + ss >> weight; + } + else + { + ifs >> token; + if(token ==")") + return ""; + ss << token; + ss >> weight; + } + bk_tree* subtree = new bk_tree; + tree.children[weight] = subtree; + nexttoken = load_tree(*subtree, ifs); + } + } + return token; +} + +bool query_tree( bk_tree & tree, std::string& word, size_t m) +{ + size_t d; + if( (d = levenshtein(tree.word,word)) <= m) + return true; + + bool found = false; + for(size_t i = d - m; i <= d+m; i++) + { + if( tree.children[i] and query_tree(*tree.children[i], word, m)) + { + found = true; + break; + } + } + return found; +}
--- a/levenshtein.c++ +++ b/levenshtein.c++ @@ -4,27 +4,10 @@ #include <algorithm> size_t levenshtein(std::string& a, - std::string& b, - size_t m = 100) //the cutoff maximum + std::string& b) { using namespace std; size_t asize=a.size(), bsize = b.size(); - bool swapped = false; - - if( asize > bsize) - { - swapped = true; - swap(a,b); - swap(asize,bsize); - } - - //A bit of a dumb heuristic, but seems to help a little - if( long(bsize) - long(asize) > m) - { - if(swapped) - swap(a,b); //unswap - return m; - } //Just need two rows at a time vector<size_t> prevrow(bsize+1), thisrow(bsize+1); @@ -43,9 +26,6 @@ swap(thisrow,prevrow); } - if(swapped) - swap(a,b); //unswap + return prevrow[bsize]; +} - return prevrow[bsize]; - -}
--- a/main.c++ +++ b/main.c++ @@ -1,4 +1,4 @@ -#include "levenshtein.c++" +#include "bk-tree.c++" #include <fstream> #include <sstream> @@ -12,28 +12,29 @@ int main() { using namespace std; - ifstream ifs("twl06.txt"); - string word; - vector<string> wl; - while(ifs >> word) + bk_tree tree; { - wl.push_back(word); + ifstream ifs("tree"); + load_tree(tree,ifs); } - string phrase = "tihs sententcnes iss nout varrry goud"; - stringstream ss; ss << phrase; size_t total_distance = 0; - while( ss >> word) + string word; + while( cin >> word) { string_to_upper(word); - size_t m = 50; - for(size_t i = 0; i < wl.size(); i++) + size_t m = word.size(); + for(size_t i = 0; i < word.size(); i++) { - m = min(levenshtein(wl[i],word,m), m); - if(m == 0) + if( query_tree(tree,word,i) ) + { + m = i; break; + } } + total_distance += m; + } cout << total_distance << endl; }