Mercurial > hg > toys
changeset 12:d5ba6cf70633
Merge in Haskell
author | Jordi Gutiérrez Hermoso <jordigh@octave.org> |
---|---|
date | Wed, 02 Oct 2013 08:43:44 -0400 |
parents | 7ce081b92ff9 (diff) 39725b492f1e (current diff) |
children | 484f9cac6a1f |
files | |
diffstat | 4 files changed, 201 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
new file mode 100644 --- /dev/null +++ b/breathalyzer/Makefile @@ -0,0 +1,3 @@ + +breathalyzer: main.c++ bk-tree.c++ levenshtein.c++ + g++ -O2 main.c++ -o breathalyzer \ No newline at end of file
new file mode 100644 --- /dev/null +++ b/breathalyzer/bk-tree.c++ @@ -0,0 +1,124 @@ +#include "levenshtein.c++" + +#include <fstream> +#include <sstream> + +// unordered_map seems to perform just as well, cheaper op[] but more +// expensive ctor, overall balances out, might as well use plain map. +#include <map> + +struct bk_tree +{ + std::string word; + //For caching levenshtein distances + std::map<std::string, size_t> lev_cache; + std::map<size_t, bk_tree*> children; +}; +typedef std::map<size_t, bk_tree*> branches; + +void insert_word(bk_tree& tree, std::string& word) +{ + if(tree.word == "") + { + tree.word = word; + return; + } + + size_t d = levenshtein(tree.word, word); + bk_tree* child = tree.children[d]; + if(not child) + { + bk_tree* subtree = new bk_tree; + tree.children[d] = subtree; + subtree->word = word; + } + else + insert_word(*child, word); +} + +void save_tree(const bk_tree& tree, std::ofstream& ofs) +{ + ofs << tree.word; + if(tree.children.size() > 0) + { + ofs << " ( "; + for(branches::const_iterator i = tree.children.begin(); + i != tree.children.end(); i++) + { + if( i-> second != 0) + { + ofs << i -> first << " "; + save_tree( *(i->second), ofs); + ofs << " "; + } + } + ofs << ") "; + } +} + +std::string load_tree(bk_tree & tree, std::ifstream& ifs) +{ + using namespace std; + + string word; + ifs >> word; + + tree.word = word; + + string token; + ifs >> token; + if(token == "(") + { + size_t weight; + string nexttoken; + while(true) + { + if(nexttoken == ")") + { + return ""; + } + + stringstream ss; + if(nexttoken != "") + { + ss << nexttoken; + ss >> weight; + } + else + { + ifs >> token; + if(token ==")") + return ""; + ss << token; + ss >> weight; + } + bk_tree* subtree = new bk_tree; + tree.children[weight] = subtree; + nexttoken = load_tree(*subtree, ifs); + } + } + return token; +} + +bool query_tree( bk_tree & tree, std::string& word, size_t m) +{ + size_t d; + if( tree.lev_cache.find(word) != tree.lev_cache.end()) + d = tree.lev_cache[word]; + else + tree.lev_cache[word] = d = levenshtein(tree.word,word); + + if( d <= m) + return true; + + bool found = false; + for(size_t i = d - m; i <= d+m; i++) + { + if( tree.children[i] and query_tree(*tree.children[i], word, m)) + { + found = true; + break; + } + } + return found; +}
new file mode 100644 --- /dev/null +++ b/breathalyzer/levenshtein.c++ @@ -0,0 +1,31 @@ +#include <string> +#include <vector> +#include <iostream> +#include <algorithm> + +size_t levenshtein(std::string& a, + std::string& b) +{ + using namespace std; + size_t asize=a.size(), bsize = b.size(); + + //Just need two rows at a time + vector<size_t> prevrow(bsize+1), thisrow(bsize+1); + + for(size_t i = 0; i <= bsize; i++) + prevrow[i] = i; + + for(size_t i = 1; i <= asize; i ++) + { + thisrow[0] = i; + for(size_t j = 1; j <= bsize; j++) + { + thisrow[j] = min(prevrow[j-1] + size_t(a[i-1] != b[j-1]), + 1 + min(prevrow[j],thisrow[j-1]) ); + } + swap(thisrow,prevrow); + } + + return prevrow[bsize]; +} +
new file mode 100644 --- /dev/null +++ b/breathalyzer/main.c++ @@ -0,0 +1,43 @@ +#include "bk-tree.c++" + +#include <fstream> +#include <sstream> +#include <algorithm> + +void string_to_upper(std::string& str) +{ + std::transform( str.begin(), str.end(), str.begin(), &toupper); +} + +int main(int argc, char** argv) +{ + using namespace std; + bk_tree tree; + { + ifstream ifs("tree"); + load_tree(tree,ifs); + } + + string filename = argv[1]; + ifstream ifs(filename.c_str()); + + size_t total_distance = 0; + string word; + while( ifs >> word) + { + string_to_upper(word); + size_t m = word.size(); + for(size_t i = 0; i < word.size(); i++) + { + if( query_tree(tree,word,i) ) + { + m = i; + break; + } + } + + total_distance += m; + + } + cout << total_distance << endl; +}