from __future__ import generators class Node: """Represents a bunch of possible next-words for one word-as-a-node Doesn't include the word itself; rather, it's associated with the word in a key:value relationship elsewhere These only live as leafs at the bottom of the tree """ def __init__(self): self.n = 0 self.dict = {} #word : freq self.list = None self.metadata_dt = {} #word : metadatas def add_child(self, child, metadata): self.n += 1 if self.dict.has_key(child): self.dict[child] += 1 self.metadata_dt[child].append(metadata) else: self.dict[child] = 1 self.metadata_dt[child] = [metadata] def finalize(self): self.list = [] counter = 0 for word, freq in self.dict.iteritems(): counter += freq self.list.append((counter, word)) def get_word(self, rand): n = rand * self.n if not self.list: self.finalize() item_bottom = 0 item_top = len(self.list) - 1 while item_top > item_bottom + 1: item_middle = (item_top + item_bottom) / 2 if n < self.list[item_middle][0]: item_top = item_middle elif n > self.list[item_middle][0]: item_bottom = item_middle else: return self.list[item_middle][1] return self.list[item_top][1] SENTENCE_DELIM= '#' def build_dict(n, token_source, n_tuple_dict = None): """Will build upon the n_tuple_dict tree, and return a new, updated tree""" if not n_tuple_dict: n_tuple_dict = {} starting_ngram = [SENTENCE_DELIM for i in xrange(n)] count=0 cur_ngram = starting_ngram for tok, metadata in token_source: count += 1 cur_ngram = cur_ngram[1:] + [tok] curnode = n_tuple_dict for i in xrange(n - 1): #Populate the n-1-length path with {}'s and a Node if not curnode.has_key(cur_ngram[i]): if i == n - 2: curnode[cur_ngram[i]] = Node() else: curnode[cur_ngram[i]] = {} curnode = curnode[cur_ngram[i]] #curnode is now actually a node, not a dict curnode.add_child(cur_ngram[n - 1], metadata) if tok == SENTENCE_DELIM: cur_ngram = starting_ngram[:] return n_tuple_dict, count def get_choosing_node(dict, sequence): """Traverses a sequence of words, length=n, down the tree. It returns the bottom node, which is a Node node, that contains a probability table.""" working = dict for i in xrange(len(sequence)): if working.has_key(sequence[i]): working = working[sequence[i]] else: raise Exception, "word " + sequence[i] + " in sequence can't be found" return working import random import time random.seed(time.clock()) MAXWORDS=2056 def random_sentence(dict, n): """yields word, positionlist. Uses the dict as the big tree, and n as the length of the n-gram""" startwith = [SENTENCE_DELIM for i in xrange(n-1)] newword = 'guten Tag' wordcount = 0 while newword != SENTENCE_DELIM: endnode = get_choosing_node(dict, startwith) newword = endnode.get_word(random.random()) if newword is SENTENCE_DELIM: yield ".", [('A00 0000', 0, 0)] else: poslist = [metadata.pos for metadata in endnode.metadata_dt[newword]] yield newword, poslist startwith = startwith[1:] + [newword] wordcount += 1 if wordcount > MAXWORDS: yield ".", [('A00 0000', 0, 0)] break