# FILE NAME: text_algorithms.py # DESCRIPTION: Implementation of the algorithms from the book: # "Text Algorithms", by Maxime Crochemore and Wojciech Rytter, # Oxford University Press, New York, 1994. ISBN 0-19-508609-0. # AUTHOR: Tzemos El. Diploma Thesis. ECE Dept. University of Patras. # LANGUAGE: Python. Version 3.8.2 # FILE VERSION: 1.00 (18/7/2020) # Included only Ukkonnen's algorithm in this file, from 25 algorithms. ############################### Algorithm: Ukkonen ################################### """ Linear time implementation of Ukkonnen's algorithm Rule 1: After an insertion from root: active_node remains root active_edge is set to the first character of the new suffix we need to insert active_length is reduced by 1 Rule 2: If we create a new internal node OR make an inserter from an internal node, and this is not the first SUCH internal node at current step, then we link the previous SUCH node with THIS one through a suffix link. Rule 3: After splitting an edge from an active_node that is not the root node, we follow the suffix link going out of that node, if there is any, and reset the active_node to the node it points to. If there is no suffix link, we set the active_node to the root. active_edge and active_length remain unchanged. Observation 1 When the final suffix we need to insert is found to exist in the tree already, the tree itself is not changed at all (we only update the active point, active_len and remainder). Observation 2: If at some point active_length is greater or equal to the length of current edge (edge_length), we move our active point down until edge_length is not strictly greater than active_length. """ import os import platform import itertools import subprocess class Node(object): counter = itertools.count().__next__ def __init__(self, start = None, end = None, SuffixLink = None, parent = None, id = None): self.start = start self.end = end self.id = Node.counter() self.SuffixLink = SuffixLink self.parent = parent self.children = {} def __repr__(self): return "Node(id=%d, start=%d, end=%d, SuffixLink=%s)" % (self.id, self.start, self.end, self.SuffixLink) def edgeLen(self): return self.end - self.start class SuffixTree: def __init__(self, string, nodes = None): self.string = string self.currentNode = 0 self.first_str_end = None # this is the end index for the first string self.position = -1 self.nodes = nodes if nodes else [] self.root = self.newNode(-1, -1, parent=None) self.active_node = self.root self.active_edge = 0 self.active_len = 0 self.remainder = 0 self.end = len(self.string) self.needSuffixLink = 0 for i in self.string: self.__addChar(i) def newNode(self, start, end, parent = None): n = Node(start, end, parent = parent) self.nodes.append(n) return n def active_edge_c(self): return self.string[self.active_edge] def walkDown(self, node): if self.active_len >= node.edgeLen(): self.active_edge += node.edgeLen() self.active_len -= node.edgeLen() self.active_node = node return True return False def addSuffixLink(self, node): if self.needSuffixLink: self.needSuffixLink.SuffixLink = node self.needSuffixLink = node def __addChar(self, c): self.position += 1 if c == '#': self.first_str_end = self.position #increase remainder by 1 at the start of each loop self.remainder += 1 self.needSuffixLink = None while self.remainder > 0: #update active edge properly if self.active_len == 0: self.active_edge = self.position #if there is no path starting with new character create one if self.active_edge_c() not in self.active_node.children: leaf = self.newNode(self.position, self.end) self.active_node.children[self.active_edge_c()] = leaf leaf.parent = self.active_node self.addSuffixLink(self.active_node) #else change the active node, length, edge properly else: original_leaf = self.active_node.children[self.active_edge_c()] if self.walkDown(original_leaf): continue if self.string[original_leaf.start + self.active_len] == c: self.active_len += 1 self.addSuffixLink(self.active_node) break #creates an internal node and adds the new leaf and add suffix link split = self.newNode(original_leaf.start, original_leaf.start + self.active_len) self.active_node.children[self.active_edge_c()] = split split.parent = self.active_node leaf = self.newNode(self.position, self.end) split.children[c] = leaf leaf.parent = split original_leaf.start += self.active_len split.children[self.string[original_leaf.start]] = original_leaf original_leaf.parent = split self.addSuffixLink(split) self.remainder -= 1 #if active length>0 update edge, length to find new path if self.active_node == self.root and self.active_len > 0: self.active_len -= 1 self.active_edge = self.position - self.remainder + 1 #else update suffix link else: self.active_node = self.active_node.SuffixLink if self.active_node.SuffixLink else self.root #prints the dictionary of the suffix tree as a graph def printTree(self, buffer): buffer.write("digraph {\n") buffer.write("\tlabel = " + '"Figure: The ukkonen compressed tree T(text) of ' + self.string +'"\n') buffer.write("\tfontsize = 20\n") buffer.write("\tfontcolor = blue\n") buffer.write("\tlabelloc = "+ '"t"\n') buffer.write("\trankdir = LR\n") buffer.write("\tedge [arrowsize=1.1,fontsize=16]\n") buffer.write("\tnode0 [label=\"0\"," + "style=filled,fillcolor=red,shape=circle,width=.6,height=.6]\n") # write the root node in red filled circle buffer.write("//------leaves------\n") self.printLeaves(self.root, buffer) buffer.write("//------internal nodes------\n") self.printInternalNodes(self.root, buffer) buffer.write("//------edges------\n") self.printEdges(self.root, buffer) buffer.write("//------suffix links------\n") self.printSLinks(self.root, buffer) buffer.write("}\n") def edgeString(self,nodeID): if type(nodeID) == type(Node()): return "".join(self.string[nodeID.start:nodeID.end]) def label(self,nodeID): return repr("".join(self.string[nodeID.start:nodeID.end])) def printLeaves(self, nodeID, buffer): if len(nodeID.children) == 0: buffer.write("\tnode"+str(nodeID.id)+" [label=\""+str(nodeID.id)+"\",style=filled,fillcolor=lightblue,shape=circle,width=.6,height=.6]\n") # write the leaves in lightblue filled circle else: for child in nodeID.children.values(): self.printLeaves(child, buffer) def printInternalNodes(self, nodeID, buffer): if (not nodeID == self.root) and len(nodeID.children) > 0: buffer.write("\tnode"+str(nodeID.id)+ " [label=\""+str(nodeID.id)+"\",style=filled,fillcolor=green,shape=circle,width=.6," + "height=.6]\n") # write the InternalNodes in green filled circle for child in nodeID.children.values(): self.printInternalNodes(child, buffer) def printEdges(self, nodeID, buffer): for child in nodeID.children.values(): buffer.write("\tnode"+str(nodeID.id)+" -> node"+ str(child.id)+" [label=\""+self.edgeString(child)+"\",weight=5,color=blue,arrowsize=1.1]\n") # write the Edges with blue lines self.printEdges(child, buffer) def printSLinks(self, nodeID, buffer): if nodeID.SuffixLink: buffer.write("\tnode"+str(nodeID.id)+" -> node"+ str(nodeID.SuffixLink.id)+" [label=\"\",weight=5,color=red,style=dotted,arrowsize=1.1]\n") # write the Links with red dotted lines for child in nodeID.children.values(): self.printSLinks(child, buffer) def printTree_to_screen(self): print(self.label(self.root)) self.printEdges_to_screen(self.root, '') def printEdges_to_screen(self, nodeID,ident): if (not nodeID == self.root) and len(nodeID.children) > 0: ident=ident+' ' else: ident=ident+' ' for child in nodeID.children.values(): if len(nodeID.children) == 0: print(repr(self.edgeString(child))) else: print(ident+repr(self.edgeString(child))) self.printEdges_to_screen(child,' ') def Ukkonen(string): suffixtree = SuffixTree(string) suffixtree.printTree_to_screen() file="sty.dot" # dot graph file O = open(file, 'w') # open file suffixtree.printTree(O) # create ukkonen suffixtree graph file O.close() # close file if (platform.system() == "Windows"): subprocess.call("C:/Users/lefteris/Downloads/graphviz-2.38/release/bin/dot.exe -Tpdf sty.dot -o sty.pdf") subprocess.call("C:/Users/lefteris/Downloads/graphviz-2.38/release/bin/dot.exe -Tpng sty.dot -o sty.png") subprocess.call("C:/Users/lefteris/Downloads/graphviz-2.38/release/bin/dot.exe -Tjpg sty.dot -o sty.jpg") #subprocess.call("C:/Users/lefteris/Downloads/graphviz-2.38/release/bin/gvedit.exe sty.dot") subprocess.call("C:\Program Files (x86)\Adobe\Acrobat Reader DC\Reader\AcroRd32.exe sty.pdf") # open pdf file and show on screen elif (platform.system() == "Linux"): cmd = ["dot", "-Tpdf", file, "-o", os.path.splitext(file)[0]+".pdf"] # create a pdf file from sty.dot graph file subprocess.call(cmd) cmd = ["dot", "-Tpng", file, "-o", os.path.splitext(file)[0]+".png"] # create a jpg file from sty.dot graph file subprocess.call(cmd) cmd = ["dot", "-Tjpg", file, "-o", os.path.splitext(file)[0]+".jpg"] # create a postscript file from sty.dot graph file subprocess.call(cmd) cmd = ["dot", "-Tx11", file] # create a postscript file from sty.dot graph file #subprocess.call(cmd) cmd = ["gvedit", file] #subprocess.call(cmd) cmd = ["epdfview", os.path.splitext(file)[0]+".pdf"] # open pdf file and show on screen subprocess.call(cmd) """ cmd="dot -Tpdf " + file +" -o "+ os.path.splitext(file)[0]+".pdf" # create a pdf file from sty.dot graph file os.system(cmd) cmd="dot -Tpng " + file +" -o "+ os.path.splitext(file)[0]+".png" # create a png file from sty.dot graph file os.system(cmd) cmd="dot -Tjpg "+file +" -o "+ os.path.splitext(file)[0]+".jpg" # create a jpg file from sty.dot graph file os.system(cmd) cmd="dot -Tx11 "+file +" -o "+ os.path.splitext(file)[0]+".ps" # create a postscript file from sty.dot graph file os.system(cmd) #cmd="graph-easy --input " + file +" --as=ascii" # create a pdf file from sty.dot graph file #os.system(cmd) #os.system("gvedit " + file) #os.system("epdfview "+ os.path.splitext(file)[0]+".pdf") # open pdf file and show on screen """