001/* 002 * Sonar, open source software quality management tool. 003 * Copyright (C) 2008-2012 SonarSource 004 * mailto:contact AT sonarsource DOT com 005 * 006 * Sonar is free software; you can redistribute it and/or 007 * modify it under the terms of the GNU Lesser General Public 008 * License as published by the Free Software Foundation; either 009 * version 3 of the License, or (at your option) any later version. 010 * 011 * Sonar is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014 * Lesser General Public License for more details. 015 * 016 * You should have received a copy of the GNU Lesser General Public 017 * License along with Sonar; if not, write to the Free Software 018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 019 */ 020package org.sonar.duplications.detector.suffixtree; 021 022import com.google.common.base.Objects; 023 024/** 025 * Provides algorithm to construct suffix tree. 026 * <p> 027 * Suffix tree for the string S of length n is defined as a tree such that: 028 * <ul> 029 * <li>the paths from the root to the leaves have a one-to-one relationship with the suffixes of S,</li> 030 * <li>edges spell non-empty strings,</li> 031 * <li>and all internal nodes (except perhaps the root) have at least two children.</li> 032 * </ul> 033 * Since such a tree does not exist for all strings, S is padded with a terminal symbol not seen in the string (usually denoted $). 034 * This ensures that no suffix is a prefix of another, and that there will be n leaf nodes, one for each of the n suffixes of S. 035 * Since all internal non-root nodes are branching, there can be at most n − 1 such nodes, and n + (n − 1) + 1 = 2n nodes in total. 036 * All internal nodes and leaves have incoming edge, so number of edges equal to number of leaves plus number of inner nodes, 037 * thus at most 2n - 1. 038 * Construction takes O(n) time. 039 * </p><p> 040 * This implementation was adapted from <a href="http://illya-keeplearning.blogspot.com/search/label/suffix%20tree">Java-port</a> of 041 * <a href="http://marknelson.us/1996/08/01/suffix-trees/">Mark Nelson's C++ implementation of Ukkonen's algorithm</a>. 042 * </p> 043 */ 044public final class SuffixTree { 045 046 final Text text; 047 048 private final Node root; 049 050 public static SuffixTree create(Text text) { 051 SuffixTree tree = new SuffixTree(text); 052 Suffix active = new Suffix(tree.root, 0, -1); 053 for (int i = 0; i < text.length(); i++) { 054 tree.addPrefix(active, i); 055 } 056 return tree; 057 } 058 059 private SuffixTree(Text text) { 060 this.text = text; 061 root = new Node(this, null); 062 } 063 064 private void addPrefix(Suffix active, int endIndex) { 065 Node lastParentNode = null; 066 Node parentNode; 067 068 while (true) { 069 Edge edge; 070 parentNode = active.getOriginNode(); 071 072 // Step 1 is to try and find a matching edge for the given node. 073 // If a matching edge exists, we are done adding edges, so we break out of this big loop. 074 if (active.isExplicit()) { 075 edge = active.getOriginNode().findEdge(symbolAt(endIndex)); 076 if (edge != null) { 077 break; 078 } 079 } else { 080 // implicit node, a little more complicated 081 edge = active.getOriginNode().findEdge(symbolAt(active.getBeginIndex())); 082 int span = active.getSpan(); 083 if (Objects.equal(symbolAt(edge.getBeginIndex() + span + 1), symbolAt(endIndex))) { 084 break; 085 } 086 parentNode = edge.splitEdge(active); 087 } 088 089 // We didn't find a matching edge, so we create a new one, add it to the tree at the parent node position, 090 // and insert it into the hash table. When we create a new node, it also means we need to create 091 // a suffix link to the new node from the last node we visited. 092 Edge newEdge = new Edge(endIndex, text.length() - 1, parentNode); 093 newEdge.insert(); 094 updateSuffixNode(lastParentNode, parentNode); 095 lastParentNode = parentNode; 096 097 // This final step is where we move to the next smaller suffix 098 if (active.getOriginNode() == root) { 099 active.incBeginIndex(); 100 } else { 101 active.changeOriginNode(); 102 } 103 active.canonize(); 104 } 105 updateSuffixNode(lastParentNode, parentNode); 106 active.incEndIndex(); // Now the endpoint is the next active point 107 active.canonize(); 108 } 109 110 private void updateSuffixNode(Node node, Node suffixNode) { 111 if ((node != null) && (!node.equals(root))) { 112 node.setSuffixNode(suffixNode); 113 } 114 } 115 116 public Object symbolAt(int index) { 117 return text.symbolAt(index); 118 } 119 120 public Node getRootNode() { 121 return root; 122 } 123 124}