001    /*
002     * Sonar, open source software quality management tool.
003     * Copyright (C) 2008-2012 SonarSource
004     * mailto:contact AT sonarsource DOT com
005     *
006     * Sonar is free software; you can redistribute it and/or
007     * modify it under the terms of the GNU Lesser General Public
008     * License as published by the Free Software Foundation; either
009     * version 3 of the License, or (at your option) any later version.
010     *
011     * Sonar is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014     * Lesser General Public License for more details.
015     *
016     * You should have received a copy of the GNU Lesser General Public
017     * License along with Sonar; if not, write to the Free Software
018     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
019     */
020    package org.sonar.duplications.detector.suffixtree;
021    
022    import com.google.common.base.Objects;
023    
024    /**
025     * Provides algorithm to construct suffix tree.
026     * <p>
027     * Suffix tree for the string S of length n is defined as a tree such that:
028     * <ul>
029     * <li>the paths from the root to the leaves have a one-to-one relationship with the suffixes of S,</li>
030     * <li>edges spell non-empty strings,</li>
031     * <li>and all internal nodes (except perhaps the root) have at least two children.</li>
032     * </ul>
033     * Since such a tree does not exist for all strings, S is padded with a terminal symbol not seen in the string (usually denoted $).
034     * This ensures that no suffix is a prefix of another, and that there will be n leaf nodes, one for each of the n suffixes of S.
035     * Since all internal non-root nodes are branching, there can be at most n −  1 such nodes, and n + (n − 1) + 1 = 2n nodes in total.
036     * All internal nodes and leaves have incoming edge, so number of edges equal to number of leaves plus number of inner nodes,
037     * thus at most 2n - 1.
038     * Construction takes O(n) time.
039     * </p><p>
040     * This implementation was adapted from <a href="http://illya-keeplearning.blogspot.com/search/label/suffix%20tree">Java-port</a> of
041     * <a href="http://marknelson.us/1996/08/01/suffix-trees/">Mark Nelson's C++ implementation of Ukkonen's algorithm</a>.
042     * </p>
043     */
044    public final class SuffixTree {
045    
046      final Text text;
047    
048      private final Node root;
049    
050      public static SuffixTree create(Text text) {
051        SuffixTree tree = new SuffixTree(text);
052        Suffix active = new Suffix(tree.root, 0, -1);
053        for (int i = 0; i < text.length(); i++) {
054          tree.addPrefix(active, i);
055        }
056        return tree;
057      }
058    
059      private SuffixTree(Text text) {
060        this.text = text;
061        root = new Node(this, null);
062      }
063    
064      private void addPrefix(Suffix active, int endIndex) {
065        Node lastParentNode = null;
066        Node parentNode;
067    
068        while (true) {
069          Edge edge;
070          parentNode = active.getOriginNode();
071    
072          // Step 1 is to try and find a matching edge for the given node.
073          // If a matching edge exists, we are done adding edges, so we break out of this big loop.
074          if (active.isExplicit()) {
075            edge = active.getOriginNode().findEdge(symbolAt(endIndex));
076            if (edge != null) {
077              break;
078            }
079          } else {
080            // implicit node, a little more complicated
081            edge = active.getOriginNode().findEdge(symbolAt(active.getBeginIndex()));
082            int span = active.getSpan();
083            if (Objects.equal(symbolAt(edge.getBeginIndex() + span + 1), symbolAt(endIndex))) {
084              break;
085            }
086            parentNode = edge.splitEdge(active);
087          }
088    
089          // We didn't find a matching edge, so we create a new one, add it to the tree at the parent node position,
090          // and insert it into the hash table. When we create a new node, it also means we need to create
091          // a suffix link to the new node from the last node we visited.
092          Edge newEdge = new Edge(endIndex, text.length() - 1, parentNode);
093          newEdge.insert();
094          updateSuffixNode(lastParentNode, parentNode);
095          lastParentNode = parentNode;
096    
097          // This final step is where we move to the next smaller suffix
098          if (active.getOriginNode() == root) {
099            active.incBeginIndex();
100          } else {
101            active.changeOriginNode();
102          }
103          active.canonize();
104        }
105        updateSuffixNode(lastParentNode, parentNode);
106        active.incEndIndex(); // Now the endpoint is the next active point
107        active.canonize();
108      }
109    
110      private void updateSuffixNode(Node node, Node suffixNode) {
111        if ((node != null) && (!node.equals(root))) {
112          node.setSuffixNode(suffixNode);
113        }
114      }
115    
116      public Object symbolAt(int index) {
117        return text.symbolAt(index);
118      }
119    
120      public Node getRootNode() {
121        return root;
122      }
123    
124    }