001 /*
002 * Sonar, open source software quality management tool.
003 * Copyright (C) 2008-2012 SonarSource
004 * mailto:contact AT sonarsource DOT com
005 *
006 * Sonar is free software; you can redistribute it and/or
007 * modify it under the terms of the GNU Lesser General Public
008 * License as published by the Free Software Foundation; either
009 * version 3 of the License, or (at your option) any later version.
010 *
011 * Sonar is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014 * Lesser General Public License for more details.
015 *
016 * You should have received a copy of the GNU Lesser General Public
017 * License along with Sonar; if not, write to the Free Software
018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
019 */
020 package org.sonar.duplications.detector.suffixtree;
021
022 import com.google.common.collect.Lists;
023
024 import java.util.*;
025
026 public final class Search {
027
028 private final SuffixTree tree;
029 private final TextSet text;
030 private final Collector reporter;
031
032 private final List<Integer> list = Lists.newArrayList();
033 private final List<Node> innerNodes = Lists.newArrayList();
034
035 public static void perform(TextSet text, Collector reporter) {
036 new Search(SuffixTree.create(text), text, reporter).compute();
037 }
038
039 private Search(SuffixTree tree, TextSet text, Collector reporter) {
040 this.tree = tree;
041 this.text = text;
042 this.reporter = reporter;
043 }
044
045 private void compute() {
046 // O(N)
047 dfs();
048
049 // O(N * log(N))
050 Collections.sort(innerNodes, DEPTH_COMPARATOR);
051
052 // O(N)
053 visitInnerNodes();
054 }
055
056 private static final Comparator<Node> DEPTH_COMPARATOR = new Comparator<Node>() {
057 public int compare(Node o1, Node o2) {
058 return o2.depth - o1.depth;
059 }
060 };
061
062 /**
063 * Depth-first search (DFS).
064 */
065 private void dfs() {
066 LinkedList<Node> stack = Lists.newLinkedList();
067 stack.add(tree.getRootNode());
068 while (!stack.isEmpty()) {
069 Node node = stack.removeLast();
070 node.startSize = list.size();
071 if (node.getEdges().isEmpty()) { // leaf
072 list.add(node.depth);
073 node.endSize = list.size();
074 } else {
075 if (!node.equals(tree.getRootNode())) { // inner node = not leaf and not root
076 innerNodes.add(node);
077 }
078 for (Edge edge : node.getEdges()) {
079 Node endNode = edge.getEndNode();
080 endNode.depth = node.depth + edge.getSpan() + 1;
081 stack.addLast(endNode);
082 }
083 }
084 }
085 // At this point all inner nodes are ordered by the time of entering, so we visit them from last to first
086 ListIterator<Node> iterator = innerNodes.listIterator(innerNodes.size());
087 while (iterator.hasPrevious()) {
088 Node node = iterator.previous();
089 int max = -1;
090 for (Edge edge : node.getEdges()) {
091 max = Math.max(edge.getEndNode().endSize, max);
092 }
093 node.endSize = max;
094 }
095 }
096
097 /**
098 * Each inner-node represents prefix of some suffixes, thus substring of text.
099 */
100 private void visitInnerNodes() {
101 for (Node node : innerNodes) {
102 if (containsOrigin(node)) {
103 report(node);
104 }
105 }
106 }
107
108 /**
109 * TODO Godin: in fact computations here are the same as in {@link #report(Node)},
110 * so maybe would be better to remove this duplication,
111 * however it should be noted that this check can't be done in {@link Collector#endOfGroup()},
112 * because it might lead to creation of unnecessary new objects
113 */
114 private boolean containsOrigin(Node node) {
115 for (int i = node.startSize; i < node.endSize; i++) {
116 int start = tree.text.length() - list.get(i);
117 int end = start + node.depth;
118 if (text.isInsideOrigin(end)) {
119 return true;
120 }
121 }
122 return false;
123 }
124
125 private void report(Node node) {
126 reporter.startOfGroup(node.endSize - node.startSize, node.depth);
127 for (int i = node.startSize; i < node.endSize; i++) {
128 int start = tree.text.length() - list.get(i);
129 int end = start + node.depth;
130 reporter.part(start, end);
131 }
132 reporter.endOfGroup();
133 }
134
135 public abstract static class Collector {
136
137 /**
138 * Invoked at the beginning of processing for current node.
139 * <p>
140 * Length - is a depth of node. And nodes are visited in descending order of depth,
141 * thus we guaranty that length will not increase between two sequential calls of this method
142 * (can be equal or less than previous value).
143 * </p>
144 *
145 * @param size number of parts in group
146 * @param length length of each part in group
147 */
148 abstract void startOfGroup(int size, int length);
149
150 /**
151 * Invoked as many times as leaves in the subtree, where current node is root.
152 *
153 * @param start start position in generalised text
154 * @param end end position in generalised text
155 */
156 abstract void part(int start, int end);
157
158 /**
159 * Invoked at the end of processing for current node.
160 */
161 abstract void endOfGroup();
162
163 }
164
165 }