001 /*
002 * Sonar, open source software quality management tool.
003 * Copyright (C) 2008-2012 SonarSource
004 * mailto:contact AT sonarsource DOT com
005 *
006 * Sonar is free software; you can redistribute it and/or
007 * modify it under the terms of the GNU Lesser General Public
008 * License as published by the Free Software Foundation; either
009 * version 3 of the License, or (at your option) any later version.
010 *
011 * Sonar is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014 * Lesser General Public License for more details.
015 *
016 * You should have received a copy of the GNU Lesser General Public
017 * License along with Sonar; if not, write to the Free Software
018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
019 */
020 package org.sonar.duplications.detector.original;
021
022 import com.google.common.collect.Lists;
023 import com.google.common.collect.Maps;
024 import org.sonar.duplications.block.Block;
025 import org.sonar.duplications.block.ByteArray;
026 import org.sonar.duplications.index.CloneGroup;
027 import org.sonar.duplications.index.CloneIndex;
028 import org.sonar.duplications.index.ClonePart;
029
030 import java.util.Collection;
031 import java.util.Collections;
032 import java.util.List;
033 import java.util.Map;
034
035 /**
036 * Implementation of algorithm described in paper
037 * <a href="http://www4.in.tum.de/~juergens/publications/icsm2010_crc.pdf">Index-Based Code Clone Detection: Incremental, Distributed, Scalable</a>
038 * by Benjamin Hummel, Elmar Juergens, Michael Conradt and Lars Heinemann.
039 */
040 public final class OriginalCloneDetectionAlgorithm {
041
042 /**
043 * Performs detection and returns list of clone groups between file (which represented as a collection of blocks) and index.
044 * Note that this method ignores blocks for this file, that will be retrieved from index.
045 */
046 public static List<CloneGroup> detect(CloneIndex cloneIndex, Collection<Block> fileBlocks) {
047 if (fileBlocks.isEmpty()) {
048 return Collections.EMPTY_LIST;
049 }
050 OriginalCloneDetectionAlgorithm reporter = new OriginalCloneDetectionAlgorithm(cloneIndex);
051 reporter.findClones(fileBlocks);
052 return reporter.filter.getResult();
053 }
054
055 private final CloneIndex cloneIndex;
056
057 private final Filter filter = new Filter();
058
059 private String originResourceId;
060
061 private OriginalCloneDetectionAlgorithm(CloneIndex cloneIndex) {
062 this.cloneIndex = cloneIndex;
063 }
064
065 private BlocksGroup[] createGroups(Collection<Block> fileBlocks) {
066 // 2: let f be the list of tuples corresponding to filename sorted by statement index
067 // either read from the index or calculated on the fly
068 int size = fileBlocks.size();
069
070 // Godin: create one group per unique hash
071 Map<ByteArray, BlocksGroup> groupsByHash = Maps.newHashMap(); // TODO Godin: can we create map with expected size?
072 for (Block fileBlock : fileBlocks) {
073 ByteArray hash = fileBlock.getBlockHash();
074 BlocksGroup sameHash = groupsByHash.get(hash);
075 if (sameHash == null) {
076 sameHash = BlocksGroup.empty();
077 groupsByHash.put(hash, sameHash);
078 }
079 sameHash.blocks.add(fileBlock);
080 }
081
082 // Godin: retrieve blocks from index
083 for (Map.Entry<ByteArray, BlocksGroup> entry : groupsByHash.entrySet()) {
084 ByteArray hash = entry.getKey();
085 BlocksGroup group = entry.getValue();
086 for (Block blockFromIndex : cloneIndex.getBySequenceHash(hash)) {
087 // Godin: skip blocks for this file if they come from index
088 if (!originResourceId.equals(blockFromIndex.getResourceId())) {
089 group.blocks.add(blockFromIndex);
090 }
091 }
092 Collections.sort(group.blocks, BlocksGroup.BlockComparator.INSTANCE);
093 }
094
095 // 3: let c be a list with c(0) = empty
096 BlocksGroup[] sameHashBlocksGroups = new BlocksGroup[size + 2];
097 sameHashBlocksGroups[0] = BlocksGroup.empty();
098 // 4: for i := 1 to length(f) do
099 for (Block fileBlock : fileBlocks) {
100 ByteArray hash = fileBlock.getBlockHash();
101 int i = fileBlock.getIndexInFile() + 1;
102 // 5: retrieve tuples with same sequence hash as f(i)
103 // 6: store this set as c(i)
104 sameHashBlocksGroups[i] = groupsByHash.get(hash);
105 }
106
107 // Godin: allows to report clones at the end of file, because condition at line 13 would be evaluated as true
108 sameHashBlocksGroups[size + 1] = BlocksGroup.empty();
109
110 return sameHashBlocksGroups;
111 }
112
113 private void findClones(Collection<Block> fileBlocks) {
114 originResourceId = fileBlocks.iterator().next().getResourceId();
115
116 BlocksGroup[] sameHashBlocksGroups = createGroups(fileBlocks);
117
118 // 7: for i := 1 to length(c) do
119 for (int i = 1; i < sameHashBlocksGroups.length; i++) {
120 // In the main loop (starting from Line 7), we first check
121 // whether any new clones might start at this position. If there
122 // is only a single tuple with this hash (which has to belong
123 // to the inspected file at the current location) we skip this loop
124 // iteration. The same holds if all tuples at position i have already
125 // been present at position i − 1, as in this case any clone group
126 // found at position i would be included in a clone group starting
127 // at position i − 1.
128
129 // Although we use the subset operator in the
130 // algorithm description, this is not really a subset operation,
131 // as of course the statement index of the tuples in c(i) will be
132 // increased by 1 compared to the corresponding ones in c(i − 1)
133 // and the hash and info fields will differ.
134
135 // 8: if |c(i)| < 2 or c(i) subsumed by c(i - 1) then
136 if (sameHashBlocksGroups[i].size() < 2 || sameHashBlocksGroups[i].subsumedBy(sameHashBlocksGroups[i - 1], 1)) {
137 // 9: continue with next loop iteration
138 continue;
139 }
140
141 // The set a introduced in Line 10 is called the active set and
142 // contains all tuples corresponding to clones which have not yet
143 // been reported. At each iteration of the inner loop the set a
144 // is reduced to tuples which are also present in c(j); again the
145 // intersection operator has to account for the increased statement
146 // index and different hash and info fields. The new value is
147 // stored in a0. Clones are only reported, if tuples are lost in
148 // Line 12, as otherwise all current clones could be prolonged
149 // by one statement. Clone reporting matches tuples that, after
150 // correction of the statement index, appear in both c(i) and a,
151 // each matched pair corresponds to a single clone. Its location
152 // can be extracted from the filename and info fields.
153
154 // 10: let a := c(i)
155 BlocksGroup currentBlocksGroup = sameHashBlocksGroups[i];
156 // 11: for j := i + 1 to length(c) do
157 for (int j = i + 1; j < sameHashBlocksGroups.length; j++) {
158 // 12: let a0 := a intersect c(j)
159 BlocksGroup intersectedBlocksGroup = currentBlocksGroup.intersect(sameHashBlocksGroups[j]);
160
161 // 13: if |a0| < |a| then
162 if (intersectedBlocksGroup.size() < currentBlocksGroup.size()) {
163 // 14: report clones from c(i) to a (see text)
164
165 // One problem of this algorithm is that clone classes with
166 // multiple instances in the same file are encountered and
167 // reported multiple times. Furthermore, when calculating the clone
168 // groups for all files in a system, clone groups will be reported
169 // more than once as well. Both cases can be avoided, by
170 // checking whether the first element of a0 (with respect to a
171 // fixed order) is equal to f(j) and only report in this case.
172
173 Block first = currentBlocksGroup.first(originResourceId);
174 if (first.getIndexInFile() == j - 2) {
175 // Godin: We report clones, which start in i-1 and end in j-2, so length is j-2-(i-1)+1=j-i
176 reportClones(sameHashBlocksGroups[i], currentBlocksGroup, j - i);
177 }
178 }
179 // 15: a := a0
180 currentBlocksGroup = intersectedBlocksGroup;
181
182 // Line 16 early exits the inner loop if either no more clones are starting
183 // from position i (i.e., a is too small), or if all tuples from a
184 // have already been in c(i − 1), corrected for statement index.
185 // In this case they have already been reported in the previous
186 // iteration of the outer loop.
187
188 // IMPORTANT Godin: note that difference in indexes between "a" and "c(i-1)" greater than one,
189 // so method subsumedBy should take this into account
190
191 // 16: if |a| < 2 or a subsumed by c(i-1) then
192 if (currentBlocksGroup.size() < 2 || currentBlocksGroup.subsumedBy(sameHashBlocksGroups[i - 1], j - i + 1)) {
193 // 17: break inner loop
194 break;
195 }
196 }
197 }
198 }
199
200 private void reportClones(BlocksGroup beginGroup, BlocksGroup endGroup, int cloneLength) {
201 List<Block[]> pairs = beginGroup.pairs(endGroup, cloneLength);
202
203 ClonePart origin = null;
204 List<ClonePart> parts = Lists.newArrayList();
205
206 for (int i = 0; i < pairs.size(); i++) {
207 Block[] pair = pairs.get(i);
208 Block firstBlock = pair[0];
209 Block lastBlock = pair[1];
210 ClonePart part = new ClonePart(firstBlock.getResourceId(),
211 firstBlock.getIndexInFile(),
212 firstBlock.getStartLine(),
213 lastBlock.getEndLine());
214
215 if (originResourceId.equals(part.getResourceId())) {
216 if (origin == null) {
217 origin = part;
218 } else if (part.getUnitStart() < origin.getUnitStart()) {
219 origin = part;
220 }
221 }
222
223 parts.add(part);
224 }
225
226 filter.add(CloneGroup.builder().setLength(cloneLength).setOrigin(origin).setParts(parts).build());
227 }
228
229 }