/*
 * Sonar, open source software quality management tool.
 * Copyright (C) 2008-2012 SonarSource
 * mailto:contact AT sonarsource DOT com
 *
 * Sonar is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * Sonar is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Sonar; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
package org.sonar.duplications.detector.original;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.sonar.duplications.block.Block;
import org.sonar.duplications.block.ByteArray;
import org.sonar.duplications.index.CloneGroup;
import org.sonar.duplications.index.CloneIndex;
import org.sonar.duplications.index.ClonePart;

import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;

/**
 * Implementation of algorithm described in paper
 * <a href="http://www4.in.tum.de/~juergens/publications/icsm2010_crc.pdf">Index-Based Code Clone Detection: Incremental, Distributed, Scalable</a>
 * by Benjamin Hummel, Elmar Juergens, Michael Conradt and Lars Heinemann.
 */
public final class OriginalCloneDetectionAlgorithm {

  /**
   * Performs detection and returns list of clone groups between file (which represented as a collection of blocks) and index.
   * Note that this method ignores blocks for this file, that will be retrieved from index.
   *
   * @param cloneIndex index to search for blocks with matching hashes
   * @param fileBlocks blocks of the file under analysis; all must share the same resource id
   * @return detected clone groups, empty list when the file has no blocks
   */
  public static List<CloneGroup> detect(CloneIndex cloneIndex, Collection<Block> fileBlocks) {
    if (fileBlocks.isEmpty()) {
      // Typesafe empty list instead of the raw Collections.EMPTY_LIST constant,
      // which caused an unchecked assignment to List<CloneGroup>.
      return Collections.emptyList();
    }
    OriginalCloneDetectionAlgorithm reporter = new OriginalCloneDetectionAlgorithm(cloneIndex);
    reporter.findClones(fileBlocks);
    return reporter.filter.getResult();
  }

  private final CloneIndex cloneIndex;

  private final Filter filter = new Filter();

  /** Resource id of the file under analysis; set once in {@link #findClones(Collection)}. */
  private String originResourceId;

  private OriginalCloneDetectionAlgorithm(CloneIndex cloneIndex) {
    this.cloneIndex = cloneIndex;
  }

  /**
   * Builds the array c of the paper's algorithm: for each block position i in the file,
   * the sorted group of all blocks (from this file and from the index) that share the
   * hash of the file block at position i. Positions 0 and size+1 hold empty sentinel groups.
   */
  private BlocksGroup[] createGroups(Collection<Block> fileBlocks) {
    // 2: let f be the list of tuples corresponding to filename sorted by statement index
    // either read from the index or calculated on the fly
    int size = fileBlocks.size();

    // Godin: create one group per unique hash
    // Presized with the number of file blocks — an upper bound on the number of
    // distinct hashes — to avoid rehashing (resolves former TODO).
    Map<ByteArray, BlocksGroup> groupsByHash = Maps.newHashMapWithExpectedSize(size);
    for (Block fileBlock : fileBlocks) {
      ByteArray hash = fileBlock.getBlockHash();
      BlocksGroup sameHash = groupsByHash.get(hash);
      if (sameHash == null) {
        sameHash = BlocksGroup.empty();
        groupsByHash.put(hash, sameHash);
      }
      sameHash.blocks.add(fileBlock);
    }

    // Godin: retrieve blocks from index
    for (Map.Entry<ByteArray, BlocksGroup> entry : groupsByHash.entrySet()) {
      ByteArray hash = entry.getKey();
      BlocksGroup group = entry.getValue();
      for (Block blockFromIndex : cloneIndex.getBySequenceHash(hash)) {
        // Godin: skip blocks for this file if they come from index
        if (!originResourceId.equals(blockFromIndex.getResourceId())) {
          group.blocks.add(blockFromIndex);
        }
      }
      Collections.sort(group.blocks, BlocksGroup.BlockComparator.INSTANCE);
    }

    // 3: let c be a list with c(0) = empty
    BlocksGroup[] sameHashBlocksGroups = new BlocksGroup[size + 2];
    sameHashBlocksGroups[0] = BlocksGroup.empty();
    // 4: for i := 1 to length(f) do
    for (Block fileBlock : fileBlocks) {
      ByteArray hash = fileBlock.getBlockHash();
      int i = fileBlock.getIndexInFile() + 1;
      // 5: retrieve tuples with same sequence hash as f(i)
      // 6: store this set as c(i)
      sameHashBlocksGroups[i] = groupsByHash.get(hash);
    }

    // Godin: allows to report clones at the end of file, because condition at line 13 would be evaluated as true
    sameHashBlocksGroups[size + 1] = BlocksGroup.empty();

    return sameHashBlocksGroups;
  }

  /**
   * Main loop of the paper's algorithm (lines 7-17): walks the groups produced by
   * {@link #createGroups(Collection)} and reports maximal clone groups via
   * {@link #reportClones(BlocksGroup, BlocksGroup, int)}.
   */
  private void findClones(Collection<Block> fileBlocks) {
    // All file blocks share one resource id — take it from the first block.
    originResourceId = fileBlocks.iterator().next().getResourceId();

    BlocksGroup[] sameHashBlocksGroups = createGroups(fileBlocks);

    // 7: for i := 1 to length(c) do
    for (int i = 1; i < sameHashBlocksGroups.length; i++) {
      // In the main loop (starting from Line 7), we first check
      // whether any new clones might start at this position. If there
      // is only a single tuple with this hash (which has to belong
      // to the inspected file at the current location) we skip this loop
      // iteration. The same holds if all tuples at position i have already
      // been present at position i − 1, as in this case any clone group
      // found at position i would be included in a clone group starting
      // at position i − 1.

      // Although we use the subset operator in the
      // algorithm description, this is not really a subset operation,
      // as of course the statement index of the tuples in c(i) will be
      // increased by 1 compared to the corresponding ones in c(i − 1)
      // and the hash and info fields will differ.

      // 8: if |c(i)| < 2 or c(i) subsumed by c(i - 1) then
      if (sameHashBlocksGroups[i].size() < 2 || sameHashBlocksGroups[i].subsumedBy(sameHashBlocksGroups[i - 1], 1)) {
        // 9: continue with next loop iteration
        continue;
      }

      // The set a introduced in Line 10 is called the active set and
      // contains all tuples corresponding to clones which have not yet
      // been reported. At each iteration of the inner loop the set a
      // is reduced to tuples which are also present in c(j); again the
      // intersection operator has to account for the increased statement
      // index and different hash and info fields. The new value is
      // stored in a0. Clones are only reported, if tuples are lost in
      // Line 12, as otherwise all current clones could be prolonged
      // by one statement. Clone reporting matches tuples that, after
      // correction of the statement index, appear in both c(i) and a,
      // each matched pair corresponds to a single clone. Its location
      // can be extracted from the filename and info fields.

      // 10: let a := c(i)
      BlocksGroup currentBlocksGroup = sameHashBlocksGroups[i];
      // 11: for j := i + 1 to length(c) do
      for (int j = i + 1; j < sameHashBlocksGroups.length; j++) {
        // 12: let a0 := a intersect c(j)
        BlocksGroup intersectedBlocksGroup = currentBlocksGroup.intersect(sameHashBlocksGroups[j]);

        // 13: if |a0| < |a| then
        if (intersectedBlocksGroup.size() < currentBlocksGroup.size()) {
          // 14: report clones from c(i) to a (see text)

          // One problem of this algorithm is that clone classes with
          // multiple instances in the same file are encountered and
          // reported multiple times. Furthermore, when calculating the clone
          // groups for all files in a system, clone groups will be reported
          // more than once as well. Both cases can be avoided, by
          // checking whether the first element of a0 (with respect to a
          // fixed order) is equal to f(j) and only report in this case.

          Block first = currentBlocksGroup.first(originResourceId);
          if (first.getIndexInFile() == j - 2) {
            // Godin: We report clones, which start in i-1 and end in j-2, so length is j-2-(i-1)+1=j-i
            reportClones(sameHashBlocksGroups[i], currentBlocksGroup, j - i);
          }
        }
        // 15: a := a0
        currentBlocksGroup = intersectedBlocksGroup;

        // Line 16 early exits the inner loop if either no more clones are starting
        // from position i (i.e., a is too small), or if all tuples from a
        // have already been in c(i − 1), corrected for statement index.
        // In this case they have already been reported in the previous
        // iteration of the outer loop.

        // IMPORTANT Godin: note that difference in indexes between "a" and "c(i-1)" greater than one,
        // so method subsumedBy should take this into account

        // 16: if |a| < 2 or a subsumed by c(i-1) then
        if (currentBlocksGroup.size() < 2 || currentBlocksGroup.subsumedBy(sameHashBlocksGroups[i - 1], j - i + 1)) {
          // 17: break inner loop
          break;
        }
      }
    }
  }

  /**
   * Converts matched block pairs into {@link ClonePart}s and registers the resulting
   * {@link CloneGroup} with the filter. The part belonging to the origin file with the
   * smallest unit start is chosen as the group's origin.
   *
   * @param beginGroup  group of blocks where the clone starts
   * @param endGroup    group of blocks where the clone ends
   * @param cloneLength length of the clone in blocks
   */
  private void reportClones(BlocksGroup beginGroup, BlocksGroup endGroup, int cloneLength) {
    List<Block[]> pairs = beginGroup.pairs(endGroup, cloneLength);

    ClonePart origin = null;
    List<ClonePart> parts = Lists.newArrayList();

    for (int i = 0; i < pairs.size(); i++) {
      Block[] pair = pairs.get(i);
      Block firstBlock = pair[0];
      Block lastBlock = pair[1];
      ClonePart part = new ClonePart(firstBlock.getResourceId(),
          firstBlock.getIndexInFile(),
          firstBlock.getStartLine(),
          lastBlock.getEndLine());

      if (originResourceId.equals(part.getResourceId())) {
        // Keep the earliest part (by unit start) from the origin file as the group's origin.
        if (origin == null || part.getUnitStart() < origin.getUnitStart()) {
          origin = part;
        }
      }

      parts.add(part);
    }

    filter.add(CloneGroup.builder().setLength(cloneLength).setOrigin(origin).setParts(parts).build());
  }

}