001 /* 002 * Sonar, open source software quality management tool. 003 * Copyright (C) 2008-2011 SonarSource 004 * mailto:contact AT sonarsource DOT com 005 * 006 * Sonar is free software; you can redistribute it and/or 007 * modify it under the terms of the GNU Lesser General Public 008 * License as published by the Free Software Foundation; either 009 * version 3 of the License, or (at your option) any later version. 010 * 011 * Sonar is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014 * Lesser General Public License for more details. 015 * 016 * You should have received a copy of the GNU Lesser General Public 017 * License along with Sonar; if not, write to the Free Software 018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 019 */ 020 package org.sonar.duplications.detector.original; 021 022 import java.util.Collection; 023 import java.util.Collections; 024 import java.util.List; 025 import java.util.Map; 026 027 import org.sonar.duplications.block.Block; 028 import org.sonar.duplications.block.ByteArray; 029 import org.sonar.duplications.index.CloneGroup; 030 import org.sonar.duplications.index.CloneIndex; 031 import org.sonar.duplications.index.ClonePart; 032 033 import com.google.common.collect.Lists; 034 import com.google.common.collect.Maps; 035 036 /** 037 * Implementation of algorithm described in paper 038 * <a href="http://www4.in.tum.de/~juergens/publications/icsm2010_crc.pdf">Index-Based Code Clone Detection: Incremental, Distributed, Scalable</a> 039 * by Benjamin Hummel, Elmar Juergens, Michael Conradt and Lars Heinemann. 040 */ 041 public final class OriginalCloneDetectionAlgorithm { 042 043 /** 044 * Performs detection and returns list of clone groups between file (which represented as a collection of blocks) and index. 045 * Note that this method ignores blocks for this file, that will be retrieved from index. 046 */ 047 public static List<CloneGroup> detect(CloneIndex cloneIndex, Collection<Block> fileBlocks) { 048 if (fileBlocks.isEmpty()) { 049 return Collections.EMPTY_LIST; 050 } 051 OriginalCloneDetectionAlgorithm reporter = new OriginalCloneDetectionAlgorithm(cloneIndex); 052 reporter.findClones(fileBlocks); 053 return reporter.filter.getResult(); 054 } 055 056 private final CloneIndex cloneIndex; 057 058 private final Filter filter = new Filter(); 059 060 private String originResourceId; 061 062 private OriginalCloneDetectionAlgorithm(CloneIndex cloneIndex) { 063 this.cloneIndex = cloneIndex; 064 } 065 066 private BlocksGroup[] createGroups(Collection<Block> fileBlocks) { 067 // 2: let f be the list of tuples corresponding to filename sorted by statement index 068 // either read from the index or calculated on the fly 069 int size = fileBlocks.size(); 070 071 // Godin: create one group per unique hash 072 Map<ByteArray, BlocksGroup> groupsByHash = Maps.newHashMap(); // TODO Godin: can we create map with expected size? 073 for (Block fileBlock : fileBlocks) { 074 ByteArray hash = fileBlock.getBlockHash(); 075 BlocksGroup sameHash = groupsByHash.get(hash); 076 if (sameHash == null) { 077 sameHash = BlocksGroup.empty(); 078 groupsByHash.put(hash, sameHash); 079 } 080 sameHash.blocks.add(fileBlock); 081 } 082 083 // Godin: retrieve blocks from index 084 for (Map.Entry<ByteArray, BlocksGroup> entry : groupsByHash.entrySet()) { 085 ByteArray hash = entry.getKey(); 086 BlocksGroup group = entry.getValue(); 087 for (Block blockFromIndex : cloneIndex.getBySequenceHash(hash)) { 088 // Godin: skip blocks for this file if they come from index 089 if (!originResourceId.equals(blockFromIndex.getResourceId())) { 090 group.blocks.add(blockFromIndex); 091 } 092 } 093 Collections.sort(group.blocks, BlocksGroup.BlockComparator.INSTANCE); 094 } 095 096 // 3: let c be a list with c(0) = empty 097 BlocksGroup[] sameHashBlocksGroups = new BlocksGroup[size + 2]; 098 sameHashBlocksGroups[0] = BlocksGroup.empty(); 099 // 4: for i := 1 to length(f) do 100 for (Block fileBlock : fileBlocks) { 101 ByteArray hash = fileBlock.getBlockHash(); 102 int i = fileBlock.getIndexInFile() + 1; 103 // 5: retrieve tuples with same sequence hash as f(i) 104 // 6: store this set as c(i) 105 sameHashBlocksGroups[i] = groupsByHash.get(hash); 106 } 107 108 // Godin: allows to report clones at the end of file, because condition at line 13 would be evaluated as true 109 sameHashBlocksGroups[size + 1] = BlocksGroup.empty(); 110 111 return sameHashBlocksGroups; 112 } 113 114 private void findClones(Collection<Block> fileBlocks) { 115 originResourceId = fileBlocks.iterator().next().getResourceId(); 116 117 BlocksGroup[] sameHashBlocksGroups = createGroups(fileBlocks); 118 119 // 7: for i := 1 to length(c) do 120 for (int i = 1; i < sameHashBlocksGroups.length; i++) { 121 // In the main loop (starting from Line 7), we first check 122 // whether any new clones might start at this position. If there 123 // is only a single tuple with this hash (which has to belong 124 // to the inspected file at the current location) we skip this loop 125 // iteration. The same holds if all tuples at position i have already 126 // been present at position i − 1, as in this case any clone group 127 // found at position i would be included in a clone group starting 128 // at position i − 1. 129 130 // Although we use the subset operator in the 131 // algorithm description, this is not really a subset operation, 132 // as of course the statement index of the tuples in c(i) will be 133 // increased by 1 compared to the corresponding ones in c(i − 1) 134 // and the hash and info fields will differ. 135 136 // 8: if |c(i)| < 2 or c(i) subsumed by c(i - 1) then 137 if (sameHashBlocksGroups[i].size() < 2 || sameHashBlocksGroups[i].subsumedBy(sameHashBlocksGroups[i - 1], 1)) { 138 // 9: continue with next loop iteration 139 continue; 140 } 141 142 // The set a introduced in Line 10 is called the active set and 143 // contains all tuples corresponding to clones which have not yet 144 // been reported. At each iteration of the inner loop the set a 145 // is reduced to tuples which are also present in c(j); again the 146 // intersection operator has to account for the increased statement 147 // index and different hash and info fields. The new value is 148 // stored in a0. Clones are only reported, if tuples are lost in 149 // Line 12, as otherwise all current clones could be prolonged 150 // by one statement. Clone reporting matches tuples that, after 151 // correction of the statement index, appear in both c(i) and a; 152 // each matched pair corresponds to a single clone. Its location 153 // can be extracted from the filename and info fields. 154 155 // 10: let a := c(i) 156 BlocksGroup currentBlocksGroup = sameHashBlocksGroups[i]; 157 // 11: for j := i + 1 to length(c) do 158 for (int j = i + 1; j < sameHashBlocksGroups.length; j++) { 159 // 12: let a0 := a intersect c(j) 160 BlocksGroup intersectedBlocksGroup = currentBlocksGroup.intersect(sameHashBlocksGroups[j]); 161 162 // 13: if |a0| < |a| then 163 if (intersectedBlocksGroup.size() < currentBlocksGroup.size()) { 164 // 14: report clones from c(i) to a (see text) 165 166 // One problem of this algorithm is that clone classes with 167 // multiple instances in the same file are encountered and 168 // reported multiple times. Furthermore, when calculating the clone 169 // groups for all files in a system, clone groups will be reported 170 // more than once as well. Both cases can be avoided, by 171 // checking whether the first element of a0 (with respect to a 172 // fixed order) is equal to f(j) and only report in this case. 173 174 Block first = currentBlocksGroup.first(originResourceId); 175 if (first.getIndexInFile() == j - 2) { 176 // Godin: We report clones, which start in i-1 and end in j-2, so length is j-2-(i-1)+1=j-i 177 reportClones(sameHashBlocksGroups[i], currentBlocksGroup, j - i); 178 } 179 } 180 // 15: a := a0 181 currentBlocksGroup = intersectedBlocksGroup; 182 183 // Line 16 early exits the inner loop if either no more clones are starting 184 // from position i (i.e., a is too small), or if all tuples from a 185 // have already been in c(i − 1), corrected for statement index. 186 // In this case they have already been reported in the previous 187 // iteration of the outer loop. 188 189 // IMPORTANT Godin: note that difference in indexes between "a" and "c(i-1)" greater than one, 190 // so method subsumedBy should take this into account 191 192 // 16: if |a| < 2 or a subsumed by c(i-1) then 193 if (currentBlocksGroup.size() < 2 || currentBlocksGroup.subsumedBy(sameHashBlocksGroups[i - 1], j - i + 1)) { 194 // 17: break inner loop 195 break; 196 } 197 } 198 } 199 } 200 201 private void reportClones(BlocksGroup beginGroup, BlocksGroup endGroup, int cloneLength) { 202 List<Block[]> pairs = beginGroup.pairs(endGroup, cloneLength); 203 204 ClonePart origin = null; 205 List<ClonePart> parts = Lists.newArrayList(); 206 207 for (int i = 0; i < pairs.size(); i++) { 208 Block[] pair = pairs.get(i); 209 ClonePart part = new ClonePart(pair[0].getResourceId(), pair[0].getIndexInFile(), pair[0].getFirstLineNumber(), pair[1].getLastLineNumber()); 210 parts.add(part); 211 212 if (originResourceId.equals(part.getResourceId())) { 213 if (origin == null) { 214 origin = part; 215 } else { 216 if (part.getUnitStart() < origin.getUnitStart()) { 217 origin = part; 218 } 219 } 220 } 221 } 222 223 filter.add(new CloneGroup(cloneLength, origin, parts)); 224 } 225 226 }