001    /*
002     * Sonar, open source software quality management tool.
003     * Copyright (C) 2008-2011 SonarSource
004     * mailto:contact AT sonarsource DOT com
005     *
006     * Sonar is free software; you can redistribute it and/or
007     * modify it under the terms of the GNU Lesser General Public
008     * License as published by the Free Software Foundation; either
009     * version 3 of the License, or (at your option) any later version.
010     *
011     * Sonar is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014     * Lesser General Public License for more details.
015     *
016     * You should have received a copy of the GNU Lesser General Public
017     * License along with Sonar; if not, write to the Free Software
018     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
019     */
020    package org.sonar.duplications.detector.original;
021    
022    import java.util.Collection;
023    import java.util.Collections;
024    import java.util.List;
025    import java.util.Map;
026    
027    import org.sonar.duplications.block.Block;
028    import org.sonar.duplications.block.ByteArray;
029    import org.sonar.duplications.index.CloneGroup;
030    import org.sonar.duplications.index.CloneIndex;
031    import org.sonar.duplications.index.ClonePart;
032    
033    import com.google.common.collect.Lists;
034    import com.google.common.collect.Maps;
035    
036    /**
037     * Implementation of algorithm described in paper
038     * <a href="http://www4.in.tum.de/~juergens/publications/icsm2010_crc.pdf">Index-Based Code Clone Detection: Incremental, Distributed, Scalable</a>
039     * by Benjamin Hummel, Elmar Juergens, Michael Conradt and Lars Heinemann.
040     */
041    public final class OriginalCloneDetectionAlgorithm {
042    
043      /**
044       * Performs detection and returns list of clone groups between file (which represented as a collection of blocks) and index.
045       * Note that this method ignores blocks for this file, that will be retrieved from index.
046       */
047      public static List<CloneGroup> detect(CloneIndex cloneIndex, Collection<Block> fileBlocks) {
048        if (fileBlocks.isEmpty()) {
049          return Collections.EMPTY_LIST;
050        }
051        OriginalCloneDetectionAlgorithm reporter = new OriginalCloneDetectionAlgorithm(cloneIndex);
052        reporter.findClones(fileBlocks);
053        return reporter.filter.getResult();
054      }
055    
056      private final CloneIndex cloneIndex;
057    
058      private final Filter filter = new Filter();
059    
060      private String originResourceId;
061    
062      private OriginalCloneDetectionAlgorithm(CloneIndex cloneIndex) {
063        this.cloneIndex = cloneIndex;
064      }
065    
066      private BlocksGroup[] createGroups(Collection<Block> fileBlocks) {
067        // 2: let f be the list of tuples corresponding to filename sorted by statement index
068        // either read from the index or calculated on the fly
069        int size = fileBlocks.size();
070    
071        // Godin: create one group per unique hash
072        Map<ByteArray, BlocksGroup> groupsByHash = Maps.newHashMap(); // TODO Godin: can we create map with expected size?
073        for (Block fileBlock : fileBlocks) {
074          ByteArray hash = fileBlock.getBlockHash();
075          BlocksGroup sameHash = groupsByHash.get(hash);
076          if (sameHash == null) {
077            sameHash = BlocksGroup.empty();
078            groupsByHash.put(hash, sameHash);
079          }
080          sameHash.blocks.add(fileBlock);
081        }
082    
083        // Godin: retrieve blocks from index
084        for (Map.Entry<ByteArray, BlocksGroup> entry : groupsByHash.entrySet()) {
085          ByteArray hash = entry.getKey();
086          BlocksGroup group = entry.getValue();
087          for (Block blockFromIndex : cloneIndex.getBySequenceHash(hash)) {
088            // Godin: skip blocks for this file if they come from index
089            if (!originResourceId.equals(blockFromIndex.getResourceId())) {
090              group.blocks.add(blockFromIndex);
091            }
092          }
093          Collections.sort(group.blocks, BlocksGroup.BlockComparator.INSTANCE);
094        }
095    
096        // 3: let c be a list with c(0) = empty
097        BlocksGroup[] sameHashBlocksGroups = new BlocksGroup[size + 2];
098        sameHashBlocksGroups[0] = BlocksGroup.empty();
099        // 4: for i := 1 to length(f) do
100        for (Block fileBlock : fileBlocks) {
101          ByteArray hash = fileBlock.getBlockHash();
102          int i = fileBlock.getIndexInFile() + 1;
103          // 5: retrieve tuples with same sequence hash as f(i)
104          // 6: store this set as c(i)
105          sameHashBlocksGroups[i] = groupsByHash.get(hash);
106        }
107    
108        // Godin: allows to report clones at the end of file, because condition at line 13 would be evaluated as true
109        sameHashBlocksGroups[size + 1] = BlocksGroup.empty();
110    
111        return sameHashBlocksGroups;
112      }
113    
114      private void findClones(Collection<Block> fileBlocks) {
115        originResourceId = fileBlocks.iterator().next().getResourceId();
116    
117        BlocksGroup[] sameHashBlocksGroups = createGroups(fileBlocks);
118    
119        // 7: for i := 1 to length(c) do
120        for (int i = 1; i < sameHashBlocksGroups.length; i++) {
121          // In the main loop (starting from Line 7), we first check
122          // whether any new clones might start at this position. If there
123          // is only a single tuple with this hash (which has to belong
124          // to the inspected file at the current location) we skip this loop
125          // iteration. The same holds if all tuples at position i have already
126          // been present at position i − 1, as in this case any clone group
127          // found at position i would be included in a clone group starting
128          // at position i − 1.
129    
130          // Although we use the subset operator in the
131          // algorithm description, this is not really a subset operation,
132          // as of course the statement index of the tuples in c(i) will be
133          // increased by 1 compared to the corresponding ones in c(i − 1)
134          // and the hash and info fields will differ.
135    
136          // 8: if |c(i)| < 2 or c(i) subsumed by c(i - 1) then
137          if (sameHashBlocksGroups[i].size() < 2 || sameHashBlocksGroups[i].subsumedBy(sameHashBlocksGroups[i - 1], 1)) {
138            // 9: continue with next loop iteration
139            continue;
140          }
141    
142          // The set a introduced in Line 10 is called the active set and
143          // contains all tuples corresponding to clones which have not yet
144          // been reported. At each iteration of the inner loop the set a
145          // is reduced to tuples which are also present in c(j); again the
146          // intersection operator has to account for the increased statement
147          // index and different hash and info fields. The new value is
148          // stored in a0. Clones are only reported, if tuples are lost in
149          // Line 12, as otherwise all current clones could be prolonged
150          // by one statement. Clone reporting matches tuples that, after
151          // correction of the statement index, appear in both c(i) and a;
152          // each matched pair corresponds to a single clone. Its location
153          // can be extracted from the filename and info fields.
154    
155          // 10: let a := c(i)
156          BlocksGroup currentBlocksGroup = sameHashBlocksGroups[i];
157          // 11: for j := i + 1 to length(c) do
158          for (int j = i + 1; j < sameHashBlocksGroups.length; j++) {
159            // 12: let a0 := a intersect c(j)
160            BlocksGroup intersectedBlocksGroup = currentBlocksGroup.intersect(sameHashBlocksGroups[j]);
161    
162            // 13: if |a0| < |a| then
163            if (intersectedBlocksGroup.size() < currentBlocksGroup.size()) {
164              // 14: report clones from c(i) to a (see text)
165    
166              // One problem of this algorithm is that clone classes with
167              // multiple instances in the same file are encountered and
168              // reported multiple times. Furthermore, when calculating the clone
169              // groups for all files in a system, clone groups will be reported
170              // more than once as well. Both cases can be avoided, by
171              // checking whether the first element of a0 (with respect to a
172              // fixed order) is equal to f(j) and only report in this case.
173    
174              Block first = currentBlocksGroup.first(originResourceId);
175              if (first.getIndexInFile() == j - 2) {
176                // Godin: We report clones, which start in i-1 and end in j-2, so length is j-2-(i-1)+1=j-i
177                reportClones(sameHashBlocksGroups[i], currentBlocksGroup, j - i);
178              }
179            }
180            // 15: a := a0
181            currentBlocksGroup = intersectedBlocksGroup;
182    
183            // Line 16 early exits the inner loop if either no more clones are starting
184            // from position i (i.e., a is too small), or if all tuples from a
185            // have already been in c(i − 1), corrected for statement index.
186            // In this case they have already been reported in the previous
187            // iteration of the outer loop.
188    
189            // IMPORTANT Godin: note that difference in indexes between "a" and "c(i-1)" greater than one,
190            // so method subsumedBy should take this into account
191    
192            // 16: if |a| < 2 or a subsumed by c(i-1) then
193            if (currentBlocksGroup.size() < 2 || currentBlocksGroup.subsumedBy(sameHashBlocksGroups[i - 1], j - i + 1)) {
194              // 17: break inner loop
195              break;
196            }
197          }
198        }
199      }
200    
201      private void reportClones(BlocksGroup beginGroup, BlocksGroup endGroup, int cloneLength) {
202        List<Block[]> pairs = beginGroup.pairs(endGroup, cloneLength);
203    
204        ClonePart origin = null;
205        List<ClonePart> parts = Lists.newArrayList();
206    
207        for (int i = 0; i < pairs.size(); i++) {
208          Block[] pair = pairs.get(i);
209          Block firstBlock = pair[0];
210          Block lastBlock = pair[1];
211          ClonePart part = new ClonePart(firstBlock.getResourceId(),
212              firstBlock.getIndexInFile(),
213              firstBlock.getFirstLineNumber(),
214              lastBlock.getLastLineNumber());
215    
216          if (originResourceId.equals(part.getResourceId())) {
217            if (origin == null) {
218              origin = part;
219            } else if (part.getUnitStart() < origin.getUnitStart()) {
220              origin = part;
221            }
222          }
223    
224          parts.add(part);
225        }
226    
227        filter.add(new CloneGroup(cloneLength, origin, parts));
228      }
229    
230    }