001 /* 002 * Sonar, open source software quality management tool. 003 * Copyright (C) 2008-2012 SonarSource 004 * mailto:contact AT sonarsource DOT com 005 * 006 * Sonar is free software; you can redistribute it and/or 007 * modify it under the terms of the GNU Lesser General Public 008 * License as published by the Free Software Foundation; either 009 * version 3 of the License, or (at your option) any later version. 010 * 011 * Sonar is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014 * Lesser General Public License for more details. 015 * 016 * You should have received a copy of the GNU Lesser General Public 017 * License along with Sonar; if not, write to the Free Software 018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 019 */ 020 021 /** 022 * BSD-style license; for more info see http://pmd.sourceforge.net/license.html 023 */ 024 package org.sonar.duplications.cpd; 025 026 import net.sourceforge.pmd.cpd.TokenEntry; 027 028 import java.util.*; 029 030 public class MatchCollector { 031 032 private MatchAlgorithm ma; 033 private Map<Match.MatchCode, Match> startMap = new HashMap<Match.MatchCode, Match>(); 034 private Map<String, List<Match>> fileMap = new HashMap<String, List<Match>>(); 035 036 public MatchCollector(MatchAlgorithm ma) { 037 this.ma = ma; 038 } 039 040 public void collect(List<TokenEntry> marks) { 041 // first get a pairwise collection of all maximal matches 042 for (int i = 0; i < marks.size() - 1; i++) { 043 TokenEntry mark1 = marks.get(i); 044 for (int j = i + 1; j < marks.size(); j++) { 045 TokenEntry mark2 = marks.get(j); 046 int diff = mark1.getIndex() - mark2.getIndex(); 047 if (-diff < ma.getMinimumTileSize()) { 048 continue; 049 } 050 if (hasPreviousDupe(mark1, mark2)) { 051 continue; 052 } 053 054 // "match too small" check 055 int dupes = countDuplicateTokens(mark1, mark2); 056 if (dupes < ma.getMinimumTileSize()) { 057 continue; 058 } 059 // is it still too close together 060 if (diff + dupes >= 1) { 061 continue; 062 } 063 determineMatch(mark1, mark2, dupes); 064 } 065 } 066 } 067 068 @SuppressWarnings("PMD.CompareObjectsWithEquals") 069 public List<Match> getMatches() { 070 List<Match> matchList = new ArrayList<Match>(startMap.values()); 071 Collections.sort(matchList); 072 Set<Match.MatchCode> matchSet = new HashSet<Match.MatchCode>(); 073 Match.MatchCode matchCode = new Match.MatchCode(); 074 for (int i = matchList.size(); i > 1; i--) { 075 Match match1 = matchList.get(i - 1); 076 TokenEntry mark1 = match1.getMarkSet().iterator().next(); 077 matchSet.clear(); 078 matchSet.add(match1.getMatchCode()); 079 for (int j = i - 1; j > 0; j--) { 080 Match match2 = matchList.get(j - 1); 081 if (match1.getTokenCount() != match2.getTokenCount()) { 082 break; 083 } 084 TokenEntry mark2 = null; 085 for (Iterator<TokenEntry> iter = match2.getMarkSet().iterator(); iter.hasNext();) { 086 mark2 = iter.next(); 087 if (mark2 != mark1) { 088 break; 089 } 090 } 091 int dupes = countDuplicateTokens(mark1, mark2); 092 if (dupes < match1.getTokenCount()) { 093 break; 094 } 095 matchSet.add(match2.getMatchCode()); 096 match1.getMarkSet().addAll(match2.getMarkSet()); 097 matchList.remove(i - 2); 098 i--; 099 } 100 if (matchSet.size() == 1) { 101 continue; 102 } 103 // prune the mark set 104 Set<TokenEntry> pruned = match1.getMarkSet(); 105 boolean done = false; 106 ArrayList<TokenEntry> a1 = new ArrayList<TokenEntry>(match1.getMarkSet()); 107 Collections.sort(a1); 108 for (int outer = 0; outer < a1.size() - 1 && !done; outer++) { 109 TokenEntry cmark1 = a1.get(outer); 110 for (int inner = outer + 1; inner < a1.size() && !done; inner++) { 111 TokenEntry cmark2 = a1.get(inner); 112 matchCode.setFirst(cmark1.getIndex()); 113 matchCode.setSecond(cmark2.getIndex()); 114 if (!matchSet.contains(matchCode)) { 115 if (pruned.size() > 2) { 116 pruned.remove(cmark2); 117 } 118 if (pruned.size() == 2) { 119 done = true; 120 } 121 } 122 } 123 } 124 } 125 return matchList; 126 } 127 128 /** 129 * A greedy algorithm for determining non-overlapping matches 130 */ 131 private void determineMatch(TokenEntry mark1, TokenEntry mark2, int dupes) { 132 Match match = new Match(dupes, mark1, mark2); 133 String fileKey = mark1.getTokenSrcID() + mark2.getTokenSrcID(); 134 List<Match> pairMatches = fileMap.get(fileKey); 135 if (pairMatches == null) { 136 pairMatches = new ArrayList<Match>(); 137 fileMap.put(fileKey, pairMatches); 138 } 139 boolean add = true; 140 for (int i = 0; i < pairMatches.size(); i++) { 141 Match other = pairMatches.get(i); 142 if (other.getFirstMark().getIndex() + other.getTokenCount() - mark1.getIndex() > 0) { 143 boolean ordered = other.getSecondMark().getIndex() - mark2.getIndex() < 0; 144 if ((ordered && (other.getEndIndex() - mark2.getIndex() > 0)) 145 || (!ordered && (match.getEndIndex() - other.getSecondMark().getIndex()) > 0)) { 146 if (other.getTokenCount() >= match.getTokenCount()) { 147 add = false; 148 break; 149 } else { 150 pairMatches.remove(i); 151 startMap.remove(other.getMatchCode()); 152 } 153 } 154 } 155 } 156 if (add) { 157 pairMatches.add(match); 158 startMap.put(match.getMatchCode(), match); 159 } 160 } 161 162 private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) { 163 if (mark1.getIndex() == 0) { 164 return false; 165 } 166 return !matchEnded(ma.tokenAt(-1, mark1), ma.tokenAt(-1, mark2)); 167 } 168 169 private int countDuplicateTokens(TokenEntry mark1, TokenEntry mark2) { 170 int index = 0; 171 while (!matchEnded(ma.tokenAt(index, mark1), ma.tokenAt(index, mark2))) { 172 index++; 173 } 174 return index; 175 } 176 177 private boolean matchEnded(TokenEntry token1, TokenEntry token2) { 178 return token1.getIdentifier() != token2.getIdentifier() || token1 == TokenEntry.EOF || token2 == TokenEntry.EOF; 179 } 180 }