001 /*
002 * Sonar, open source software quality management tool.
003 * Copyright (C) 2008-2012 SonarSource
004 * mailto:contact AT sonarsource DOT com
005 *
006 * Sonar is free software; you can redistribute it and/or
007 * modify it under the terms of the GNU Lesser General Public
008 * License as published by the Free Software Foundation; either
009 * version 3 of the License, or (at your option) any later version.
010 *
011 * Sonar is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014 * Lesser General Public License for more details.
015 *
016 * You should have received a copy of the GNU Lesser General Public
017 * License along with Sonar; if not, write to the Free Software
018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
019 */
020
021 /**
022 * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
023 */
024 package org.sonar.duplications.cpd;
025
026 import net.sourceforge.pmd.cpd.TokenEntry;
027
028 import java.util.*;
029
030 public class MatchCollector {
031
032 private MatchAlgorithm ma;
033 private Map<Match.MatchCode, Match> startMap = new HashMap<Match.MatchCode, Match>();
034 private Map<String, List<Match>> fileMap = new HashMap<String, List<Match>>();
035
036 public MatchCollector(MatchAlgorithm ma) {
037 this.ma = ma;
038 }
039
040 public void collect(List<TokenEntry> marks) {
041 // first get a pairwise collection of all maximal matches
042 for (int i = 0; i < marks.size() - 1; i++) {
043 TokenEntry mark1 = marks.get(i);
044 for (int j = i + 1; j < marks.size(); j++) {
045 TokenEntry mark2 = marks.get(j);
046 int diff = mark1.getIndex() - mark2.getIndex();
047 if (-diff < ma.getMinimumTileSize()) {
048 continue;
049 }
050 if (hasPreviousDupe(mark1, mark2)) {
051 continue;
052 }
053
054 // "match too small" check
055 int dupes = countDuplicateTokens(mark1, mark2);
056 if (dupes < ma.getMinimumTileSize()) {
057 continue;
058 }
059 // is it still too close together
060 if (diff + dupes >= 1) {
061 continue;
062 }
063 determineMatch(mark1, mark2, dupes);
064 }
065 }
066 }
067
068 @SuppressWarnings("PMD.CompareObjectsWithEquals")
069 public List<Match> getMatches() {
070 List<Match> matchList = new ArrayList<Match>(startMap.values());
071 Collections.sort(matchList);
072 Set<Match.MatchCode> matchSet = new HashSet<Match.MatchCode>();
073 Match.MatchCode matchCode = new Match.MatchCode();
074 for (int i = matchList.size(); i > 1; i--) {
075 Match match1 = matchList.get(i - 1);
076 TokenEntry mark1 = match1.getMarkSet().iterator().next();
077 matchSet.clear();
078 matchSet.add(match1.getMatchCode());
079 for (int j = i - 1; j > 0; j--) {
080 Match match2 = matchList.get(j - 1);
081 if (match1.getTokenCount() != match2.getTokenCount()) {
082 break;
083 }
084 TokenEntry mark2 = null;
085 for (Iterator<TokenEntry> iter = match2.getMarkSet().iterator(); iter.hasNext();) {
086 mark2 = iter.next();
087 if (mark2 != mark1) {
088 break;
089 }
090 }
091 int dupes = countDuplicateTokens(mark1, mark2);
092 if (dupes < match1.getTokenCount()) {
093 break;
094 }
095 matchSet.add(match2.getMatchCode());
096 match1.getMarkSet().addAll(match2.getMarkSet());
097 matchList.remove(i - 2);
098 i--;
099 }
100 if (matchSet.size() == 1) {
101 continue;
102 }
103 // prune the mark set
104 Set<TokenEntry> pruned = match1.getMarkSet();
105 boolean done = false;
106 ArrayList<TokenEntry> a1 = new ArrayList<TokenEntry>(match1.getMarkSet());
107 Collections.sort(a1);
108 for (int outer = 0; outer < a1.size() - 1 && !done; outer++) {
109 TokenEntry cmark1 = a1.get(outer);
110 for (int inner = outer + 1; inner < a1.size() && !done; inner++) {
111 TokenEntry cmark2 = a1.get(inner);
112 matchCode.setFirst(cmark1.getIndex());
113 matchCode.setSecond(cmark2.getIndex());
114 if (!matchSet.contains(matchCode)) {
115 if (pruned.size() > 2) {
116 pruned.remove(cmark2);
117 }
118 if (pruned.size() == 2) {
119 done = true;
120 }
121 }
122 }
123 }
124 }
125 return matchList;
126 }
127
128 /**
129 * A greedy algorithm for determining non-overlapping matches
130 */
131 private void determineMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
132 Match match = new Match(dupes, mark1, mark2);
133 String fileKey = mark1.getTokenSrcID() + mark2.getTokenSrcID();
134 List<Match> pairMatches = fileMap.get(fileKey);
135 if (pairMatches == null) {
136 pairMatches = new ArrayList<Match>();
137 fileMap.put(fileKey, pairMatches);
138 }
139 boolean add = true;
140 for (int i = 0; i < pairMatches.size(); i++) {
141 Match other = pairMatches.get(i);
142 if (other.getFirstMark().getIndex() + other.getTokenCount() - mark1.getIndex() > 0) {
143 boolean ordered = other.getSecondMark().getIndex() - mark2.getIndex() < 0;
144 if ((ordered && (other.getEndIndex() - mark2.getIndex() > 0))
145 || (!ordered && (match.getEndIndex() - other.getSecondMark().getIndex()) > 0)) {
146 if (other.getTokenCount() >= match.getTokenCount()) {
147 add = false;
148 break;
149 } else {
150 pairMatches.remove(i);
151 startMap.remove(other.getMatchCode());
152 }
153 }
154 }
155 }
156 if (add) {
157 pairMatches.add(match);
158 startMap.put(match.getMatchCode(), match);
159 }
160 }
161
162 private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {
163 if (mark1.getIndex() == 0) {
164 return false;
165 }
166 return !matchEnded(ma.tokenAt(-1, mark1), ma.tokenAt(-1, mark2));
167 }
168
169 private int countDuplicateTokens(TokenEntry mark1, TokenEntry mark2) {
170 int index = 0;
171 while (!matchEnded(ma.tokenAt(index, mark1), ma.tokenAt(index, mark2))) {
172 index++;
173 }
174 return index;
175 }
176
177 private boolean matchEnded(TokenEntry token1, TokenEntry token2) {
178 return token1.getIdentifier() != token2.getIdentifier() || token1 == TokenEntry.EOF || token2 == TokenEntry.EOF;
179 }
180 }