001 /**
002 * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
003 */
004 package org.sonar.duplications.cpd;
005
006 import java.util.ArrayList;
007 import java.util.Collections;
008 import java.util.HashMap;
009 import java.util.HashSet;
010 import java.util.Iterator;
011 import java.util.List;
012 import java.util.Map;
013 import java.util.Set;
014
015 import net.sourceforge.pmd.cpd.TokenEntry;
016
017 public class MatchCollector {
018
019 private MatchAlgorithm ma;
020 private Map<Match.MatchCode, Match> startMap = new HashMap<Match.MatchCode, Match>();
021 private Map<String, List<Match>> fileMap = new HashMap<String, List<Match>>();
022
023 public MatchCollector(MatchAlgorithm ma) {
024 this.ma = ma;
025 }
026
027 public void collect(List<TokenEntry> marks) {
028 // first get a pairwise collection of all maximal matches
029 for (int i = 0; i < marks.size() - 1; i++) {
030 TokenEntry mark1 = marks.get(i);
031 for (int j = i + 1; j < marks.size(); j++) {
032 TokenEntry mark2 = marks.get(j);
033 int diff = mark1.getIndex() - mark2.getIndex();
034 if ( -diff < ma.getMinimumTileSize()) {
035 continue;
036 }
037 if (hasPreviousDupe(mark1, mark2)) {
038 continue;
039 }
040
041 // "match too small" check
042 int dupes = countDuplicateTokens(mark1, mark2);
043 if (dupes < ma.getMinimumTileSize()) {
044 continue;
045 }
046 // is it still too close together
047 if (diff + dupes >= 1) {
048 continue;
049 }
050 determineMatch(mark1, mark2, dupes);
051 }
052 }
053 }
054
055 @SuppressWarnings("PMD.CompareObjectsWithEquals")
056 public List<Match> getMatches() {
057 List<Match> matchList = new ArrayList<Match>(startMap.values());
058 Collections.sort(matchList);
059 Set<Match.MatchCode> matchSet = new HashSet<Match.MatchCode>();
060 Match.MatchCode matchCode = new Match.MatchCode();
061 for (int i = matchList.size(); i > 1; i--) {
062 Match match1 = matchList.get(i - 1);
063 TokenEntry mark1 = match1.getMarkSet().iterator().next();
064 matchSet.clear();
065 matchSet.add(match1.getMatchCode());
066 for (int j = i - 1; j > 0; j--) {
067 Match match2 = matchList.get(j - 1);
068 if (match1.getTokenCount() != match2.getTokenCount()) {
069 break;
070 }
071 TokenEntry mark2 = null;
072 for (Iterator<TokenEntry> iter = match2.getMarkSet().iterator(); iter.hasNext();) {
073 mark2 = iter.next();
074 if (mark2 != mark1) {
075 break;
076 }
077 }
078 int dupes = countDuplicateTokens(mark1, mark2);
079 if (dupes < match1.getTokenCount()) {
080 break;
081 }
082 matchSet.add(match2.getMatchCode());
083 match1.getMarkSet().addAll(match2.getMarkSet());
084 matchList.remove(i - 2);
085 i--;
086 }
087 if (matchSet.size() == 1) {
088 continue;
089 }
090 // prune the mark set
091 Set<TokenEntry> pruned = match1.getMarkSet();
092 boolean done = false;
093 ArrayList<TokenEntry> a1 = new ArrayList<TokenEntry>(match1.getMarkSet());
094 Collections.sort(a1);
095 for (int outer = 0; outer < a1.size() - 1 && !done; outer++) {
096 TokenEntry cmark1 = a1.get(outer);
097 for (int inner = outer + 1; inner < a1.size() && !done; inner++) {
098 TokenEntry cmark2 = a1.get(inner);
099 matchCode.setFirst(cmark1.getIndex());
100 matchCode.setSecond(cmark2.getIndex());
101 if ( !matchSet.contains(matchCode)) {
102 if (pruned.size() > 2) {
103 pruned.remove(cmark2);
104 }
105 if (pruned.size() == 2) {
106 done = true;
107 }
108 }
109 }
110 }
111 }
112 return matchList;
113 }
114
115 /**
116 * A greedy algorithm for determining non-overlapping matches
117 */
118 private void determineMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
119 Match match = new Match(dupes, mark1, mark2);
120 String fileKey = mark1.getTokenSrcID() + mark2.getTokenSrcID();
121 List<Match> pairMatches = fileMap.get(fileKey);
122 if (pairMatches == null) {
123 pairMatches = new ArrayList<Match>();
124 fileMap.put(fileKey, pairMatches);
125 }
126 boolean add = true;
127 for (int i = 0; i < pairMatches.size(); i++) {
128 Match other = pairMatches.get(i);
129 if (other.getFirstMark().getIndex() + other.getTokenCount() - mark1.getIndex() > 0) {
130 boolean ordered = other.getSecondMark().getIndex() - mark2.getIndex() < 0;
131 if ((ordered && (other.getEndIndex() - mark2.getIndex() > 0))
132 || ( !ordered && (match.getEndIndex() - other.getSecondMark().getIndex()) > 0)) {
133 if (other.getTokenCount() >= match.getTokenCount()) {
134 add = false;
135 break;
136 } else {
137 pairMatches.remove(i);
138 startMap.remove(other.getMatchCode());
139 }
140 }
141 }
142 }
143 if (add) {
144 pairMatches.add(match);
145 startMap.put(match.getMatchCode(), match);
146 }
147 }
148
149 private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {
150 if (mark1.getIndex() == 0) {
151 return false;
152 }
153 return !matchEnded(ma.tokenAt( -1, mark1), ma.tokenAt( -1, mark2));
154 }
155
156 private int countDuplicateTokens(TokenEntry mark1, TokenEntry mark2) {
157 int index = 0;
158 while ( !matchEnded(ma.tokenAt(index, mark1), ma.tokenAt(index, mark2))) {
159 index++;
160 }
161 return index;
162 }
163
164 private boolean matchEnded(TokenEntry token1, TokenEntry token2) {
165 return token1.getIdentifier() != token2.getIdentifier() || token1 == TokenEntry.EOF || token2 == TokenEntry.EOF;
166 }
167 }