001    /**
002     * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
003     */
004    package org.sonar.duplications.cpd;
005    
006    import java.util.ArrayList;
007    import java.util.Collections;
008    import java.util.HashMap;
009    import java.util.HashSet;
010    import java.util.Iterator;
011    import java.util.List;
012    import java.util.Map;
013    import java.util.Set;
014    
015    import net.sourceforge.pmd.cpd.TokenEntry;
016    
017    public class MatchCollector {
018    
019      private MatchAlgorithm ma;
020      private Map<Match.MatchCode, Match> startMap = new HashMap<Match.MatchCode, Match>();
021      private Map<String, List<Match>> fileMap = new HashMap<String, List<Match>>();
022    
023      public MatchCollector(MatchAlgorithm ma) {
024        this.ma = ma;
025      }
026    
027      public void collect(List<TokenEntry> marks) {
028        // first get a pairwise collection of all maximal matches
029        for (int i = 0; i < marks.size() - 1; i++) {
030          TokenEntry mark1 = marks.get(i);
031          for (int j = i + 1; j < marks.size(); j++) {
032            TokenEntry mark2 = marks.get(j);
033            int diff = mark1.getIndex() - mark2.getIndex();
034            if ( -diff < ma.getMinimumTileSize()) {
035              continue;
036            }
037            if (hasPreviousDupe(mark1, mark2)) {
038              continue;
039            }
040    
041            // "match too small" check
042            int dupes = countDuplicateTokens(mark1, mark2);
043            if (dupes < ma.getMinimumTileSize()) {
044              continue;
045            }
046            // is it still too close together
047            if (diff + dupes >= 1) {
048              continue;
049            }
050            determineMatch(mark1, mark2, dupes);
051          }
052        }
053      }
054    
055      @SuppressWarnings("PMD.CompareObjectsWithEquals")
056      public List<Match> getMatches() {
057        List<Match> matchList = new ArrayList<Match>(startMap.values());
058        Collections.sort(matchList);
059        Set<Match.MatchCode> matchSet = new HashSet<Match.MatchCode>();
060        Match.MatchCode matchCode = new Match.MatchCode();
061        for (int i = matchList.size(); i > 1; i--) {
062          Match match1 = matchList.get(i - 1);
063          TokenEntry mark1 = match1.getMarkSet().iterator().next();
064          matchSet.clear();
065          matchSet.add(match1.getMatchCode());
066          for (int j = i - 1; j > 0; j--) {
067            Match match2 = matchList.get(j - 1);
068            if (match1.getTokenCount() != match2.getTokenCount()) {
069              break;
070            }
071            TokenEntry mark2 = null;
072            for (Iterator<TokenEntry> iter = match2.getMarkSet().iterator(); iter.hasNext();) {
073              mark2 = iter.next();
074              if (mark2 != mark1) {
075                break;
076              }
077            }
078            int dupes = countDuplicateTokens(mark1, mark2);
079            if (dupes < match1.getTokenCount()) {
080              break;
081            }
082            matchSet.add(match2.getMatchCode());
083            match1.getMarkSet().addAll(match2.getMarkSet());
084            matchList.remove(i - 2);
085            i--;
086          }
087          if (matchSet.size() == 1) {
088            continue;
089          }
090          // prune the mark set
091          Set<TokenEntry> pruned = match1.getMarkSet();
092          boolean done = false;
093          ArrayList<TokenEntry> a1 = new ArrayList<TokenEntry>(match1.getMarkSet());
094          Collections.sort(a1);
095          for (int outer = 0; outer < a1.size() - 1 && !done; outer++) {
096            TokenEntry cmark1 = a1.get(outer);
097            for (int inner = outer + 1; inner < a1.size() && !done; inner++) {
098              TokenEntry cmark2 = a1.get(inner);
099              matchCode.setFirst(cmark1.getIndex());
100              matchCode.setSecond(cmark2.getIndex());
101              if ( !matchSet.contains(matchCode)) {
102                if (pruned.size() > 2) {
103                  pruned.remove(cmark2);
104                }
105                if (pruned.size() == 2) {
106                  done = true;
107                }
108              }
109            }
110          }
111        }
112        return matchList;
113      }
114    
115      /**
116       * A greedy algorithm for determining non-overlapping matches
117       */
118      private void determineMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
119        Match match = new Match(dupes, mark1, mark2);
120        String fileKey = mark1.getTokenSrcID() + mark2.getTokenSrcID();
121        List<Match> pairMatches = fileMap.get(fileKey);
122        if (pairMatches == null) {
123          pairMatches = new ArrayList<Match>();
124          fileMap.put(fileKey, pairMatches);
125        }
126        boolean add = true;
127        for (int i = 0; i < pairMatches.size(); i++) {
128          Match other = pairMatches.get(i);
129          if (other.getFirstMark().getIndex() + other.getTokenCount() - mark1.getIndex() > 0) {
130            boolean ordered = other.getSecondMark().getIndex() - mark2.getIndex() < 0;
131            if ((ordered && (other.getEndIndex() - mark2.getIndex() > 0))
132                || ( !ordered && (match.getEndIndex() - other.getSecondMark().getIndex()) > 0)) {
133              if (other.getTokenCount() >= match.getTokenCount()) {
134                add = false;
135                break;
136              } else {
137                pairMatches.remove(i);
138                startMap.remove(other.getMatchCode());
139              }
140            }
141          }
142        }
143        if (add) {
144          pairMatches.add(match);
145          startMap.put(match.getMatchCode(), match);
146        }
147      }
148    
149      private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {
150        if (mark1.getIndex() == 0) {
151          return false;
152        }
153        return !matchEnded(ma.tokenAt( -1, mark1), ma.tokenAt( -1, mark2));
154      }
155    
156      private int countDuplicateTokens(TokenEntry mark1, TokenEntry mark2) {
157        int index = 0;
158        while ( !matchEnded(ma.tokenAt(index, mark1), ma.tokenAt(index, mark2))) {
159          index++;
160        }
161        return index;
162      }
163    
164      private boolean matchEnded(TokenEntry token1, TokenEntry token2) {
165        return token1.getIdentifier() != token2.getIdentifier() || token1 == TokenEntry.EOF || token2 == TokenEntry.EOF;
166      }
167    }