001    /*
002     * Sonar, open source software quality management tool.
003     * Copyright (C) 2008-2011 SonarSource
004     * mailto:contact AT sonarsource DOT com
005     *
006     * Sonar is free software; you can redistribute it and/or
007     * modify it under the terms of the GNU Lesser General Public
008     * License as published by the Free Software Foundation; either
009     * version 3 of the License, or (at your option) any later version.
010     *
011     * Sonar is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014     * Lesser General Public License for more details.
015     *
016     * You should have received a copy of the GNU Lesser General Public
017     * License along with Sonar; if not, write to the Free Software
018     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
019     */
020    
021    /**
022     * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
023     */
024    package org.sonar.duplications.cpd;
025    
026    import java.util.ArrayList;
027    import java.util.Collections;
028    import java.util.HashMap;
029    import java.util.HashSet;
030    import java.util.Iterator;
031    import java.util.List;
032    import java.util.Map;
033    import java.util.Set;
034    
035    import net.sourceforge.pmd.cpd.TokenEntry;
036    
037    public class MatchCollector {
038    
039      private MatchAlgorithm ma;
040      private Map<Match.MatchCode, Match> startMap = new HashMap<Match.MatchCode, Match>();
041      private Map<String, List<Match>> fileMap = new HashMap<String, List<Match>>();
042    
043      public MatchCollector(MatchAlgorithm ma) {
044        this.ma = ma;
045      }
046    
047      public void collect(List<TokenEntry> marks) {
048        // first get a pairwise collection of all maximal matches
049        for (int i = 0; i < marks.size() - 1; i++) {
050          TokenEntry mark1 = marks.get(i);
051          for (int j = i + 1; j < marks.size(); j++) {
052            TokenEntry mark2 = marks.get(j);
053            int diff = mark1.getIndex() - mark2.getIndex();
054            if ( -diff < ma.getMinimumTileSize()) {
055              continue;
056            }
057            if (hasPreviousDupe(mark1, mark2)) {
058              continue;
059            }
060    
061            // "match too small" check
062            int dupes = countDuplicateTokens(mark1, mark2);
063            if (dupes < ma.getMinimumTileSize()) {
064              continue;
065            }
066            // is it still too close together
067            if (diff + dupes >= 1) {
068              continue;
069            }
070            determineMatch(mark1, mark2, dupes);
071          }
072        }
073      }
074    
075      @SuppressWarnings("PMD.CompareObjectsWithEquals")
076      public List<Match> getMatches() {
077        List<Match> matchList = new ArrayList<Match>(startMap.values());
078        Collections.sort(matchList);
079        Set<Match.MatchCode> matchSet = new HashSet<Match.MatchCode>();
080        Match.MatchCode matchCode = new Match.MatchCode();
081        for (int i = matchList.size(); i > 1; i--) {
082          Match match1 = matchList.get(i - 1);
083          TokenEntry mark1 = match1.getMarkSet().iterator().next();
084          matchSet.clear();
085          matchSet.add(match1.getMatchCode());
086          for (int j = i - 1; j > 0; j--) {
087            Match match2 = matchList.get(j - 1);
088            if (match1.getTokenCount() != match2.getTokenCount()) {
089              break;
090            }
091            TokenEntry mark2 = null;
092            for (Iterator<TokenEntry> iter = match2.getMarkSet().iterator(); iter.hasNext();) {
093              mark2 = iter.next();
094              if (mark2 != mark1) {
095                break;
096              }
097            }
098            int dupes = countDuplicateTokens(mark1, mark2);
099            if (dupes < match1.getTokenCount()) {
100              break;
101            }
102            matchSet.add(match2.getMatchCode());
103            match1.getMarkSet().addAll(match2.getMarkSet());
104            matchList.remove(i - 2);
105            i--;
106          }
107          if (matchSet.size() == 1) {
108            continue;
109          }
110          // prune the mark set
111          Set<TokenEntry> pruned = match1.getMarkSet();
112          boolean done = false;
113          ArrayList<TokenEntry> a1 = new ArrayList<TokenEntry>(match1.getMarkSet());
114          Collections.sort(a1);
115          for (int outer = 0; outer < a1.size() - 1 && !done; outer++) {
116            TokenEntry cmark1 = a1.get(outer);
117            for (int inner = outer + 1; inner < a1.size() && !done; inner++) {
118              TokenEntry cmark2 = a1.get(inner);
119              matchCode.setFirst(cmark1.getIndex());
120              matchCode.setSecond(cmark2.getIndex());
121              if ( !matchSet.contains(matchCode)) {
122                if (pruned.size() > 2) {
123                  pruned.remove(cmark2);
124                }
125                if (pruned.size() == 2) {
126                  done = true;
127                }
128              }
129            }
130          }
131        }
132        return matchList;
133      }
134    
135      /**
136       * A greedy algorithm for determining non-overlapping matches
137       */
138      private void determineMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
139        Match match = new Match(dupes, mark1, mark2);
140        String fileKey = mark1.getTokenSrcID() + mark2.getTokenSrcID();
141        List<Match> pairMatches = fileMap.get(fileKey);
142        if (pairMatches == null) {
143          pairMatches = new ArrayList<Match>();
144          fileMap.put(fileKey, pairMatches);
145        }
146        boolean add = true;
147        for (int i = 0; i < pairMatches.size(); i++) {
148          Match other = pairMatches.get(i);
149          if (other.getFirstMark().getIndex() + other.getTokenCount() - mark1.getIndex() > 0) {
150            boolean ordered = other.getSecondMark().getIndex() - mark2.getIndex() < 0;
151            if ((ordered && (other.getEndIndex() - mark2.getIndex() > 0))
152                || ( !ordered && (match.getEndIndex() - other.getSecondMark().getIndex()) > 0)) {
153              if (other.getTokenCount() >= match.getTokenCount()) {
154                add = false;
155                break;
156              } else {
157                pairMatches.remove(i);
158                startMap.remove(other.getMatchCode());
159              }
160            }
161          }
162        }
163        if (add) {
164          pairMatches.add(match);
165          startMap.put(match.getMatchCode(), match);
166        }
167      }
168    
169      private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {
170        if (mark1.getIndex() == 0) {
171          return false;
172        }
173        return !matchEnded(ma.tokenAt( -1, mark1), ma.tokenAt( -1, mark2));
174      }
175    
176      private int countDuplicateTokens(TokenEntry mark1, TokenEntry mark2) {
177        int index = 0;
178        while ( !matchEnded(ma.tokenAt(index, mark1), ma.tokenAt(index, mark2))) {
179          index++;
180        }
181        return index;
182      }
183    
184      private boolean matchEnded(TokenEntry token1, TokenEntry token2) {
185        return token1.getIdentifier() != token2.getIdentifier() || token1 == TokenEntry.EOF || token2 == TokenEntry.EOF;
186      }
187    }