001    /*
002     * Sonar, open source software quality management tool.
003     * Copyright (C) 2008-2011 SonarSource
004     * mailto:contact AT sonarsource DOT com
005     *
006     * Sonar is free software; you can redistribute it and/or
007     * modify it under the terms of the GNU Lesser General Public
008     * License as published by the Free Software Foundation; either
009     * version 3 of the License, or (at your option) any later version.
010     *
011     * Sonar is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014     * Lesser General Public License for more details.
015     *
016     * You should have received a copy of the GNU Lesser General Public
017     * License along with Sonar; if not, write to the Free Software
018     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
019     */
020    package org.sonar.duplications.block;
021    
022    import java.util.Collections;
023    import java.util.List;
024    
025    import org.sonar.duplications.statement.Statement;
026    
027    import com.google.common.collect.Lists;
028    
029    /**
030     * Creates blocks from statements, each block will contain specified number of statements (<code>blockSize</code>) and 64-bits (8-bytes) hash value.
031     * Hash value computed using
032     * <a href="http://en.wikipedia.org/wiki/Rolling_hash#Rabin-Karp_rolling_hash">Rabin-Karp rolling hash</a> :
033     * <blockquote><pre>
034     * s[0]*31^(blockSize-1) + s[1]*31^(blockSize-2) + ... + s[blockSize-1]
035     * </pre></blockquote>
036     * using <code>long</code> arithmetic, where <code>s[i]</code>
037     * is the hash code of <code>String</code> (which is cached) for statement with number i.
038     * Thus running time - O(N), where N - number of statements.
039     * Implementation fully thread-safe.
040     */
041    public class BlockChunker {
042    
043      private static final long PRIME_BASE = 31;
044    
045      private final int blockSize;
046      private final long power;
047    
048      public BlockChunker(int blockSize) {
049        this.blockSize = blockSize;
050    
051        long pow = 1;
052        for (int i = 0; i < blockSize - 1; i++) {
053          pow = pow * PRIME_BASE;
054        }
055        this.power = pow;
056      }
057    
058      public List<Block> chunk(String resourceId, List<Statement> statements) {
059        if (statements.size() < blockSize) {
060          return Collections.emptyList();
061        }
062        Statement[] statementsArr = statements.toArray(new Statement[statements.size()]);
063        List<Block> blocks = Lists.newArrayListWithCapacity(statementsArr.length - blockSize + 1);
064        long hash = 0;
065        int first = 0;
066        int last = 0;
067        for (; last < blockSize - 1; last++) {
068          hash = hash * PRIME_BASE + statementsArr[last].getValue().hashCode();
069        }
070        for (; last < statementsArr.length; last++, first++) {
071          Statement firstStatement = statementsArr[first];
072          Statement lastStatement = statementsArr[last];
073          // add last statement to hash
074          hash = hash * PRIME_BASE + lastStatement.getValue().hashCode();
075          // create block
076          blocks.add(new Block(resourceId, new ByteArray(hash), first, firstStatement.getStartLine(), lastStatement.getEndLine()));
077          // remove first statement from hash
078          hash -= power * firstStatement.getValue().hashCode();
079        }
080        return blocks;
081      }
082    
083      public int getBlockSize() {
084        return blockSize;
085      }
086    
087    }