001    /*
002     * Sonar, open source software quality management tool.
003     * Copyright (C) 2008-2012 SonarSource
004     * mailto:contact AT sonarsource DOT com
005     *
006     * Sonar is free software; you can redistribute it and/or
007     * modify it under the terms of the GNU Lesser General Public
008     * License as published by the Free Software Foundation; either
009     * version 3 of the License, or (at your option) any later version.
010     *
011     * Sonar is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014     * Lesser General Public License for more details.
015     *
016     * You should have received a copy of the GNU Lesser General Public
017     * License along with Sonar; if not, write to the Free Software
018     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
019     */
020    package org.sonar.duplications.index;
021    
022    import com.google.common.collect.Lists;
023    import org.sonar.duplications.block.Block;
024    import org.sonar.duplications.block.ByteArray;
025    import org.sonar.duplications.utils.FastStringComparator;
026    
027    import java.util.Collection;
028    import java.util.List;
029    
030    /**
031     * Provides an index optimized by memory.
032     * <p>
033     * Each object in Java has an overhead - see
034     * <a href="http://devblog.streamy.com/2009/07/24/determine-size-of-java-object-class/">"HOWTO: Determine the size of a Java Object or Class"</a>.
035     * So to optimize memory consumption, we use flat arrays, however this increases time of queries.
036     * During  usual detection of duplicates most time consuming method is a {@link #getByResourceId(String)}:
037     * around 50% of time spent in this class and number of invocations of this method is 1% of total invocations,
038     * however total time spent in this class less than 1 second for small projects and around 2 seconds for projects like JDK.
039     * </p>
040     * <p>
041     * Note that this implementation currently does not support deletion, however it's possible to implement.
042     * </p>
043     */
044    public class PackedMemoryCloneIndex extends AbstractCloneIndex {
045    
046      private static final int DEFAULT_INITIAL_CAPACITY = 1024;
047    
048      private static final int BLOCK_INTS = 5;
049    
050      private final int hashInts;
051    
052      private final int blockInts;
053    
054      /**
055       * Indicates that index requires sorting to perform queries.
056       */
057      private boolean sorted;
058    
059      /**
060       * Current number of blocks in index.
061       */
062      private int size;
063    
064      private String[] resourceIds;
065      private int[] blockData;
066    
067      private int[] resourceIdsIndex;
068    
069      private final Block.Builder blockBuilder = Block.builder();
070    
071      public PackedMemoryCloneIndex() {
072        this(8, DEFAULT_INITIAL_CAPACITY);
073      }
074    
075      /**
076       * @param hashBytes size of hash in bytes
077       * @param initialCapacity the initial capacity
078       */
079      public PackedMemoryCloneIndex(int hashBytes, int initialCapacity) {
080        this.sorted = false;
081        this.hashInts = hashBytes / 4;
082        this.blockInts = hashInts + BLOCK_INTS;
083        this.size = 0;
084        this.resourceIds = new String[initialCapacity];
085        this.blockData = new int[initialCapacity * blockInts];
086        this.resourceIdsIndex = new int[initialCapacity];
087      }
088    
089      /**
090       * {@inheritDoc}
091       * <p>
092       * <strong>Note that this implementation does not guarantee that blocks would be sorted by index.</strong>
093       * </p>
094       */
095      public Collection<Block> getByResourceId(String resourceId) {
096        ensureSorted();
097    
098        // prepare resourceId for binary search
099        resourceIds[size] = resourceId;
100        resourceIdsIndex[size] = size;
101    
102        int index = DataUtils.binarySearch(byResourceId);
103    
104        List<Block> result = Lists.newArrayList();
105        int realIndex = resourceIdsIndex[index];
106        while (index < size && FastStringComparator.INSTANCE.compare(resourceIds[realIndex], resourceId) == 0) {
107          // extract block (note that there is no need to extract resourceId)
108          int offset = realIndex * blockInts;
109          int[] hash = new int[hashInts];
110          for (int j = 0; j < hashInts; j++) {
111            hash[j] = blockData[offset++];
112          }
113          int indexInFile = blockData[offset++];
114          int firstLineNumber = blockData[offset++];
115          int lastLineNumber = blockData[offset++];
116          int startUnit = blockData[offset++];
117          int endUnit = blockData[offset];
118    
119          Block block = blockBuilder
120              .setResourceId(resourceId)
121              .setBlockHash(new ByteArray(hash))
122              .setIndexInFile(indexInFile)
123              .setLines(firstLineNumber, lastLineNumber)
124              .setUnit(startUnit, endUnit)
125              .build();
126          result.add(block);
127    
128          index++;
129          realIndex = resourceIdsIndex[index];
130        }
131        return result;
132      }
133    
134      /**
135       * {@inheritDoc}
136       */
137      public Collection<Block> getBySequenceHash(ByteArray sequenceHash) {
138        ensureSorted();
139    
140        // prepare hash for binary search
141        int[] hash = sequenceHash.toIntArray();
142        if (hash.length != hashInts) {
143          throw new IllegalArgumentException("Expected " + hashInts + " ints in hash, but got " + hash.length);
144        }
145        int offset = size * blockInts;
146        for (int i = 0; i < hashInts; i++) {
147          blockData[offset++] = hash[i];
148        }
149    
150        int index = DataUtils.binarySearch(byBlockHash);
151    
152        List<Block> result = Lists.newArrayList();
153        while (index < size && !isLessByHash(size, index)) {
154          // extract block (note that there is no need to extract hash)
155          String resourceId = resourceIds[index];
156          offset = index * blockInts + hashInts;
157          int indexInFile = blockData[offset++];
158          int firstLineNumber = blockData[offset++];
159          int lastLineNumber = blockData[offset++];
160          int startUnit = blockData[offset++];
161          int endUnit = blockData[offset];
162    
163          Block block = blockBuilder
164              .setResourceId(resourceId)
165              .setBlockHash(sequenceHash)
166              .setIndexInFile(indexInFile)
167              .setLines(firstLineNumber, lastLineNumber)
168              .setUnit(startUnit, endUnit)
169              .build();
170          result.add(block);
171          index++;
172        }
173        return result;
174      }
175    
176      /**
177       * {@inheritDoc}
178       * <p>
179       * <strong>Note that this implementation allows insertion of two blocks with same index for one resource.</strong>
180       * </p>
181       */
182      public void insert(Block block) {
183        sorted = false;
184        ensureCapacity();
185    
186        resourceIds[size] = block.getResourceId();
187    
188        int[] hash = block.getBlockHash().toIntArray();
189        if (hash.length != hashInts) {
190          throw new IllegalArgumentException("Expected " + hashInts + " ints in hash, but got " + hash.length);
191        }
192        int offset = size * blockInts;
193        for (int i = 0; i < hashInts; i++) {
194          blockData[offset++] = hash[i];
195        }
196        blockData[offset++] = block.getIndexInFile();
197        blockData[offset++] = block.getStartLine();
198        blockData[offset++] = block.getEndLine();
199        blockData[offset++] = block.getStartUnit();
200        blockData[offset] = block.getEndUnit();
201    
202        size++;
203      }
204    
205      /**
206       * Increases the capacity, if necessary.
207       */
208      private void ensureCapacity() {
209        if (size < resourceIds.length) {
210          return;
211        }
212        int newCapacity = (resourceIds.length * 3) / 2 + 1;
213        // Increase size of resourceIds
214        String[] oldResourceIds = resourceIds;
215        resourceIds = new String[newCapacity];
216        System.arraycopy(oldResourceIds, 0, resourceIds, 0, oldResourceIds.length);
217        // Increase size of blockData
218        int[] oldBlockData = blockData;
219        blockData = new int[newCapacity * blockInts];
220        System.arraycopy(oldBlockData, 0, blockData, 0, oldBlockData.length);
221        // Increase size of byResourceIndices (no need to copy old, because would be restored in method ensureSorted)
222        resourceIdsIndex = new int[newCapacity];
223        sorted = false;
224      }
225    
226      /**
227       * Performs sorting, if necessary.
228       */
229      private void ensureSorted() {
230        if (sorted) {
231          return;
232        }
233    
234        ensureCapacity();
235    
236        DataUtils.sort(byBlockHash);
237        for (int i = 0; i < size; i++) {
238          resourceIdsIndex[i] = i;
239        }
240        DataUtils.sort(byResourceId);
241    
242        sorted = true;
243      }
244    
245      private boolean isLessByHash(int i, int j) {
246        i *= blockInts;
247        j *= blockInts;
248        for (int k = 0; k < hashInts; k++, i++, j++) {
249          if (blockData[i] < blockData[j]) {
250            return true;
251          }
252          if (blockData[i] > blockData[j]) {
253            return false;
254          }
255        }
256        return false;
257      }
258    
259      private final DataUtils.Sortable byBlockHash = new DataUtils.Sortable() {
260        public void swap(int i, int j) {
261          String tmp = resourceIds[i];
262          resourceIds[i] = resourceIds[j];
263          resourceIds[j] = tmp;
264    
265          i *= blockInts;
266          j *= blockInts;
267          for (int k = 0; k < blockInts; k++, i++, j++) {
268            int x = blockData[i];
269            blockData[i] = blockData[j];
270            blockData[j] = x;
271          }
272        }
273    
274        public boolean isLess(int i, int j) {
275          return isLessByHash(i, j);
276        }
277    
278        public int size() {
279          return size;
280        }
281      };
282    
283      private final DataUtils.Sortable byResourceId = new DataUtils.Sortable() {
284        public void swap(int i, int j) {
285          int tmp = resourceIdsIndex[i];
286          resourceIdsIndex[i] = resourceIdsIndex[j];
287          resourceIdsIndex[j] = tmp;
288        }
289    
290        public boolean isLess(int i, int j) {
291          String s1 = resourceIds[resourceIdsIndex[i]];
292          String s2 = resourceIds[resourceIdsIndex[j]];
293          return FastStringComparator.INSTANCE.compare(s1, s2) < 0;
294        }
295    
296        public int size() {
297          return size;
298        }
299      };
300    
301    }