001    /*
002     * Sonar, open source software quality management tool.
003     * Copyright (C) 2008-2011 SonarSource
004     * mailto:contact AT sonarsource DOT com
005     *
006     * Sonar is free software; you can redistribute it and/or
007     * modify it under the terms of the GNU Lesser General Public
008     * License as published by the Free Software Foundation; either
009     * version 3 of the License, or (at your option) any later version.
010     *
011     * Sonar is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014     * Lesser General Public License for more details.
015     *
016     * You should have received a copy of the GNU Lesser General Public
017     * License along with Sonar; if not, write to the Free Software
018     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
019     */
020    package org.sonar.duplications.index;
021    
022    import java.util.Collection;
023    import java.util.List;
024    
025    import org.sonar.duplications.block.Block;
026    import org.sonar.duplications.block.ByteArray;
027    import org.sonar.duplications.utils.FastStringComparator;
028    
029    import com.google.common.collect.Lists;
030    
031    /**
032     * Provides an index optimized by memory.
033     * <p>
034     * Each object in Java has an overhead - see
035     * <a href="http://devblog.streamy.com/2009/07/24/determine-size-of-java-object-class/">"HOWTO: Determine the size of a Java Object or Class"</a>.
036     * So to optimize memory consumption, we use flat arrays, however this increases time of queries.
037     * During  usual detection of duplicates most time consuming method is a {@link #getByResourceId(String)}:
038     * around 50% of time spent in this class and number of invocations of this method is 1% of total invocations,
039     * however total time spent in this class less than 1 second for small projects and around 2 seconds for projects like JDK.
040     * </p>
041     * <p>
042     * Note that this implementation currently does not support deletion, however it's possible to implement.
043     * </p>
044     */
045    public class PackedMemoryCloneIndex extends AbstractCloneIndex {
046    
047      private static final int DEFAULT_INITIAL_CAPACITY = 1024;
048    
049      private static final int BLOCK_INTS = 3;
050    
051      private final int hashInts;
052    
053      private final int blockInts;
054    
055      /**
056       * Indicates that index requires sorting to perform queries.
057       */
058      private boolean sorted;
059    
060      /**
061       * Current number of blocks in index.
062       */
063      private int size;
064    
065      private String[] resourceIds;
066      private int[] blockData;
067    
068      private int[] resourceIdsIndex;
069    
070      public PackedMemoryCloneIndex() {
071        this(8, DEFAULT_INITIAL_CAPACITY);
072      }
073    
074      /**
075       * @param hashBytes size of hash in bytes
076       * @param initialCapacity the initial capacity
077       */
078      public PackedMemoryCloneIndex(int hashBytes, int initialCapacity) {
079        this.sorted = false;
080        this.hashInts = hashBytes / 4;
081        this.blockInts = hashInts + BLOCK_INTS;
082        this.size = 0;
083        this.resourceIds = new String[initialCapacity];
084        this.blockData = new int[initialCapacity * blockInts];
085        this.resourceIdsIndex = new int[initialCapacity];
086      }
087    
088      /**
089       * {@inheritDoc}
090       * <p>
091       * <strong>Note that this implementation does not guarantee that blocks would be sorted by index.</strong>
092       * </p>
093       */
094      public Collection<Block> getByResourceId(String resourceId) {
095        ensureSorted();
096    
097        // prepare resourceId for binary search
098        resourceIds[size] = resourceId;
099        resourceIdsIndex[size] = size;
100    
101        int index = DataUtils.binarySearch(byResourceId);
102    
103        List<Block> result = Lists.newArrayList();
104        int realIndex = resourceIdsIndex[index];
105        while (index < size && FastStringComparator.INSTANCE.compare(resourceIds[realIndex], resourceId) == 0) {
106          // extract block (note that there is no need to extract resourceId)
107          int offset = realIndex * blockInts;
108          int[] hash = new int[hashInts];
109          for (int j = 0; j < hashInts; j++) {
110            hash[j] = blockData[offset++];
111          }
112          int indexInFile = blockData[offset++];
113          int firstLineNumber = blockData[offset++];
114          int lastLineNumber = blockData[offset];
115    
116          result.add(new Block(resourceId, new ByteArray(hash), indexInFile, firstLineNumber, lastLineNumber));
117    
118          index++;
119          realIndex = resourceIdsIndex[index];
120        }
121        return result;
122      }
123    
124      /**
125       * {@inheritDoc}
126       */
127      public Collection<Block> getBySequenceHash(ByteArray sequenceHash) {
128        ensureSorted();
129    
130        // prepare hash for binary search
131        int[] hash = sequenceHash.toIntArray();
132        if (hash.length != hashInts) {
133          throw new IllegalArgumentException("Expected " + hashInts + " ints in hash, but got " + hash.length);
134        }
135        int offset = size * blockInts;
136        for (int i = 0; i < hashInts; i++) {
137          blockData[offset++] = hash[i];
138        }
139    
140        int index = DataUtils.binarySearch(byBlockHash);
141    
142        List<Block> result = Lists.newArrayList();
143        while (index < size && !isLessByHash(size, index)) {
144          // extract block (note that there is no need to extract hash)
145          String resourceId = resourceIds[index];
146          offset = index * blockInts + hashInts;
147          int indexInFile = blockData[offset++];
148          int firstLineNumber = blockData[offset++];
149          int lastLineNumber = blockData[offset];
150    
151          result.add(new Block(resourceId, sequenceHash, indexInFile, firstLineNumber, lastLineNumber));
152          index++;
153        }
154        return result;
155      }
156    
157      /**
158       * {@inheritDoc}
159       * <p>
160       * <strong>Note that this implementation allows insertion of two blocks with same index for one resource.</strong>
161       * </p>
162       */
163      public void insert(Block block) {
164        sorted = false;
165        ensureCapacity();
166    
167        resourceIds[size] = block.getResourceId();
168    
169        int[] hash = block.getBlockHash().toIntArray();
170        if (hash.length != hashInts) {
171          throw new IllegalArgumentException("Expected " + hashInts + " ints in hash, but got " + hash.length);
172        }
173        int offset = size * blockInts;
174        for (int i = 0; i < hashInts; i++) {
175          blockData[offset++] = hash[i];
176        }
177        blockData[offset++] = block.getIndexInFile();
178        blockData[offset++] = block.getFirstLineNumber();
179        blockData[offset] = block.getLastLineNumber();
180    
181        size++;
182      }
183    
184      /**
185       * Increases the capacity, if necessary.
186       */
187      private void ensureCapacity() {
188        if (size < resourceIds.length) {
189          return;
190        }
191        int newCapacity = (resourceIds.length * 3) / 2 + 1;
192        // Increase size of resourceIds
193        String[] oldResourceIds = resourceIds;
194        resourceIds = new String[newCapacity];
195        System.arraycopy(oldResourceIds, 0, resourceIds, 0, oldResourceIds.length);
196        // Increase size of blockData
197        int[] oldBlockData = blockData;
198        blockData = new int[newCapacity * blockInts];
199        System.arraycopy(oldBlockData, 0, blockData, 0, oldBlockData.length);
200        // Increase size of byResourceIndices (no need to copy old, because would be restored in method ensureSorted)
201        resourceIdsIndex = new int[newCapacity];
202        sorted = false;
203      }
204    
205      /**
206       * Performs sorting, if necessary.
207       */
208      private void ensureSorted() {
209        if (sorted) {
210          return;
211        }
212    
213        ensureCapacity();
214    
215        DataUtils.sort(byBlockHash);
216        for (int i = 0; i < size; i++) {
217          resourceIdsIndex[i] = i;
218        }
219        DataUtils.sort(byResourceId);
220    
221        sorted = true;
222      }
223    
224      private boolean isLessByHash(int i, int j) {
225        i *= blockInts;
226        j *= blockInts;
227        for (int k = 0; k < hashInts; k++, i++, j++) {
228          if (blockData[i] < blockData[j]) {
229            return true;
230          }
231          if (blockData[i] > blockData[j]) {
232            return false;
233          }
234        }
235        return false;
236      }
237    
238      private final DataUtils.Sortable byBlockHash = new DataUtils.Sortable() {
239        public void swap(int i, int j) {
240          String tmp = resourceIds[i];
241          resourceIds[i] = resourceIds[j];
242          resourceIds[j] = tmp;
243    
244          i *= blockInts;
245          j *= blockInts;
246          for (int k = 0; k < blockInts; k++, i++, j++) {
247            int x = blockData[i];
248            blockData[i] = blockData[j];
249            blockData[j] = x;
250          }
251        }
252    
253        public boolean isLess(int i, int j) {
254          return isLessByHash(i, j);
255        }
256    
257        public int size() {
258          return size;
259        }
260      };
261    
262      private final DataUtils.Sortable byResourceId = new DataUtils.Sortable() {
263        public void swap(int i, int j) {
264          int tmp = resourceIdsIndex[i];
265          resourceIdsIndex[i] = resourceIdsIndex[j];
266          resourceIdsIndex[j] = tmp;
267        }
268    
269        public boolean isLess(int i, int j) {
270          String s1 = resourceIds[resourceIdsIndex[i]];
271          String s2 = resourceIds[resourceIdsIndex[j]];
272          return FastStringComparator.INSTANCE.compare(s1, s2) < 0;
273        }
274    
275        public int size() {
276          return size;
277        }
278      };
279    
280    }