001 /*
002 * Sonar, open source software quality management tool.
003 * Copyright (C) 2008-2012 SonarSource
004 * mailto:contact AT sonarsource DOT com
005 *
006 * Sonar is free software; you can redistribute it and/or
007 * modify it under the terms of the GNU Lesser General Public
008 * License as published by the Free Software Foundation; either
009 * version 3 of the License, or (at your option) any later version.
010 *
011 * Sonar is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014 * Lesser General Public License for more details.
015 *
016 * You should have received a copy of the GNU Lesser General Public
017 * License along with Sonar; if not, write to the Free Software
018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
019 */
020 package org.sonar.duplications.index;
021
022 import com.google.common.collect.Lists;
023 import org.sonar.duplications.block.Block;
024 import org.sonar.duplications.block.ByteArray;
025 import org.sonar.duplications.utils.FastStringComparator;
026
027 import java.util.Collection;
028 import java.util.List;
029
030 /**
031 * Provides an index optimized by memory.
032 * <p>
033 * Each object in Java has an overhead - see
034 * <a href="http://devblog.streamy.com/2009/07/24/determine-size-of-java-object-class/">"HOWTO: Determine the size of a Java Object or Class"</a>.
035 * So to optimize memory consumption, we use flat arrays, however this increases time of queries.
036 * During usual detection of duplicates most time consuming method is a {@link #getByResourceId(String)}:
037 * around 50% of time spent in this class and number of invocations of this method is 1% of total invocations,
038 * however total time spent in this class less than 1 second for small projects and around 2 seconds for projects like JDK.
039 * </p>
040 * <p>
041 * Note that this implementation currently does not support deletion, however it's possible to implement.
042 * </p>
043 */
044 public class PackedMemoryCloneIndex extends AbstractCloneIndex {
045
046 private static final int DEFAULT_INITIAL_CAPACITY = 1024;
047
048 private static final int BLOCK_INTS = 5;
049
050 private final int hashInts;
051
052 private final int blockInts;
053
054 /**
055 * Indicates that index requires sorting to perform queries.
056 */
057 private boolean sorted;
058
059 /**
060 * Current number of blocks in index.
061 */
062 private int size;
063
064 private String[] resourceIds;
065 private int[] blockData;
066
067 private int[] resourceIdsIndex;
068
069 private final Block.Builder blockBuilder = Block.builder();
070
071 public PackedMemoryCloneIndex() {
072 this(8, DEFAULT_INITIAL_CAPACITY);
073 }
074
075 /**
076 * @param hashBytes size of hash in bytes
077 * @param initialCapacity the initial capacity
078 */
079 public PackedMemoryCloneIndex(int hashBytes, int initialCapacity) {
080 this.sorted = false;
081 this.hashInts = hashBytes / 4;
082 this.blockInts = hashInts + BLOCK_INTS;
083 this.size = 0;
084 this.resourceIds = new String[initialCapacity];
085 this.blockData = new int[initialCapacity * blockInts];
086 this.resourceIdsIndex = new int[initialCapacity];
087 }
088
089 /**
090 * {@inheritDoc}
091 * <p>
092 * <strong>Note that this implementation does not guarantee that blocks would be sorted by index.</strong>
093 * </p>
094 */
095 public Collection<Block> getByResourceId(String resourceId) {
096 ensureSorted();
097
098 // prepare resourceId for binary search
099 resourceIds[size] = resourceId;
100 resourceIdsIndex[size] = size;
101
102 int index = DataUtils.binarySearch(byResourceId);
103
104 List<Block> result = Lists.newArrayList();
105 int realIndex = resourceIdsIndex[index];
106 while (index < size && FastStringComparator.INSTANCE.compare(resourceIds[realIndex], resourceId) == 0) {
107 // extract block (note that there is no need to extract resourceId)
108 int offset = realIndex * blockInts;
109 int[] hash = new int[hashInts];
110 for (int j = 0; j < hashInts; j++) {
111 hash[j] = blockData[offset++];
112 }
113 int indexInFile = blockData[offset++];
114 int firstLineNumber = blockData[offset++];
115 int lastLineNumber = blockData[offset++];
116 int startUnit = blockData[offset++];
117 int endUnit = blockData[offset];
118
119 Block block = blockBuilder
120 .setResourceId(resourceId)
121 .setBlockHash(new ByteArray(hash))
122 .setIndexInFile(indexInFile)
123 .setLines(firstLineNumber, lastLineNumber)
124 .setUnit(startUnit, endUnit)
125 .build();
126 result.add(block);
127
128 index++;
129 realIndex = resourceIdsIndex[index];
130 }
131 return result;
132 }
133
134 /**
135 * {@inheritDoc}
136 */
137 public Collection<Block> getBySequenceHash(ByteArray sequenceHash) {
138 ensureSorted();
139
140 // prepare hash for binary search
141 int[] hash = sequenceHash.toIntArray();
142 if (hash.length != hashInts) {
143 throw new IllegalArgumentException("Expected " + hashInts + " ints in hash, but got " + hash.length);
144 }
145 int offset = size * blockInts;
146 for (int i = 0; i < hashInts; i++) {
147 blockData[offset++] = hash[i];
148 }
149
150 int index = DataUtils.binarySearch(byBlockHash);
151
152 List<Block> result = Lists.newArrayList();
153 while (index < size && !isLessByHash(size, index)) {
154 // extract block (note that there is no need to extract hash)
155 String resourceId = resourceIds[index];
156 offset = index * blockInts + hashInts;
157 int indexInFile = blockData[offset++];
158 int firstLineNumber = blockData[offset++];
159 int lastLineNumber = blockData[offset++];
160 int startUnit = blockData[offset++];
161 int endUnit = blockData[offset];
162
163 Block block = blockBuilder
164 .setResourceId(resourceId)
165 .setBlockHash(sequenceHash)
166 .setIndexInFile(indexInFile)
167 .setLines(firstLineNumber, lastLineNumber)
168 .setUnit(startUnit, endUnit)
169 .build();
170 result.add(block);
171 index++;
172 }
173 return result;
174 }
175
176 /**
177 * {@inheritDoc}
178 * <p>
179 * <strong>Note that this implementation allows insertion of two blocks with same index for one resource.</strong>
180 * </p>
181 */
182 public void insert(Block block) {
183 sorted = false;
184 ensureCapacity();
185
186 resourceIds[size] = block.getResourceId();
187
188 int[] hash = block.getBlockHash().toIntArray();
189 if (hash.length != hashInts) {
190 throw new IllegalArgumentException("Expected " + hashInts + " ints in hash, but got " + hash.length);
191 }
192 int offset = size * blockInts;
193 for (int i = 0; i < hashInts; i++) {
194 blockData[offset++] = hash[i];
195 }
196 blockData[offset++] = block.getIndexInFile();
197 blockData[offset++] = block.getStartLine();
198 blockData[offset++] = block.getEndLine();
199 blockData[offset++] = block.getStartUnit();
200 blockData[offset] = block.getEndUnit();
201
202 size++;
203 }
204
205 /**
206 * Increases the capacity, if necessary.
207 */
208 private void ensureCapacity() {
209 if (size < resourceIds.length) {
210 return;
211 }
212 int newCapacity = (resourceIds.length * 3) / 2 + 1;
213 // Increase size of resourceIds
214 String[] oldResourceIds = resourceIds;
215 resourceIds = new String[newCapacity];
216 System.arraycopy(oldResourceIds, 0, resourceIds, 0, oldResourceIds.length);
217 // Increase size of blockData
218 int[] oldBlockData = blockData;
219 blockData = new int[newCapacity * blockInts];
220 System.arraycopy(oldBlockData, 0, blockData, 0, oldBlockData.length);
221 // Increase size of byResourceIndices (no need to copy old, because would be restored in method ensureSorted)
222 resourceIdsIndex = new int[newCapacity];
223 sorted = false;
224 }
225
226 /**
227 * Performs sorting, if necessary.
228 */
229 private void ensureSorted() {
230 if (sorted) {
231 return;
232 }
233
234 ensureCapacity();
235
236 DataUtils.sort(byBlockHash);
237 for (int i = 0; i < size; i++) {
238 resourceIdsIndex[i] = i;
239 }
240 DataUtils.sort(byResourceId);
241
242 sorted = true;
243 }
244
245 private boolean isLessByHash(int i, int j) {
246 i *= blockInts;
247 j *= blockInts;
248 for (int k = 0; k < hashInts; k++, i++, j++) {
249 if (blockData[i] < blockData[j]) {
250 return true;
251 }
252 if (blockData[i] > blockData[j]) {
253 return false;
254 }
255 }
256 return false;
257 }
258
259 private final DataUtils.Sortable byBlockHash = new DataUtils.Sortable() {
260 public void swap(int i, int j) {
261 String tmp = resourceIds[i];
262 resourceIds[i] = resourceIds[j];
263 resourceIds[j] = tmp;
264
265 i *= blockInts;
266 j *= blockInts;
267 for (int k = 0; k < blockInts; k++, i++, j++) {
268 int x = blockData[i];
269 blockData[i] = blockData[j];
270 blockData[j] = x;
271 }
272 }
273
274 public boolean isLess(int i, int j) {
275 return isLessByHash(i, j);
276 }
277
278 public int size() {
279 return size;
280 }
281 };
282
283 private final DataUtils.Sortable byResourceId = new DataUtils.Sortable() {
284 public void swap(int i, int j) {
285 int tmp = resourceIdsIndex[i];
286 resourceIdsIndex[i] = resourceIdsIndex[j];
287 resourceIdsIndex[j] = tmp;
288 }
289
290 public boolean isLess(int i, int j) {
291 String s1 = resourceIds[resourceIdsIndex[i]];
292 String s2 = resourceIds[resourceIdsIndex[j]];
293 return FastStringComparator.INSTANCE.compare(s1, s2) < 0;
294 }
295
296 public int size() {
297 return size;
298 }
299 };
300
301 }