001/* 002 * Sonar, open source software quality management tool. 003 * Copyright (C) 2008-2012 SonarSource 004 * mailto:contact AT sonarsource DOT com 005 * 006 * Sonar is free software; you can redistribute it and/or 007 * modify it under the terms of the GNU Lesser General Public 008 * License as published by the Free Software Foundation; either 009 * version 3 of the License, or (at your option) any later version. 010 * 011 * Sonar is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014 * Lesser General Public License for more details. 015 * 016 * You should have received a copy of the GNU Lesser General Public 017 * License along with Sonar; if not, write to the Free Software 018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 019 */ 020package org.sonar.duplications.index; 021 022import com.google.common.collect.Lists; 023import org.sonar.duplications.block.Block; 024import org.sonar.duplications.block.ByteArray; 025import org.sonar.duplications.utils.FastStringComparator; 026 027import java.util.Collection; 028import java.util.List; 029 030/** 031 * Provides an index optimized by memory. 032 * <p> 033 * Each object in Java has an overhead - see 034 * <a href="http://devblog.streamy.com/2009/07/24/determine-size-of-java-object-class/">"HOWTO: Determine the size of a Java Object or Class"</a>. 035 * So to optimize memory consumption, we use flat arrays, however this increases time of queries. 036 * During usual detection of duplicates most time consuming method is a {@link #getByResourceId(String)}: 037 * around 50% of time spent in this class and number of invocations of this method is 1% of total invocations, 038 * however total time spent in this class less than 1 second for small projects and around 2 seconds for projects like JDK. 039 * </p> 040 * <p> 041 * Note that this implementation currently does not support deletion, however it's possible to implement. 042 * </p> 043 */ 044public class PackedMemoryCloneIndex extends AbstractCloneIndex { 045 046 private static final int DEFAULT_INITIAL_CAPACITY = 1024; 047 048 private static final int BLOCK_INTS = 5; 049 050 private final int hashInts; 051 052 private final int blockInts; 053 054 /** 055 * Indicates that index requires sorting to perform queries. 056 */ 057 private boolean sorted; 058 059 /** 060 * Current number of blocks in index. 061 */ 062 private int size; 063 064 private String[] resourceIds; 065 private int[] blockData; 066 067 private int[] resourceIdsIndex; 068 069 private final Block.Builder blockBuilder = Block.builder(); 070 071 public PackedMemoryCloneIndex() { 072 this(8, DEFAULT_INITIAL_CAPACITY); 073 } 074 075 /** 076 * @param hashBytes size of hash in bytes 077 * @param initialCapacity the initial capacity 078 */ 079 public PackedMemoryCloneIndex(int hashBytes, int initialCapacity) { 080 this.sorted = false; 081 this.hashInts = hashBytes / 4; 082 this.blockInts = hashInts + BLOCK_INTS; 083 this.size = 0; 084 this.resourceIds = new String[initialCapacity]; 085 this.blockData = new int[initialCapacity * blockInts]; 086 this.resourceIdsIndex = new int[initialCapacity]; 087 } 088 089 /** 090 * {@inheritDoc} 091 * <p> 092 * <strong>Note that this implementation does not guarantee that blocks would be sorted by index.</strong> 093 * </p> 094 */ 095 public Collection<Block> getByResourceId(String resourceId) { 096 ensureSorted(); 097 098 // prepare resourceId for binary search 099 resourceIds[size] = resourceId; 100 resourceIdsIndex[size] = size; 101 102 int index = DataUtils.binarySearch(byResourceId); 103 104 List<Block> result = Lists.newArrayList(); 105 int realIndex = resourceIdsIndex[index]; 106 while (index < size && FastStringComparator.INSTANCE.compare(resourceIds[realIndex], resourceId) == 0) { 107 // extract block (note that there is no need to extract resourceId) 108 int offset = realIndex * blockInts; 109 int[] hash = new int[hashInts]; 110 for (int j = 0; j < hashInts; j++) { 111 hash[j] = blockData[offset++]; 112 } 113 int indexInFile = blockData[offset++]; 114 int firstLineNumber = blockData[offset++]; 115 int lastLineNumber = blockData[offset++]; 116 int startUnit = blockData[offset++]; 117 int endUnit = blockData[offset]; 118 119 Block block = blockBuilder 120 .setResourceId(resourceId) 121 .setBlockHash(new ByteArray(hash)) 122 .setIndexInFile(indexInFile) 123 .setLines(firstLineNumber, lastLineNumber) 124 .setUnit(startUnit, endUnit) 125 .build(); 126 result.add(block); 127 128 index++; 129 realIndex = resourceIdsIndex[index]; 130 } 131 return result; 132 } 133 134 /** 135 * {@inheritDoc} 136 */ 137 public Collection<Block> getBySequenceHash(ByteArray sequenceHash) { 138 ensureSorted(); 139 140 // prepare hash for binary search 141 int[] hash = sequenceHash.toIntArray(); 142 if (hash.length != hashInts) { 143 throw new IllegalArgumentException("Expected " + hashInts + " ints in hash, but got " + hash.length); 144 } 145 int offset = size * blockInts; 146 for (int i = 0; i < hashInts; i++) { 147 blockData[offset++] = hash[i]; 148 } 149 150 int index = DataUtils.binarySearch(byBlockHash); 151 152 List<Block> result = Lists.newArrayList(); 153 while (index < size && !isLessByHash(size, index)) { 154 // extract block (note that there is no need to extract hash) 155 String resourceId = resourceIds[index]; 156 offset = index * blockInts + hashInts; 157 int indexInFile = blockData[offset++]; 158 int firstLineNumber = blockData[offset++]; 159 int lastLineNumber = blockData[offset++]; 160 int startUnit = blockData[offset++]; 161 int endUnit = blockData[offset]; 162 163 Block block = blockBuilder 164 .setResourceId(resourceId) 165 .setBlockHash(sequenceHash) 166 .setIndexInFile(indexInFile) 167 .setLines(firstLineNumber, lastLineNumber) 168 .setUnit(startUnit, endUnit) 169 .build(); 170 result.add(block); 171 index++; 172 } 173 return result; 174 } 175 176 /** 177 * {@inheritDoc} 178 * <p> 179 * <strong>Note that this implementation allows insertion of two blocks with same index for one resource.</strong> 180 * </p> 181 */ 182 public void insert(Block block) { 183 sorted = false; 184 ensureCapacity(); 185 186 resourceIds[size] = block.getResourceId(); 187 188 int[] hash = block.getBlockHash().toIntArray(); 189 if (hash.length != hashInts) { 190 throw new IllegalArgumentException("Expected " + hashInts + " ints in hash, but got " + hash.length); 191 } 192 int offset = size * blockInts; 193 for (int i = 0; i < hashInts; i++) { 194 blockData[offset++] = hash[i]; 195 } 196 blockData[offset++] = block.getIndexInFile(); 197 blockData[offset++] = block.getStartLine(); 198 blockData[offset++] = block.getEndLine(); 199 blockData[offset++] = block.getStartUnit(); 200 blockData[offset] = block.getEndUnit(); 201 202 size++; 203 } 204 205 /** 206 * Increases the capacity, if necessary. 207 */ 208 private void ensureCapacity() { 209 if (size < resourceIds.length) { 210 return; 211 } 212 int newCapacity = (resourceIds.length * 3) / 2 + 1; 213 // Increase size of resourceIds 214 String[] oldResourceIds = resourceIds; 215 resourceIds = new String[newCapacity]; 216 System.arraycopy(oldResourceIds, 0, resourceIds, 0, oldResourceIds.length); 217 // Increase size of blockData 218 int[] oldBlockData = blockData; 219 blockData = new int[newCapacity * blockInts]; 220 System.arraycopy(oldBlockData, 0, blockData, 0, oldBlockData.length); 221 // Increase size of byResourceIndices (no need to copy old, because would be restored in method ensureSorted) 222 resourceIdsIndex = new int[newCapacity]; 223 sorted = false; 224 } 225 226 /** 227 * Performs sorting, if necessary. 228 */ 229 private void ensureSorted() { 230 if (sorted) { 231 return; 232 } 233 234 ensureCapacity(); 235 236 DataUtils.sort(byBlockHash); 237 for (int i = 0; i < size; i++) { 238 resourceIdsIndex[i] = i; 239 } 240 DataUtils.sort(byResourceId); 241 242 sorted = true; 243 } 244 245 private boolean isLessByHash(int i, int j) { 246 i *= blockInts; 247 j *= blockInts; 248 for (int k = 0; k < hashInts; k++, i++, j++) { 249 if (blockData[i] < blockData[j]) { 250 return true; 251 } 252 if (blockData[i] > blockData[j]) { 253 return false; 254 } 255 } 256 return false; 257 } 258 259 private final DataUtils.Sortable byBlockHash = new DataUtils.Sortable() { 260 public void swap(int i, int j) { 261 String tmp = resourceIds[i]; 262 resourceIds[i] = resourceIds[j]; 263 resourceIds[j] = tmp; 264 265 i *= blockInts; 266 j *= blockInts; 267 for (int k = 0; k < blockInts; k++, i++, j++) { 268 int x = blockData[i]; 269 blockData[i] = blockData[j]; 270 blockData[j] = x; 271 } 272 } 273 274 public boolean isLess(int i, int j) { 275 return isLessByHash(i, j); 276 } 277 278 public int size() { 279 return size; 280 } 281 }; 282 283 private final DataUtils.Sortable byResourceId = new DataUtils.Sortable() { 284 public void swap(int i, int j) { 285 int tmp = resourceIdsIndex[i]; 286 resourceIdsIndex[i] = resourceIdsIndex[j]; 287 resourceIdsIndex[j] = tmp; 288 } 289 290 public boolean isLess(int i, int j) { 291 String s1 = resourceIds[resourceIdsIndex[i]]; 292 String s2 = resourceIds[resourceIdsIndex[j]]; 293 return FastStringComparator.INSTANCE.compare(s1, s2) < 0; 294 } 295 296 public int size() { 297 return size; 298 } 299 }; 300 301}