001/* 002 * SonarQube 003 * Copyright (C) 2009-2016 SonarSource SA 004 * mailto:contact AT sonarsource DOT com 005 * 006 * This program is free software; you can redistribute it and/or 007 * modify it under the terms of the GNU Lesser General Public 008 * License as published by the Free Software Foundation; either 009 * version 3 of the License, or (at your option) any later version. 010 * 011 * This program is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014 * Lesser General Public License for more details. 015 * 016 * You should have received a copy of the GNU Lesser General Public License 017 * along with this program; if not, write to the Free Software Foundation, 018 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 019 */ 020package org.sonar.api.batch.fs.internal; 021 022import com.google.common.primitives.Ints; 023import java.io.BufferedReader; 024import java.io.File; 025import java.io.FileInputStream; 026import java.io.IOException; 027import java.io.InputStreamReader; 028import java.io.Reader; 029import java.nio.ByteBuffer; 030import java.nio.CharBuffer; 031import java.nio.charset.CharacterCodingException; 032import java.nio.charset.Charset; 033import java.nio.charset.CharsetEncoder; 034import java.nio.charset.CodingErrorAction; 035import java.nio.charset.StandardCharsets; 036import java.security.MessageDigest; 037import java.util.ArrayList; 038import java.util.List; 039import javax.annotation.CheckForNull; 040import javax.annotation.Nullable; 041import org.apache.commons.codec.binary.Hex; 042import org.apache.commons.codec.digest.DigestUtils; 043import org.apache.commons.io.ByteOrderMark; 044import org.apache.commons.io.input.BOMInputStream; 045import org.sonar.api.CoreProperties; 046import org.sonar.api.batch.BatchSide; 047import org.sonar.api.utils.log.Logger; 048import org.sonar.api.utils.log.Loggers; 049 050/** 051 * Computes hash of files. Ends of Lines are ignored, so files with 052 * same content but different EOL encoding have the same hash. 053 */ 054@BatchSide 055public class FileMetadata { 056 057 private static final Logger LOG = Loggers.get(FileMetadata.class); 058 059 private static final char LINE_FEED = '\n'; 060 private static final char CARRIAGE_RETURN = '\r'; 061 062 public abstract static class CharHandler { 063 064 protected void handleAll(char c) { 065 } 066 067 protected void handleIgnoreEoL(char c) { 068 } 069 070 protected void newLine() { 071 } 072 073 protected void eof() { 074 } 075 } 076 077 private static class LineCounter extends CharHandler { 078 private int lines = 1; 079 private int nonBlankLines = 0; 080 private boolean blankLine = true; 081 boolean alreadyLoggedInvalidCharacter = false; 082 private final File file; 083 private final Charset encoding; 084 085 LineCounter(File file, Charset encoding) { 086 this.file = file; 087 this.encoding = encoding; 088 } 089 090 @Override 091 protected void handleAll(char c) { 092 if (!alreadyLoggedInvalidCharacter && c == '\ufffd') { 093 LOG.warn("Invalid character encountered in file {} at line {} for encoding {}. Please fix file content or configure the encoding to be used using property '{}'.", file, 094 lines, encoding, CoreProperties.ENCODING_PROPERTY); 095 alreadyLoggedInvalidCharacter = true; 096 } 097 } 098 099 @Override 100 protected void newLine() { 101 lines++; 102 if (!blankLine) { 103 nonBlankLines++; 104 } 105 blankLine = true; 106 } 107 108 @Override 109 protected void handleIgnoreEoL(char c) { 110 if (!Character.isWhitespace(c)) { 111 blankLine = false; 112 } 113 } 114 115 @Override 116 protected void eof() { 117 if (!blankLine) { 118 nonBlankLines++; 119 } 120 } 121 122 public int lines() { 123 return lines; 124 } 125 126 public int nonBlankLines() { 127 return nonBlankLines; 128 } 129 130 } 131 132 private static class FileHashComputer extends CharHandler { 133 private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest(); 134 private StringBuilder sb = new StringBuilder(); 135 private final CharsetEncoder encoder; 136 private final File file; 137 138 public FileHashComputer(File f) { 139 encoder = StandardCharsets.UTF_8.newEncoder() 140 .onMalformedInput(CodingErrorAction.REPLACE) 141 .onUnmappableCharacter(CodingErrorAction.REPLACE); 142 file = f; 143 } 144 145 @Override 146 protected void handleIgnoreEoL(char c) { 147 sb.append(c); 148 } 149 150 @Override 151 protected void newLine() { 152 sb.append(LINE_FEED); 153 processBuffer(); 154 sb.setLength(0); 155 } 156 157 @Override 158 protected void eof() { 159 if (sb.length() > 0) { 160 processBuffer(); 161 } 162 } 163 164 private void processBuffer() { 165 try { 166 if (sb.length() > 0) { 167 ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb)); 168 globalMd5Digest.update(encoded.array(), 0, encoded.limit()); 169 } 170 } catch (CharacterCodingException e) { 171 throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e); 172 } 173 } 174 175 @CheckForNull 176 public String getHash() { 177 return Hex.encodeHexString(globalMd5Digest.digest()); 178 } 179 } 180 181 private static class LineHashComputer extends CharHandler { 182 private final MessageDigest lineMd5Digest = DigestUtils.getMd5Digest(); 183 private final CharsetEncoder encoder; 184 private final StringBuilder sb = new StringBuilder(); 185 private final LineHashConsumer consumer; 186 private final File file; 187 private int line = 1; 188 189 public LineHashComputer(LineHashConsumer consumer, File f) { 190 this.consumer = consumer; 191 this.file = f; 192 this.encoder = StandardCharsets.UTF_8.newEncoder() 193 .onMalformedInput(CodingErrorAction.REPLACE) 194 .onUnmappableCharacter(CodingErrorAction.REPLACE); 195 } 196 197 @Override 198 protected void handleIgnoreEoL(char c) { 199 if (!Character.isWhitespace(c)) { 200 sb.append(c); 201 } 202 } 203 204 @Override 205 protected void newLine() { 206 processBuffer(); 207 sb.setLength(0); 208 line++; 209 } 210 211 @Override 212 protected void eof() { 213 if (this.line > 0) { 214 processBuffer(); 215 } 216 } 217 218 private void processBuffer() { 219 try { 220 if (sb.length() > 0) { 221 ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb)); 222 lineMd5Digest.update(encoded.array(), 0, encoded.limit()); 223 consumer.consume(line, lineMd5Digest.digest()); 224 } 225 } catch (CharacterCodingException e) { 226 throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e); 227 } 228 } 229 } 230 231 private static class LineOffsetCounter extends CharHandler { 232 private int currentOriginalOffset = 0; 233 private List<Integer> originalLineOffsets = new ArrayList<>(); 234 private int lastValidOffset = 0; 235 236 public LineOffsetCounter() { 237 originalLineOffsets.add(0); 238 } 239 240 @Override 241 protected void handleAll(char c) { 242 currentOriginalOffset++; 243 } 244 245 @Override 246 protected void newLine() { 247 originalLineOffsets.add(currentOriginalOffset); 248 } 249 250 @Override 251 protected void eof() { 252 lastValidOffset = currentOriginalOffset; 253 } 254 255 public List<Integer> getOriginalLineOffsets() { 256 return originalLineOffsets; 257 } 258 259 public int getLastValidOffset() { 260 return lastValidOffset; 261 } 262 263 } 264 265 /** 266 * Compute hash of a file ignoring line ends differences. 267 * Maximum performance is needed. 268 */ 269 public Metadata readMetadata(File file, Charset encoding) { 270 LineCounter lineCounter = new LineCounter(file, encoding); 271 FileHashComputer fileHashComputer = new FileHashComputer(file); 272 LineOffsetCounter lineOffsetCounter = new LineOffsetCounter(); 273 readFile(file, encoding, lineCounter, fileHashComputer, lineOffsetCounter); 274 return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(), 275 lineOffsetCounter.getLastValidOffset()); 276 } 277 278 /** 279 * For testing purpose 280 */ 281 public Metadata readMetadata(Reader reader) { 282 LineCounter lineCounter = new LineCounter(new File("fromString"), StandardCharsets.UTF_16); 283 FileHashComputer fileHashComputer = new FileHashComputer(new File("fromString")); 284 LineOffsetCounter lineOffsetCounter = new LineOffsetCounter(); 285 try { 286 read(reader, lineCounter, fileHashComputer, lineOffsetCounter); 287 } catch (IOException e) { 288 throw new IllegalStateException("Should never occurs", e); 289 } 290 return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(), 291 lineOffsetCounter.getLastValidOffset()); 292 } 293 294 public static void readFile(File file, Charset encoding, CharHandler... handlers) { 295 try (BOMInputStream bomIn = new BOMInputStream(new FileInputStream(file), 296 ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); 297 Reader reader = new BufferedReader(new InputStreamReader(bomIn, encoding))) { 298 read(reader, handlers); 299 } catch (IOException e) { 300 throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", file.getAbsolutePath(), encoding), e); 301 } 302 } 303 304 private static void read(Reader reader, CharHandler... handlers) throws IOException { 305 char c; 306 int i = reader.read(); 307 boolean afterCR = false; 308 while (i != -1) { 309 c = (char) i; 310 if (afterCR) { 311 for (CharHandler handler : handlers) { 312 if (c == CARRIAGE_RETURN) { 313 handler.newLine(); 314 handler.handleAll(c); 315 } else if (c == LINE_FEED) { 316 handler.handleAll(c); 317 handler.newLine(); 318 } else { 319 handler.newLine(); 320 handler.handleIgnoreEoL(c); 321 handler.handleAll(c); 322 } 323 } 324 afterCR = c == CARRIAGE_RETURN; 325 } else if (c == LINE_FEED) { 326 for (CharHandler handler : handlers) { 327 handler.handleAll(c); 328 handler.newLine(); 329 } 330 } else if (c == CARRIAGE_RETURN) { 331 afterCR = true; 332 for (CharHandler handler : handlers) { 333 handler.handleAll(c); 334 } 335 } else { 336 for (CharHandler handler : handlers) { 337 handler.handleIgnoreEoL(c); 338 handler.handleAll(c); 339 } 340 } 341 i = reader.read(); 342 } 343 for (CharHandler handler : handlers) { 344 if (afterCR) { 345 handler.newLine(); 346 } 347 handler.eof(); 348 } 349 } 350 351 public static class Metadata { 352 final int lines; 353 final int nonBlankLines; 354 final String hash; 355 final int[] originalLineOffsets; 356 final int lastValidOffset; 357 358 private Metadata(int lines, int nonBlankLines, String hash, List<Integer> originalLineOffsets, int lastValidOffset) { 359 this.lines = lines; 360 this.nonBlankLines = nonBlankLines; 361 this.hash = hash; 362 this.originalLineOffsets = Ints.toArray(originalLineOffsets); 363 this.lastValidOffset = lastValidOffset; 364 } 365 } 366 367 public interface LineHashConsumer { 368 369 void consume(int lineIdx, @Nullable byte[] hash); 370 371 } 372 373 /** 374 * Compute a MD5 hash of each line of the file after removing of all blank chars 375 */ 376 public static void computeLineHashesForIssueTracking(DefaultInputFile f, LineHashConsumer consumer) { 377 readFile(f.file(), f.charset(), new LineHashComputer(consumer, f.file())); 378 } 379}