001/* 002 * SonarQube 003 * Copyright (C) 2009-2016 SonarSource SA 004 * mailto:contact AT sonarsource DOT com 005 * 006 * This program is free software; you can redistribute it and/or 007 * modify it under the terms of the GNU Lesser General Public 008 * License as published by the Free Software Foundation; either 009 * version 3 of the License, or (at your option) any later version. 010 * 011 * This program is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014 * Lesser General Public License for more details. 015 * 016 * You should have received a copy of the GNU Lesser General Public License 017 * along with this program; if not, write to the Free Software Foundation, 018 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 019 */ 020package org.sonar.api.batch.fs.internal; 021 022import java.io.BufferedReader; 023import java.io.File; 024import java.io.FileInputStream; 025import java.io.IOException; 026import java.io.InputStreamReader; 027import java.io.Reader; 028import java.nio.ByteBuffer; 029import java.nio.CharBuffer; 030import java.nio.charset.CharacterCodingException; 031import java.nio.charset.Charset; 032import java.nio.charset.CharsetEncoder; 033import java.nio.charset.CodingErrorAction; 034import java.nio.charset.StandardCharsets; 035import java.security.MessageDigest; 036import javax.annotation.CheckForNull; 037import javax.annotation.Nullable; 038import org.apache.commons.codec.binary.Hex; 039import org.apache.commons.codec.digest.DigestUtils; 040import org.apache.commons.io.ByteOrderMark; 041import org.apache.commons.io.input.BOMInputStream; 042import org.sonar.api.CoreProperties; 043import org.sonar.api.batch.ScannerSide; 044import org.sonar.api.utils.log.Logger; 045import org.sonar.api.utils.log.Loggers; 046 047/** 048 * Computes hash of files. Ends of Lines are ignored, so files with 049 * same content but different EOL encoding have the same hash. 050 */ 051@ScannerSide 052public class FileMetadata { 053 054 private static final Logger LOG = Loggers.get(FileMetadata.class); 055 056 private static final char LINE_FEED = '\n'; 057 private static final char CARRIAGE_RETURN = '\r'; 058 059 public abstract static class CharHandler { 060 061 protected void handleAll(char c) { 062 } 063 064 protected void handleIgnoreEoL(char c) { 065 } 066 067 protected void newLine() { 068 } 069 070 protected void eof() { 071 } 072 } 073 074 private static class LineCounter extends CharHandler { 075 private int lines = 1; 076 private int nonBlankLines = 0; 077 private boolean blankLine = true; 078 boolean alreadyLoggedInvalidCharacter = false; 079 private final File file; 080 private final Charset encoding; 081 082 LineCounter(File file, Charset encoding) { 083 this.file = file; 084 this.encoding = encoding; 085 } 086 087 @Override 088 protected void handleAll(char c) { 089 if (!alreadyLoggedInvalidCharacter && c == '\ufffd') { 090 LOG.warn("Invalid character encountered in file {} at line {} for encoding {}. Please fix file content or configure the encoding to be used using property '{}'.", file, 091 lines, encoding, CoreProperties.ENCODING_PROPERTY); 092 alreadyLoggedInvalidCharacter = true; 093 } 094 } 095 096 @Override 097 protected void newLine() { 098 lines++; 099 if (!blankLine) { 100 nonBlankLines++; 101 } 102 blankLine = true; 103 } 104 105 @Override 106 protected void handleIgnoreEoL(char c) { 107 if (!Character.isWhitespace(c)) { 108 blankLine = false; 109 } 110 } 111 112 @Override 113 protected void eof() { 114 if (!blankLine) { 115 nonBlankLines++; 116 } 117 } 118 119 public int lines() { 120 return lines; 121 } 122 123 public int nonBlankLines() { 124 return nonBlankLines; 125 } 126 127 } 128 129 private static class FileHashComputer extends CharHandler { 130 private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest(); 131 private StringBuilder sb = new StringBuilder(); 132 private final CharsetEncoder encoder; 133 private final File file; 134 135 public FileHashComputer(File f) { 136 encoder = StandardCharsets.UTF_8.newEncoder() 137 .onMalformedInput(CodingErrorAction.REPLACE) 138 .onUnmappableCharacter(CodingErrorAction.REPLACE); 139 file = f; 140 } 141 142 @Override 143 protected void handleIgnoreEoL(char c) { 144 sb.append(c); 145 } 146 147 @Override 148 protected void newLine() { 149 sb.append(LINE_FEED); 150 processBuffer(); 151 sb.setLength(0); 152 } 153 154 @Override 155 protected void eof() { 156 if (sb.length() > 0) { 157 processBuffer(); 158 } 159 } 160 161 private void processBuffer() { 162 try { 163 if (sb.length() > 0) { 164 ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb)); 165 globalMd5Digest.update(encoded.array(), 0, encoded.limit()); 166 } 167 } catch (CharacterCodingException e) { 168 throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e); 169 } 170 } 171 172 @CheckForNull 173 public String getHash() { 174 return Hex.encodeHexString(globalMd5Digest.digest()); 175 } 176 } 177 178 private static class LineHashComputer extends CharHandler { 179 private final MessageDigest lineMd5Digest = DigestUtils.getMd5Digest(); 180 private final CharsetEncoder encoder; 181 private final StringBuilder sb = new StringBuilder(); 182 private final LineHashConsumer consumer; 183 private final File file; 184 private int line = 1; 185 186 public LineHashComputer(LineHashConsumer consumer, File f) { 187 this.consumer = consumer; 188 this.file = f; 189 this.encoder = StandardCharsets.UTF_8.newEncoder() 190 .onMalformedInput(CodingErrorAction.REPLACE) 191 .onUnmappableCharacter(CodingErrorAction.REPLACE); 192 } 193 194 @Override 195 protected void handleIgnoreEoL(char c) { 196 if (!Character.isWhitespace(c)) { 197 sb.append(c); 198 } 199 } 200 201 @Override 202 protected void newLine() { 203 processBuffer(); 204 sb.setLength(0); 205 line++; 206 } 207 208 @Override 209 protected void eof() { 210 if (this.line > 0) { 211 processBuffer(); 212 } 213 } 214 215 private void processBuffer() { 216 try { 217 if (sb.length() > 0) { 218 ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb)); 219 lineMd5Digest.update(encoded.array(), 0, encoded.limit()); 220 consumer.consume(line, lineMd5Digest.digest()); 221 } 222 } catch (CharacterCodingException e) { 223 throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e); 224 } 225 } 226 } 227 228 private static class LineOffsetCounter extends CharHandler { 229 private long currentOriginalOffset = 0; 230 private IntArrayList originalLineOffsets = new IntArrayList(); 231 private long lastValidOffset = 0; 232 233 public LineOffsetCounter() { 234 originalLineOffsets.add(0); 235 } 236 237 @Override 238 protected void handleAll(char c) { 239 currentOriginalOffset++; 240 } 241 242 @Override 243 protected void newLine() { 244 if (currentOriginalOffset > Integer.MAX_VALUE) { 245 throw new IllegalStateException("File is too big: " + currentOriginalOffset); 246 } 247 originalLineOffsets.add((int) currentOriginalOffset); 248 } 249 250 @Override 251 protected void eof() { 252 lastValidOffset = currentOriginalOffset; 253 } 254 255 public int[] getOriginalLineOffsets() { 256 return originalLineOffsets.trimAndGet(); 257 } 258 259 public int getLastValidOffset() { 260 if (lastValidOffset > Integer.MAX_VALUE) { 261 throw new IllegalStateException("File is too big: " + lastValidOffset); 262 } 263 return (int) lastValidOffset; 264 } 265 266 } 267 268 /** 269 * Compute hash of a file ignoring line ends differences. 270 * Maximum performance is needed. 271 */ 272 public Metadata readMetadata(File file, Charset encoding) { 273 LineCounter lineCounter = new LineCounter(file, encoding); 274 FileHashComputer fileHashComputer = new FileHashComputer(file); 275 LineOffsetCounter lineOffsetCounter = new LineOffsetCounter(); 276 readFile(file, encoding, lineCounter, fileHashComputer, lineOffsetCounter); 277 return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(), 278 lineOffsetCounter.getLastValidOffset()); 279 } 280 281 /** 282 * For testing purpose 283 */ 284 public Metadata readMetadata(Reader reader) { 285 LineCounter lineCounter = new LineCounter(new File("fromString"), StandardCharsets.UTF_16); 286 FileHashComputer fileHashComputer = new FileHashComputer(new File("fromString")); 287 LineOffsetCounter lineOffsetCounter = new LineOffsetCounter(); 288 try { 289 read(reader, lineCounter, fileHashComputer, lineOffsetCounter); 290 } catch (IOException e) { 291 throw new IllegalStateException("Should never occurs", e); 292 } 293 return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(), 294 lineOffsetCounter.getLastValidOffset()); 295 } 296 297 public static void readFile(File file, Charset encoding, CharHandler... handlers) { 298 try (BOMInputStream bomIn = new BOMInputStream(new FileInputStream(file), 299 ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); 300 Reader reader = new BufferedReader(new InputStreamReader(bomIn, encoding))) { 301 read(reader, handlers); 302 } catch (IOException e) { 303 throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", file.getAbsolutePath(), encoding), e); 304 } 305 } 306 307 private static void read(Reader reader, CharHandler... handlers) throws IOException { 308 char c; 309 int i = reader.read(); 310 boolean afterCR = false; 311 while (i != -1) { 312 c = (char) i; 313 if (afterCR) { 314 for (CharHandler handler : handlers) { 315 if (c == CARRIAGE_RETURN) { 316 handler.newLine(); 317 handler.handleAll(c); 318 } else if (c == LINE_FEED) { 319 handler.handleAll(c); 320 handler.newLine(); 321 } else { 322 handler.newLine(); 323 handler.handleIgnoreEoL(c); 324 handler.handleAll(c); 325 } 326 } 327 afterCR = c == CARRIAGE_RETURN; 328 } else if (c == LINE_FEED) { 329 for (CharHandler handler : handlers) { 330 handler.handleAll(c); 331 handler.newLine(); 332 } 333 } else if (c == CARRIAGE_RETURN) { 334 afterCR = true; 335 for (CharHandler handler : handlers) { 336 handler.handleAll(c); 337 } 338 } else { 339 for (CharHandler handler : handlers) { 340 handler.handleIgnoreEoL(c); 341 handler.handleAll(c); 342 } 343 } 344 i = reader.read(); 345 } 346 for (CharHandler handler : handlers) { 347 if (afterCR) { 348 handler.newLine(); 349 } 350 handler.eof(); 351 } 352 } 353 354 public static class Metadata { 355 final int lines; 356 final int nonBlankLines; 357 final String hash; 358 final int[] originalLineOffsets; 359 final int lastValidOffset; 360 361 private Metadata(int lines, int nonBlankLines, String hash, int[] originalLineOffsets, int lastValidOffset) { 362 this.lines = lines; 363 this.nonBlankLines = nonBlankLines; 364 this.hash = hash; 365 this.originalLineOffsets = originalLineOffsets; 366 this.lastValidOffset = lastValidOffset; 367 } 368 } 369 370 public interface LineHashConsumer { 371 372 void consume(int lineIdx, @Nullable byte[] hash); 373 374 } 375 376 /** 377 * Compute a MD5 hash of each line of the file after removing of all blank chars 378 */ 379 public static void computeLineHashesForIssueTracking(DefaultInputFile f, LineHashConsumer consumer) { 380 readFile(f.file(), f.charset(), new LineHashComputer(consumer, f.file())); 381 } 382}