/*
 * SonarQube
 * Copyright (C) 2009-2017 SonarSource SA
 * mailto:info AT sonarsource DOT com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
package org.sonar.api.batch.fs.internal;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

import javax.annotation.CheckForNull;
import javax.annotation.Nullable;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.digest.DigestUtils;
import org.sonar.api.CoreProperties;
import org.sonar.api.batch.ScannerSide;
import org.sonar.api.batch.fs.InputFile;
import org.sonar.api.utils.log.Logger;
import org.sonar.api.utils.log.Loggers;

/**
 * Computes hash of files. Ends of Lines are ignored, so files with
 * same content but different EOL encoding have the same hash.
 */
@ScannerSide
public class FileMetadata {

  private static final Logger LOG = Loggers.get(FileMetadata.class);

  private static final char LINE_FEED = '\n';
  private static final char CARRIAGE_RETURN = '\r';

  /**
   * Callback invoked while streaming a file character by character.
   * Implementations override only the events they care about.
   */
  public abstract static class CharHandler {

    /**
     * Called for every character of the file, including EOL characters.
     */
    protected void handleAll(char c) {
    }

    /**
     * Called for every character of the file, excluding EOL characters.
     */
    protected void handleIgnoreEoL(char c) {
    }

    /**
     * Called once per end-of-line sequence ({@code \n}, {@code \r} or {@code \r\n}).
     */
    protected void newLine() {
    }

    /**
     * Called once when the end of the file is reached.
     */
    protected void eof() {
    }
  }

  /**
   * Counts total lines and non-blank lines. Also warns (once per file) when a
   * Unicode replacement character is seen, which indicates a decoding problem.
   */
  private static class LineCounter extends CharHandler {
    // A file always has at least one line, even when empty.
    private int lines = 1;
    private int nonBlankLines = 0;
    // True until a non-whitespace character is seen on the current line.
    private boolean blankLine = true;
    // Ensures the "invalid character" warning is logged at most once per file.
    boolean alreadyLoggedInvalidCharacter = false;
    private final String filePath;
    private final Charset encoding;

    LineCounter(String filePath, Charset encoding) {
      this.filePath = filePath;
      this.encoding = encoding;
    }

    @Override
    protected void handleAll(char c) {
      // U+FFFD is produced by the decoder when a byte sequence is invalid for
      // the configured encoding: warn the user to fix the encoding property.
      if (!alreadyLoggedInvalidCharacter && c == '\ufffd') {
        LOG.warn("Invalid character encountered in file {} at line {} for encoding {}. Please fix file content or configure the encoding to be used using property '{}'.", filePath,
          lines, encoding, CoreProperties.ENCODING_PROPERTY);
        alreadyLoggedInvalidCharacter = true;
      }
    }

    @Override
    protected void newLine() {
      lines++;
      if (!blankLine) {
        nonBlankLines++;
      }
      blankLine = true;
    }

    @Override
    protected void handleIgnoreEoL(char c) {
      if (!Character.isWhitespace(c)) {
        blankLine = false;
      }
    }

    @Override
    protected void eof() {
      // Account for a last line that is not terminated by an EOL.
      if (!blankLine) {
        nonBlankLines++;
      }
    }

    public int lines() {
      return lines;
    }

    public int nonBlankLines() {
      return nonBlankLines;
    }

  }

  /**
   * Computes a global MD5 hash of the file content. Lines are normalized to a
   * single {@code \n} terminator so that EOL style does not affect the hash.
   */
  private static class FileHashComputer extends CharHandler {
    private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest();
    // Buffers the current line; flushed into the digest at each EOL and at EOF.
    private StringBuilder sb = new StringBuilder();
    private final CharsetEncoder encoder;
    private final String filePath;

    public FileHashComputer(String filePath) {
      // REPLACE keeps hashing deterministic even for unmappable characters.
      encoder = StandardCharsets.UTF_8.newEncoder()
        .onMalformedInput(CodingErrorAction.REPLACE)
        .onUnmappableCharacter(CodingErrorAction.REPLACE);
      this.filePath = filePath;
    }

    @Override
    protected void handleIgnoreEoL(char c) {
      sb.append(c);
    }

    @Override
    protected void newLine() {
      // Normalize every EOL to '\n' before hashing.
      sb.append(LINE_FEED);
      processBuffer();
      sb.setLength(0);
    }

    @Override
    protected void eof() {
      // Flush a trailing line with no EOL; processBuffer() is a no-op when empty.
      processBuffer();
    }

    private void processBuffer() {
      try {
        if (sb.length() > 0) {
          ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb));
          globalMd5Digest.update(encoded.array(), 0, encoded.limit());
        }
      } catch (CharacterCodingException e) {
        throw new IllegalStateException("Error encoding line hash in file: " + filePath, e);
      }
    }

    @CheckForNull
    public String getHash() {
      return Hex.encodeHexString(globalMd5Digest.digest());
    }
  }

  /**
   * Computes an MD5 hash per line, ignoring all whitespace characters, and
   * pushes each (line, hash) pair to a {@link LineHashConsumer}.
   */
  private static class LineHashComputer extends CharHandler {
    private final MessageDigest lineMd5Digest = DigestUtils.getMd5Digest();
    private final CharsetEncoder encoder;
    private final StringBuilder sb = new StringBuilder();
    private final LineHashConsumer consumer;
    private final File file;
    // 1-based index of the line currently being buffered.
    private int line = 1;

    public LineHashComputer(LineHashConsumer consumer, File f) {
      this.consumer = consumer;
      this.file = f;
      this.encoder = StandardCharsets.UTF_8.newEncoder()
        .onMalformedInput(CodingErrorAction.REPLACE)
        .onUnmappableCharacter(CodingErrorAction.REPLACE);
    }

    @Override
    protected void handleIgnoreEoL(char c) {
      // Blank characters are excluded from the per-line hash.
      if (!Character.isWhitespace(c)) {
        sb.append(c);
      }
    }

    @Override
    protected void newLine() {
      processBuffer();
      sb.setLength(0);
      line++;
    }

    @Override
    protected void eof() {
      // Flush the last line; processBuffer() is a no-op for an empty buffer.
      // (The previous guard `this.line > 0` was always true: `line` starts at 1.)
      processBuffer();
    }

    private void processBuffer() {
      try {
        if (sb.length() > 0) {
          ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb));
          lineMd5Digest.update(encoded.array(), 0, encoded.limit());
          consumer.consume(line, lineMd5Digest.digest());
        }
      } catch (CharacterCodingException e) {
        throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e);
      }
    }
  }

  /**
   * Records the character offset at which each line starts, plus the offset
   * just past the last character (the last valid offset).
   */
  private static class LineOffsetCounter extends CharHandler {
    // Tracked as long so that overflow past Integer.MAX_VALUE can be detected.
    private long currentOriginalOffset = 0;
    private IntArrayList originalLineOffsets = new IntArrayList();
    private long lastValidOffset = 0;

    public LineOffsetCounter() {
      // First line always starts at offset 0.
      originalLineOffsets.add(0);
    }

    @Override
    protected void handleAll(char c) {
      currentOriginalOffset++;
    }

    @Override
    protected void newLine() {
      if (currentOriginalOffset > Integer.MAX_VALUE) {
        throw new IllegalStateException("File is too big: " + currentOriginalOffset);
      }
      originalLineOffsets.add((int) currentOriginalOffset);
    }

    @Override
    protected void eof() {
      lastValidOffset = currentOriginalOffset;
    }

    public int[] getOriginalLineOffsets() {
      return originalLineOffsets.trimAndGet();
    }

    public int getLastValidOffset() {
      if (lastValidOffset > Integer.MAX_VALUE) {
        throw new IllegalStateException("File is too big: " + lastValidOffset);
      }
      return (int) lastValidOffset;
    }

  }

  /**
   * Compute hash of a file ignoring line ends differences.
   * Maximum performance is needed.
   *
   * @param stream       the file content; closed by this method
   * @param encoding     charset used to decode the stream
   * @param filePath     path used in log and error messages
   * @param otherHandler optional extra handler notified of the same events
   */
  public Metadata readMetadata(InputStream stream, Charset encoding, String filePath, @Nullable CharHandler otherHandler) {
    LineCounter lineCounter = new LineCounter(filePath, encoding);
    FileHashComputer fileHashComputer = new FileHashComputer(filePath);
    LineOffsetCounter lineOffsetCounter = new LineOffsetCounter();

    // Single read pass feeds every handler at once.
    CharHandler[] handlers = otherHandler != null
      ? new CharHandler[] {lineCounter, fileHashComputer, lineOffsetCounter, otherHandler}
      : new CharHandler[] {lineCounter, fileHashComputer, lineOffsetCounter};
    readFile(stream, encoding, filePath, handlers);
    return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
      lineOffsetCounter.getLastValidOffset());
  }

  public Metadata readMetadata(InputStream stream, Charset encoding, String filePath) {
    return readMetadata(stream, encoding, filePath, null);
  }

  /**
   * For testing purpose
   */
  public Metadata readMetadata(Reader reader) {
    LineCounter lineCounter = new LineCounter("fromString", StandardCharsets.UTF_16);
    FileHashComputer fileHashComputer = new FileHashComputer("fromString");
    LineOffsetCounter lineOffsetCounter = new LineOffsetCounter();
    CharHandler[] handlers = {lineCounter, fileHashComputer, lineOffsetCounter};

    try {
      read(reader, handlers);
    } catch (IOException e) {
      throw new IllegalStateException("Should never occur", e);
    }
    return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
      lineOffsetCounter.getLastValidOffset());
  }

  /**
   * Streams the file through the given handlers. The stream is closed when done.
   *
   * @throws IllegalStateException wrapping any {@link IOException}
   */
  public static void readFile(InputStream stream, Charset encoding, String filePath, CharHandler[] handlers) {
    try (Reader reader = new BufferedReader(new InputStreamReader(stream, encoding))) {
      read(reader, handlers);
    } catch (IOException e) {
      throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", filePath, encoding), e);
    }
  }

  /**
   * Core character loop. A '\r' is not reported as a line break until the next
   * character is seen, so that "\r\n" counts as a single EOL while a lone '\r'
   * still produces one.
   */
  private static void read(Reader reader, CharHandler[] handlers) throws IOException {
    char c;
    int i = reader.read();
    boolean afterCR = false;
    while (i != -1) {
      c = (char) i;
      if (afterCR) {
        // The pending '\r' ends a line; how depends on the current character.
        // (c is invariant in these loops, so the branch is hoisted out of them.)
        if (c == CARRIAGE_RETURN) {
          for (CharHandler handler : handlers) {
            handler.newLine();
            handler.handleAll(c);
          }
        } else if (c == LINE_FEED) {
          // "\r\n": the '\n' belongs to the same EOL, so newLine() fires after it.
          for (CharHandler handler : handlers) {
            handler.handleAll(c);
            handler.newLine();
          }
        } else {
          for (CharHandler handler : handlers) {
            handler.newLine();
            handler.handleIgnoreEoL(c);
            handler.handleAll(c);
          }
        }
        afterCR = c == CARRIAGE_RETURN;
      } else if (c == LINE_FEED) {
        for (CharHandler handler : handlers) {
          handler.handleAll(c);
          handler.newLine();
        }
      } else if (c == CARRIAGE_RETURN) {
        // Defer newLine() until we know whether a '\n' follows.
        afterCR = true;
        for (CharHandler handler : handlers) {
          handler.handleAll(c);
        }
      } else {
        for (CharHandler handler : handlers) {
          handler.handleIgnoreEoL(c);
          handler.handleAll(c);
        }
      }
      i = reader.read();
    }
    for (CharHandler handler : handlers) {
      if (afterCR) {
        // File ends with a lone '\r': flush the pending line break.
        handler.newLine();
      }
      handler.eof();
    }
  }

  @FunctionalInterface
  public interface LineHashConsumer {
    void consume(int lineIdx, @Nullable byte[] hash);
  }

  /**
   * Compute a MD5 hash of each line of the file after removing of all blank chars
   */
  public static void computeLineHashesForIssueTracking(InputFile f, LineHashConsumer consumer) {
    try {
      readFile(f.inputStream(), f.charset(), f.absolutePath(), new CharHandler[] {new LineHashComputer(consumer, f.file())});
    } catch (IOException e) {
      throw new IllegalStateException("Failed to compute line hashes for " + f.absolutePath(), e);
    }
  }
}