001/*
002 * SonarQube
003 * Copyright (C) 2009-2016 SonarSource SA
004 * mailto:contact AT sonarsource DOT com
005 *
006 * This program is free software; you can redistribute it and/or
007 * modify it under the terms of the GNU Lesser General Public
008 * License as published by the Free Software Foundation; either
009 * version 3 of the License, or (at your option) any later version.
010 *
011 * This program is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014 * Lesser General Public License for more details.
015 *
016 * You should have received a copy of the GNU Lesser General Public License
017 * along with this program; if not, write to the Free Software Foundation,
018 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
019 */
020package org.sonar.api.batch.fs.internal;
021
022import java.io.BufferedReader;
023import java.io.File;
024import java.io.FileInputStream;
025import java.io.IOException;
026import java.io.InputStreamReader;
027import java.io.Reader;
028import java.nio.ByteBuffer;
029import java.nio.CharBuffer;
030import java.nio.charset.CharacterCodingException;
031import java.nio.charset.Charset;
032import java.nio.charset.CharsetEncoder;
033import java.nio.charset.CodingErrorAction;
034import java.nio.charset.StandardCharsets;
035import java.security.MessageDigest;
036import javax.annotation.CheckForNull;
037import javax.annotation.Nullable;
038import org.apache.commons.codec.binary.Hex;
039import org.apache.commons.codec.digest.DigestUtils;
040import org.apache.commons.io.ByteOrderMark;
041import org.apache.commons.io.input.BOMInputStream;
042import org.sonar.api.CoreProperties;
043import org.sonar.api.batch.ScannerSide;
044import org.sonar.api.utils.log.Logger;
045import org.sonar.api.utils.log.Loggers;
046
047/**
048 * Computes hash of files. Ends of Lines are ignored, so files with
049 * same content but different EOL encoding have the same hash.
050 */
051@ScannerSide
052public class FileMetadata {
053
054  private static final Logger LOG = Loggers.get(FileMetadata.class);
055
056  private static final char LINE_FEED = '\n';
057  private static final char CARRIAGE_RETURN = '\r';
058
059  public abstract static class CharHandler {
060
061    protected void handleAll(char c) {
062    }
063
064    protected void handleIgnoreEoL(char c) {
065    }
066
067    protected void newLine() {
068    }
069
070    protected void eof() {
071    }
072  }
073
074  private static class LineCounter extends CharHandler {
075    private int lines = 1;
076    private int nonBlankLines = 0;
077    private boolean blankLine = true;
078    boolean alreadyLoggedInvalidCharacter = false;
079    private final File file;
080    private final Charset encoding;
081
082    LineCounter(File file, Charset encoding) {
083      this.file = file;
084      this.encoding = encoding;
085    }
086
087    @Override
088    protected void handleAll(char c) {
089      if (!alreadyLoggedInvalidCharacter && c == '\ufffd') {
090        LOG.warn("Invalid character encountered in file {} at line {} for encoding {}. Please fix file content or configure the encoding to be used using property '{}'.", file,
091          lines, encoding, CoreProperties.ENCODING_PROPERTY);
092        alreadyLoggedInvalidCharacter = true;
093      }
094    }
095
096    @Override
097    protected void newLine() {
098      lines++;
099      if (!blankLine) {
100        nonBlankLines++;
101      }
102      blankLine = true;
103    }
104
105    @Override
106    protected void handleIgnoreEoL(char c) {
107      if (!Character.isWhitespace(c)) {
108        blankLine = false;
109      }
110    }
111
112    @Override
113    protected void eof() {
114      if (!blankLine) {
115        nonBlankLines++;
116      }
117    }
118
119    public int lines() {
120      return lines;
121    }
122
123    public int nonBlankLines() {
124      return nonBlankLines;
125    }
126
127  }
128
129  private static class FileHashComputer extends CharHandler {
130    private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest();
131    private StringBuilder sb = new StringBuilder();
132    private final CharsetEncoder encoder;
133    private final File file;
134
135    public FileHashComputer(File f) {
136      encoder = StandardCharsets.UTF_8.newEncoder()
137        .onMalformedInput(CodingErrorAction.REPLACE)
138        .onUnmappableCharacter(CodingErrorAction.REPLACE);
139      file = f;
140    }
141
142    @Override
143    protected void handleIgnoreEoL(char c) {
144      sb.append(c);
145    }
146
147    @Override
148    protected void newLine() {
149      sb.append(LINE_FEED);
150      processBuffer();
151      sb.setLength(0);
152    }
153
154    @Override
155    protected void eof() {
156      if (sb.length() > 0) {
157        processBuffer();
158      }
159    }
160
161    private void processBuffer() {
162      try {
163        if (sb.length() > 0) {
164          ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb));
165          globalMd5Digest.update(encoded.array(), 0, encoded.limit());
166        }
167      } catch (CharacterCodingException e) {
168        throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e);
169      }
170    }
171
172    @CheckForNull
173    public String getHash() {
174      return Hex.encodeHexString(globalMd5Digest.digest());
175    }
176  }
177
178  private static class LineHashComputer extends CharHandler {
179    private final MessageDigest lineMd5Digest = DigestUtils.getMd5Digest();
180    private final CharsetEncoder encoder;
181    private final StringBuilder sb = new StringBuilder();
182    private final LineHashConsumer consumer;
183    private final File file;
184    private int line = 1;
185
186    public LineHashComputer(LineHashConsumer consumer, File f) {
187      this.consumer = consumer;
188      this.file = f;
189      this.encoder = StandardCharsets.UTF_8.newEncoder()
190        .onMalformedInput(CodingErrorAction.REPLACE)
191        .onUnmappableCharacter(CodingErrorAction.REPLACE);
192    }
193
194    @Override
195    protected void handleIgnoreEoL(char c) {
196      if (!Character.isWhitespace(c)) {
197        sb.append(c);
198      }
199    }
200
201    @Override
202    protected void newLine() {
203      processBuffer();
204      sb.setLength(0);
205      line++;
206    }
207
208    @Override
209    protected void eof() {
210      if (this.line > 0) {
211        processBuffer();
212      }
213    }
214
215    private void processBuffer() {
216      try {
217        if (sb.length() > 0) {
218          ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb));
219          lineMd5Digest.update(encoded.array(), 0, encoded.limit());
220          consumer.consume(line, lineMd5Digest.digest());
221        }
222      } catch (CharacterCodingException e) {
223        throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e);
224      }
225    }
226  }
227
228  private static class LineOffsetCounter extends CharHandler {
229    private long currentOriginalOffset = 0;
230    private IntArrayList originalLineOffsets = new IntArrayList();
231    private long lastValidOffset = 0;
232
233    public LineOffsetCounter() {
234      originalLineOffsets.add(0);
235    }
236
237    @Override
238    protected void handleAll(char c) {
239      currentOriginalOffset++;
240    }
241
242    @Override
243    protected void newLine() {
244      if (currentOriginalOffset > Integer.MAX_VALUE) {
245        throw new IllegalStateException("File is too big: " + currentOriginalOffset);
246      }
247      originalLineOffsets.add((int) currentOriginalOffset);
248    }
249
250    @Override
251    protected void eof() {
252      lastValidOffset = currentOriginalOffset;
253    }
254
255    public int[] getOriginalLineOffsets() {
256      return originalLineOffsets.trimAndGet();
257    }
258
259    public int getLastValidOffset() {
260      if (lastValidOffset > Integer.MAX_VALUE) {
261        throw new IllegalStateException("File is too big: " + lastValidOffset);
262      }
263      return (int) lastValidOffset;
264    }
265
266  }
267
268  /**
269   * Compute hash of a file ignoring line ends differences.
270   * Maximum performance is needed.
271   */
272  public Metadata readMetadata(File file, Charset encoding) {
273    LineCounter lineCounter = new LineCounter(file, encoding);
274    FileHashComputer fileHashComputer = new FileHashComputer(file);
275    LineOffsetCounter lineOffsetCounter = new LineOffsetCounter();
276    readFile(file, encoding, lineCounter, fileHashComputer, lineOffsetCounter);
277    return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
278      lineOffsetCounter.getLastValidOffset());
279  }
280
281  /**
282   * For testing purpose
283   */
284  public Metadata readMetadata(Reader reader) {
285    LineCounter lineCounter = new LineCounter(new File("fromString"), StandardCharsets.UTF_16);
286    FileHashComputer fileHashComputer = new FileHashComputer(new File("fromString"));
287    LineOffsetCounter lineOffsetCounter = new LineOffsetCounter();
288    try {
289      read(reader, lineCounter, fileHashComputer, lineOffsetCounter);
290    } catch (IOException e) {
291      throw new IllegalStateException("Should never occurs", e);
292    }
293    return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
294      lineOffsetCounter.getLastValidOffset());
295  }
296
297  public static void readFile(File file, Charset encoding, CharHandler... handlers) {
298    try (BOMInputStream bomIn = new BOMInputStream(new FileInputStream(file),
299      ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
300      Reader reader = new BufferedReader(new InputStreamReader(bomIn, encoding))) {
301      read(reader, handlers);
302    } catch (IOException e) {
303      throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", file.getAbsolutePath(), encoding), e);
304    }
305  }
306
307  private static void read(Reader reader, CharHandler... handlers) throws IOException {
308    char c;
309    int i = reader.read();
310    boolean afterCR = false;
311    while (i != -1) {
312      c = (char) i;
313      if (afterCR) {
314        for (CharHandler handler : handlers) {
315          if (c == CARRIAGE_RETURN) {
316            handler.newLine();
317            handler.handleAll(c);
318          } else if (c == LINE_FEED) {
319            handler.handleAll(c);
320            handler.newLine();
321          } else {
322            handler.newLine();
323            handler.handleIgnoreEoL(c);
324            handler.handleAll(c);
325          }
326        }
327        afterCR = c == CARRIAGE_RETURN;
328      } else if (c == LINE_FEED) {
329        for (CharHandler handler : handlers) {
330          handler.handleAll(c);
331          handler.newLine();
332        }
333      } else if (c == CARRIAGE_RETURN) {
334        afterCR = true;
335        for (CharHandler handler : handlers) {
336          handler.handleAll(c);
337        }
338      } else {
339        for (CharHandler handler : handlers) {
340          handler.handleIgnoreEoL(c);
341          handler.handleAll(c);
342        }
343      }
344      i = reader.read();
345    }
346    for (CharHandler handler : handlers) {
347      if (afterCR) {
348        handler.newLine();
349      }
350      handler.eof();
351    }
352  }
353
354  public static class Metadata {
355    final int lines;
356    final int nonBlankLines;
357    final String hash;
358    final int[] originalLineOffsets;
359    final int lastValidOffset;
360
361    private Metadata(int lines, int nonBlankLines, String hash, int[] originalLineOffsets, int lastValidOffset) {
362      this.lines = lines;
363      this.nonBlankLines = nonBlankLines;
364      this.hash = hash;
365      this.originalLineOffsets = originalLineOffsets;
366      this.lastValidOffset = lastValidOffset;
367    }
368  }
369
370  public interface LineHashConsumer {
371
372    void consume(int lineIdx, @Nullable byte[] hash);
373
374  }
375
376  /**
377   * Compute a MD5 hash of each line of the file after removing of all blank chars
378   */
379  public static void computeLineHashesForIssueTracking(DefaultInputFile f, LineHashConsumer consumer) {
380    readFile(f.file(), f.charset(), new LineHashComputer(consumer, f.file()));
381  }
382}