001/*
002 * SonarQube
003 * Copyright (C) 2009-2017 SonarSource SA
004 * mailto:info AT sonarsource DOT com
005 *
006 * This program is free software; you can redistribute it and/or
007 * modify it under the terms of the GNU Lesser General Public
008 * License as published by the Free Software Foundation; either
009 * version 3 of the License, or (at your option) any later version.
010 *
011 * This program is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014 * Lesser General Public License for more details.
015 *
016 * You should have received a copy of the GNU Lesser General Public License
017 * along with this program; if not, write to the Free Software Foundation,
018 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
019 */
020package org.sonar.api.batch.fs.internal;
021
022import java.io.BufferedReader;
023import java.io.File;
024import java.io.IOException;
025import java.io.InputStream;
026import java.io.InputStreamReader;
027import java.io.Reader;
028import java.nio.ByteBuffer;
029import java.nio.CharBuffer;
030import java.nio.charset.CharacterCodingException;
031import java.nio.charset.Charset;
032import java.nio.charset.CharsetEncoder;
033import java.nio.charset.CodingErrorAction;
034import java.nio.charset.StandardCharsets;
035import java.security.MessageDigest;
036
037import javax.annotation.CheckForNull;
038import javax.annotation.Nullable;
039
040import org.apache.commons.codec.binary.Hex;
041import org.apache.commons.codec.digest.DigestUtils;
042import org.sonar.api.CoreProperties;
043import org.sonar.api.batch.ScannerSide;
044import org.sonar.api.batch.fs.InputFile;
045import org.sonar.api.utils.log.Logger;
046import org.sonar.api.utils.log.Loggers;
047
/**
 * Computes metadata of files: line counts, line offsets and a hash of the content.
 * End-of-line sequences are normalized before hashing, so files with the same
 * content but different end-of-line styles (LF, CR or CRLF) have the same hash.
 */
052@ScannerSide
053public class FileMetadata {
054
055  private static final Logger LOG = Loggers.get(FileMetadata.class);
056
057  private static final char LINE_FEED = '\n';
058  private static final char CARRIAGE_RETURN = '\r';
059
060  public abstract static class CharHandler {
061
062    protected void handleAll(char c) {
063    }
064
065    protected void handleIgnoreEoL(char c) {
066    }
067
068    protected void newLine() {
069    }
070
071    protected void eof() {
072    }
073  }
074
075  private static class LineCounter extends CharHandler {
076    private int lines = 1;
077    private int nonBlankLines = 0;
078    private boolean blankLine = true;
079    boolean alreadyLoggedInvalidCharacter = false;
080    private final String filePath;
081    private final Charset encoding;
082
083    LineCounter(String filePath, Charset encoding) {
084      this.filePath = filePath;
085      this.encoding = encoding;
086    }
087
088    @Override
089    protected void handleAll(char c) {
090      if (!alreadyLoggedInvalidCharacter && c == '\ufffd') {
091        LOG.warn("Invalid character encountered in file {} at line {} for encoding {}. Please fix file content or configure the encoding to be used using property '{}'.", filePath,
092          lines, encoding, CoreProperties.ENCODING_PROPERTY);
093        alreadyLoggedInvalidCharacter = true;
094      }
095    }
096
097    @Override
098    protected void newLine() {
099      lines++;
100      if (!blankLine) {
101        nonBlankLines++;
102      }
103      blankLine = true;
104    }
105
106    @Override
107    protected void handleIgnoreEoL(char c) {
108      if (!Character.isWhitespace(c)) {
109        blankLine = false;
110      }
111    }
112
113    @Override
114    protected void eof() {
115      if (!blankLine) {
116        nonBlankLines++;
117      }
118    }
119
120    public int lines() {
121      return lines;
122    }
123
124    public int nonBlankLines() {
125      return nonBlankLines;
126    }
127
128  }
129
130  private static class FileHashComputer extends CharHandler {
131    private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest();
132    private StringBuilder sb = new StringBuilder();
133    private final CharsetEncoder encoder;
134    private final String filePath;
135
136    public FileHashComputer(String filePath) {
137      encoder = StandardCharsets.UTF_8.newEncoder()
138        .onMalformedInput(CodingErrorAction.REPLACE)
139        .onUnmappableCharacter(CodingErrorAction.REPLACE);
140      this.filePath = filePath;
141    }
142
143    @Override
144    protected void handleIgnoreEoL(char c) {
145      sb.append(c);
146    }
147
148    @Override
149    protected void newLine() {
150      sb.append(LINE_FEED);
151      processBuffer();
152      sb.setLength(0);
153    }
154
155    @Override
156    protected void eof() {
157      if (sb.length() > 0) {
158        processBuffer();
159      }
160    }
161
162    private void processBuffer() {
163      try {
164        if (sb.length() > 0) {
165          ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb));
166          globalMd5Digest.update(encoded.array(), 0, encoded.limit());
167        }
168      } catch (CharacterCodingException e) {
169        throw new IllegalStateException("Error encoding line hash in file: " + filePath, e);
170      }
171    }
172
173    @CheckForNull
174    public String getHash() {
175      return Hex.encodeHexString(globalMd5Digest.digest());
176    }
177  }
178
179  private static class LineHashComputer extends CharHandler {
180    private final MessageDigest lineMd5Digest = DigestUtils.getMd5Digest();
181    private final CharsetEncoder encoder;
182    private final StringBuilder sb = new StringBuilder();
183    private final LineHashConsumer consumer;
184    private final File file;
185    private int line = 1;
186
187    public LineHashComputer(LineHashConsumer consumer, File f) {
188      this.consumer = consumer;
189      this.file = f;
190      this.encoder = StandardCharsets.UTF_8.newEncoder()
191        .onMalformedInput(CodingErrorAction.REPLACE)
192        .onUnmappableCharacter(CodingErrorAction.REPLACE);
193    }
194
195    @Override
196    protected void handleIgnoreEoL(char c) {
197      if (!Character.isWhitespace(c)) {
198        sb.append(c);
199      }
200    }
201
202    @Override
203    protected void newLine() {
204      processBuffer();
205      sb.setLength(0);
206      line++;
207    }
208
209    @Override
210    protected void eof() {
211      if (this.line > 0) {
212        processBuffer();
213      }
214    }
215
216    private void processBuffer() {
217      try {
218        if (sb.length() > 0) {
219          ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb));
220          lineMd5Digest.update(encoded.array(), 0, encoded.limit());
221          consumer.consume(line, lineMd5Digest.digest());
222        }
223      } catch (CharacterCodingException e) {
224        throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e);
225      }
226    }
227  }
228
229  private static class LineOffsetCounter extends CharHandler {
230    private long currentOriginalOffset = 0;
231    private IntArrayList originalLineOffsets = new IntArrayList();
232    private long lastValidOffset = 0;
233
234    public LineOffsetCounter() {
235      originalLineOffsets.add(0);
236    }
237
238    @Override
239    protected void handleAll(char c) {
240      currentOriginalOffset++;
241    }
242
243    @Override
244    protected void newLine() {
245      if (currentOriginalOffset > Integer.MAX_VALUE) {
246        throw new IllegalStateException("File is too big: " + currentOriginalOffset);
247      }
248      originalLineOffsets.add((int) currentOriginalOffset);
249    }
250
251    @Override
252    protected void eof() {
253      lastValidOffset = currentOriginalOffset;
254    }
255
256    public int[] getOriginalLineOffsets() {
257      return originalLineOffsets.trimAndGet();
258    }
259
260    public int getLastValidOffset() {
261      if (lastValidOffset > Integer.MAX_VALUE) {
262        throw new IllegalStateException("File is too big: " + lastValidOffset);
263      }
264      return (int) lastValidOffset;
265    }
266
267  }
268
269  /**
270   * Compute hash of a file ignoring line ends differences.
271   * Maximum performance is needed.
272   */
273  public Metadata readMetadata(InputStream stream, Charset encoding, String filePath, @Nullable CharHandler otherHandler) {
274    LineCounter lineCounter = new LineCounter(filePath, encoding);
275    FileHashComputer fileHashComputer = new FileHashComputer(filePath);
276    LineOffsetCounter lineOffsetCounter = new LineOffsetCounter();
277
278    if (otherHandler != null) {
279      CharHandler[] handlers = {lineCounter, fileHashComputer, lineOffsetCounter, otherHandler};
280      readFile(stream, encoding, filePath, handlers);
281    } else {
282      CharHandler[] handlers = {lineCounter, fileHashComputer, lineOffsetCounter};
283      readFile(stream, encoding, filePath, handlers);
284    }
285    return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
286      lineOffsetCounter.getLastValidOffset());
287  }
288
289  public Metadata readMetadata(InputStream stream, Charset encoding, String filePath) {
290    return readMetadata(stream, encoding, filePath, null);
291  }
292
293  /**
294   * For testing purpose
295   */
296  public Metadata readMetadata(Reader reader) {
297    LineCounter lineCounter = new LineCounter("fromString", StandardCharsets.UTF_16);
298    FileHashComputer fileHashComputer = new FileHashComputer("fromString");
299    LineOffsetCounter lineOffsetCounter = new LineOffsetCounter();
300    CharHandler[] handlers = {lineCounter, fileHashComputer, lineOffsetCounter};
301
302    try {
303      read(reader, handlers);
304    } catch (IOException e) {
305      throw new IllegalStateException("Should never occur", e);
306    }
307    return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
308      lineOffsetCounter.getLastValidOffset());
309  }
310
311  public static void readFile(InputStream stream, Charset encoding, String filePath, CharHandler[] handlers) {
312    try (Reader reader = new BufferedReader(new InputStreamReader(stream, encoding))) {
313      read(reader, handlers);
314    } catch (IOException e) {
315      throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", filePath, encoding), e);
316    }
317  }
318
319  private static void read(Reader reader, CharHandler[] handlers) throws IOException {
320    char c;
321    int i = reader.read();
322    boolean afterCR = false;
323    while (i != -1) {
324      c = (char) i;
325      if (afterCR) {
326        for (CharHandler handler : handlers) {
327          if (c == CARRIAGE_RETURN) {
328            handler.newLine();
329            handler.handleAll(c);
330          } else if (c == LINE_FEED) {
331            handler.handleAll(c);
332            handler.newLine();
333          } else {
334            handler.newLine();
335            handler.handleIgnoreEoL(c);
336            handler.handleAll(c);
337          }
338        }
339        afterCR = c == CARRIAGE_RETURN;
340      } else if (c == LINE_FEED) {
341        for (CharHandler handler : handlers) {
342          handler.handleAll(c);
343          handler.newLine();
344        }
345      } else if (c == CARRIAGE_RETURN) {
346        afterCR = true;
347        for (CharHandler handler : handlers) {
348          handler.handleAll(c);
349        }
350      } else {
351        for (CharHandler handler : handlers) {
352          handler.handleIgnoreEoL(c);
353          handler.handleAll(c);
354        }
355      }
356      i = reader.read();
357    }
358    for (CharHandler handler : handlers) {
359      if (afterCR) {
360        handler.newLine();
361      }
362      handler.eof();
363    }
364  }
365
366  @FunctionalInterface
367  public interface LineHashConsumer {
368    void consume(int lineIdx, @Nullable byte[] hash);
369  }
370
371  /**
372   * Compute a MD5 hash of each line of the file after removing of all blank chars
373   */
374  public static void computeLineHashesForIssueTracking(InputFile f, LineHashConsumer consumer) {
375    try {
376      readFile(f.inputStream(), f.charset(), f.absolutePath(), new CharHandler[] {new LineHashComputer(consumer, f.file())});
377    } catch (IOException e) {
378      throw new IllegalStateException("Failed to compute line hashes for " + f.absolutePath(), e);
379    }
380  }
381}