001/*
002 * SonarQube
003 * Copyright (C) 2009-2016 SonarSource SA
004 * mailto:contact AT sonarsource DOT com
005 *
006 * This program is free software; you can redistribute it and/or
007 * modify it under the terms of the GNU Lesser General Public
008 * License as published by the Free Software Foundation; either
009 * version 3 of the License, or (at your option) any later version.
010 *
011 * This program is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014 * Lesser General Public License for more details.
015 *
016 * You should have received a copy of the GNU Lesser General Public License
017 * along with this program; if not, write to the Free Software Foundation,
018 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
019 */
020package org.sonar.api.batch.fs.internal;
021
022import com.google.common.primitives.Ints;
023import java.io.BufferedReader;
024import java.io.File;
025import java.io.FileInputStream;
026import java.io.IOException;
027import java.io.InputStreamReader;
028import java.io.Reader;
029import java.nio.ByteBuffer;
030import java.nio.CharBuffer;
031import java.nio.charset.CharacterCodingException;
032import java.nio.charset.Charset;
033import java.nio.charset.CharsetEncoder;
034import java.nio.charset.CodingErrorAction;
035import java.nio.charset.StandardCharsets;
036import java.security.MessageDigest;
037import java.util.ArrayList;
038import java.util.List;
039import javax.annotation.CheckForNull;
040import javax.annotation.Nullable;
041import org.apache.commons.codec.binary.Hex;
042import org.apache.commons.codec.digest.DigestUtils;
043import org.apache.commons.io.ByteOrderMark;
044import org.apache.commons.io.input.BOMInputStream;
045import org.sonar.api.CoreProperties;
046import org.sonar.api.batch.BatchSide;
047import org.sonar.api.utils.log.Logger;
048import org.sonar.api.utils.log.Loggers;
049
/**
 * Computes metadata of files: number of lines, number of non-blank lines, line offsets and hash.
 * Ends of lines are normalized when hashing, so files with the same content but different
 * EOL encoding have the same hash.
 */
054@BatchSide
055public class FileMetadata {
056
057  private static final Logger LOG = Loggers.get(FileMetadata.class);
058
059  private static final char LINE_FEED = '\n';
060  private static final char CARRIAGE_RETURN = '\r';
061
062  public abstract static class CharHandler {
063
064    protected void handleAll(char c) {
065    }
066
067    protected void handleIgnoreEoL(char c) {
068    }
069
070    protected void newLine() {
071    }
072
073    protected void eof() {
074    }
075  }
076
077  private static class LineCounter extends CharHandler {
078    private int lines = 1;
079    private int nonBlankLines = 0;
080    private boolean blankLine = true;
081    boolean alreadyLoggedInvalidCharacter = false;
082    private final File file;
083    private final Charset encoding;
084
085    LineCounter(File file, Charset encoding) {
086      this.file = file;
087      this.encoding = encoding;
088    }
089
090    @Override
091    protected void handleAll(char c) {
092      if (!alreadyLoggedInvalidCharacter && c == '\ufffd') {
093        LOG.warn("Invalid character encountered in file {} at line {} for encoding {}. Please fix file content or configure the encoding to be used using property '{}'.", file,
094          lines, encoding, CoreProperties.ENCODING_PROPERTY);
095        alreadyLoggedInvalidCharacter = true;
096      }
097    }
098
099    @Override
100    protected void newLine() {
101      lines++;
102      if (!blankLine) {
103        nonBlankLines++;
104      }
105      blankLine = true;
106    }
107
108    @Override
109    protected void handleIgnoreEoL(char c) {
110      if (!Character.isWhitespace(c)) {
111        blankLine = false;
112      }
113    }
114
115    @Override
116    protected void eof() {
117      if (!blankLine) {
118        nonBlankLines++;
119      }
120    }
121
122    public int lines() {
123      return lines;
124    }
125
126    public int nonBlankLines() {
127      return nonBlankLines;
128    }
129
130  }
131
132  private static class FileHashComputer extends CharHandler {
133    private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest();
134    private StringBuilder sb = new StringBuilder();
135    private final CharsetEncoder encoder;
136    private final File file;
137
138    public FileHashComputer(File f) {
139      encoder = StandardCharsets.UTF_8.newEncoder()
140        .onMalformedInput(CodingErrorAction.REPLACE)
141        .onUnmappableCharacter(CodingErrorAction.REPLACE);
142      file = f;
143    }
144
145    @Override
146    protected void handleIgnoreEoL(char c) {
147      sb.append(c);
148    }
149
150    @Override
151    protected void newLine() {
152      sb.append(LINE_FEED);
153      processBuffer();
154      sb.setLength(0);
155    }
156
157    @Override
158    protected void eof() {
159      if (sb.length() > 0) {
160        processBuffer();
161      }
162    }
163
164    private void processBuffer() {
165      try {
166        if (sb.length() > 0) {
167          ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb));
168          globalMd5Digest.update(encoded.array(), 0, encoded.limit());
169        }
170      } catch (CharacterCodingException e) {
171        throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e);
172      }
173    }
174
175    @CheckForNull
176    public String getHash() {
177      return Hex.encodeHexString(globalMd5Digest.digest());
178    }
179  }
180
181  private static class LineHashComputer extends CharHandler {
182    private final MessageDigest lineMd5Digest = DigestUtils.getMd5Digest();
183    private final CharsetEncoder encoder;
184    private final StringBuilder sb = new StringBuilder();
185    private final LineHashConsumer consumer;
186    private final File file;
187    private int line = 1;
188
189    public LineHashComputer(LineHashConsumer consumer, File f) {
190      this.consumer = consumer;
191      this.file = f;
192      this.encoder = StandardCharsets.UTF_8.newEncoder()
193        .onMalformedInput(CodingErrorAction.REPLACE)
194        .onUnmappableCharacter(CodingErrorAction.REPLACE);
195    }
196
197    @Override
198    protected void handleIgnoreEoL(char c) {
199      if (!Character.isWhitespace(c)) {
200        sb.append(c);
201      }
202    }
203
204    @Override
205    protected void newLine() {
206      processBuffer();
207      sb.setLength(0);
208      line++;
209    }
210
211    @Override
212    protected void eof() {
213      if (this.line > 0) {
214        processBuffer();
215      }
216    }
217
218    private void processBuffer() {
219      try {
220        if (sb.length() > 0) {
221          ByteBuffer encoded = encoder.encode(CharBuffer.wrap(sb));
222          lineMd5Digest.update(encoded.array(), 0, encoded.limit());
223          consumer.consume(line, lineMd5Digest.digest());
224        }
225      } catch (CharacterCodingException e) {
226        throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e);
227      }
228    }
229  }
230
231  private static class LineOffsetCounter extends CharHandler {
232    private int currentOriginalOffset = 0;
233    private List<Integer> originalLineOffsets = new ArrayList<>();
234    private int lastValidOffset = 0;
235
236    public LineOffsetCounter() {
237      originalLineOffsets.add(0);
238    }
239
240    @Override
241    protected void handleAll(char c) {
242      currentOriginalOffset++;
243    }
244
245    @Override
246    protected void newLine() {
247      originalLineOffsets.add(currentOriginalOffset);
248    }
249
250    @Override
251    protected void eof() {
252      lastValidOffset = currentOriginalOffset;
253    }
254
255    public List<Integer> getOriginalLineOffsets() {
256      return originalLineOffsets;
257    }
258
259    public int getLastValidOffset() {
260      return lastValidOffset;
261    }
262
263  }
264
265  /**
266   * Compute hash of a file ignoring line ends differences.
267   * Maximum performance is needed.
268   */
269  public Metadata readMetadata(File file, Charset encoding) {
270    LineCounter lineCounter = new LineCounter(file, encoding);
271    FileHashComputer fileHashComputer = new FileHashComputer(file);
272    LineOffsetCounter lineOffsetCounter = new LineOffsetCounter();
273    readFile(file, encoding, lineCounter, fileHashComputer, lineOffsetCounter);
274    return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
275      lineOffsetCounter.getLastValidOffset());
276  }
277
278  /**
279   * For testing purpose
280   */
281  public Metadata readMetadata(Reader reader) {
282    LineCounter lineCounter = new LineCounter(new File("fromString"), StandardCharsets.UTF_16);
283    FileHashComputer fileHashComputer = new FileHashComputer(new File("fromString"));
284    LineOffsetCounter lineOffsetCounter = new LineOffsetCounter();
285    try {
286      read(reader, lineCounter, fileHashComputer, lineOffsetCounter);
287    } catch (IOException e) {
288      throw new IllegalStateException("Should never occurs", e);
289    }
290    return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
291      lineOffsetCounter.getLastValidOffset());
292  }
293
294  public static void readFile(File file, Charset encoding, CharHandler... handlers) {
295    try (BOMInputStream bomIn = new BOMInputStream(new FileInputStream(file),
296      ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
297      Reader reader = new BufferedReader(new InputStreamReader(bomIn, encoding))) {
298      read(reader, handlers);
299    } catch (IOException e) {
300      throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", file.getAbsolutePath(), encoding), e);
301    }
302  }
303
304  private static void read(Reader reader, CharHandler... handlers) throws IOException {
305    char c;
306    int i = reader.read();
307    boolean afterCR = false;
308    while (i != -1) {
309      c = (char) i;
310      if (afterCR) {
311        for (CharHandler handler : handlers) {
312          if (c == CARRIAGE_RETURN) {
313            handler.newLine();
314            handler.handleAll(c);
315          } else if (c == LINE_FEED) {
316            handler.handleAll(c);
317            handler.newLine();
318          } else {
319            handler.newLine();
320            handler.handleIgnoreEoL(c);
321            handler.handleAll(c);
322          }
323        }
324        afterCR = c == CARRIAGE_RETURN;
325      } else if (c == LINE_FEED) {
326        for (CharHandler handler : handlers) {
327          handler.handleAll(c);
328          handler.newLine();
329        }
330      } else if (c == CARRIAGE_RETURN) {
331        afterCR = true;
332        for (CharHandler handler : handlers) {
333          handler.handleAll(c);
334        }
335      } else {
336        for (CharHandler handler : handlers) {
337          handler.handleIgnoreEoL(c);
338          handler.handleAll(c);
339        }
340      }
341      i = reader.read();
342    }
343    for (CharHandler handler : handlers) {
344      if (afterCR) {
345        handler.newLine();
346      }
347      handler.eof();
348    }
349  }
350
351  public static class Metadata {
352    final int lines;
353    final int nonBlankLines;
354    final String hash;
355    final int[] originalLineOffsets;
356    final int lastValidOffset;
357
358    private Metadata(int lines, int nonBlankLines, String hash, List<Integer> originalLineOffsets, int lastValidOffset) {
359      this.lines = lines;
360      this.nonBlankLines = nonBlankLines;
361      this.hash = hash;
362      this.originalLineOffsets = Ints.toArray(originalLineOffsets);
363      this.lastValidOffset = lastValidOffset;
364    }
365  }
366
367  public interface LineHashConsumer {
368
369    void consume(int lineIdx, @Nullable byte[] hash);
370
371  }
372
373  /**
374   * Compute a MD5 hash of each line of the file after removing of all blank chars
375   */
376  public static void computeLineHashesForIssueTracking(DefaultInputFile f, LineHashConsumer consumer) {
377    readFile(f.file(), f.charset(), new LineHashComputer(consumer, f.file()));
378  }
379}