001/*
002 * Sonar, open source software quality management tool.
003 * Copyright (C) 2008-2012 SonarSource
004 * mailto:contact AT sonarsource DOT com
005 *
006 * Sonar is free software; you can redistribute it and/or
007 * modify it under the terms of the GNU Lesser General Public
008 * License as published by the Free Software Foundation; either
009 * version 3 of the License, or (at your option) any later version.
010 *
011 * Sonar is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014 * Lesser General Public License for more details.
015 *
016 * You should have received a copy of the GNU Lesser General Public
017 * License along with Sonar; if not, write to the Free Software
018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
019 */
020package org.sonar.duplications.java;
021
022import org.sonar.duplications.token.TokenChunker;
023
024/**
025 * See <a href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html">The Java Language Specification, Third Edition: Lexical Structure</a>
026 * and <a href="http://www.jcp.org/en/jsr/detail?id=334">JSR334 (Java 7 - binary integral literals and underscores in numeric literals)</a>.
027 * 
028 * <p>
029 * We decided to use dollar sign as a prefix for normalization, even if it can be a part of an identifier,
030 * because according to Java Language Specification it supposed to be used only in mechanically generated source code.
031 * Thus probability to find it within a normal code should be low.
032 * </p>
033 */
034public final class JavaTokenProducer {
035
036  private JavaTokenProducer() {
037  }
038
039  private static final String NORMALIZED_CHARACTER_LITERAL = "$CHARS";
040  private static final String NORMALIZED_NUMERIC_LITERAL = "$NUMBER";
041
042  private static final String EXP = "([Ee][+-]?+[0-9_]++)";
043  private static final String BINARY_EXP = "([Pp][+-]?+[0-9_]++)";
044
045  private static final String FLOAT_SUFFIX = "[fFdD]";
046  private static final String INT_SUFFIX = "[lL]";
047
048  public static TokenChunker build() {
049    return TokenChunker.builder()
050        // White Space
051        .ignore("\\s")
052        // Comments
053        .ignore("//[^\\n\\r]*+")
054        .ignore("/\\*[\\s\\S]*?\\*/")
055        // String Literals
056        .token("\"([^\"\\\\]*+(\\\\[\\s\\S])?+)*+\"", NORMALIZED_CHARACTER_LITERAL)
057        // Character Literals
058        .token("'([^'\\n\\\\]*+(\\\\.)?+)*+'", NORMALIZED_CHARACTER_LITERAL)
059        // Identifiers, Keywords, Boolean Literals, The Null Literal
060        .token("\\p{javaJavaIdentifierStart}++\\p{javaJavaIdentifierPart}*+")
061        // Floating-Point Literals
062        .token("[0-9_]++\\.([0-9_]++)?+" + EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal
063        .token("\\.[0-9_]++" + EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal
064        .token("[0-9_]++" + EXP + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal
065        .token("0[xX][0-9a-fA-F_]++\\.[0-9a-fA-F_]*+" + BINARY_EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal
066        .token("0[xX][0-9a-fA-F_]++" + BINARY_EXP + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal
067        // Integer Literals
068        .token("0[xX][0-9a-fA-F_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal
069        .token("0[bB][01_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Binary (Java 7)
070        .token("[0-9_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal and Octal
071        // Any other character
072        .token(".")
073        .build();
074  }
075
076}