001 /*
002 * Sonar, open source software quality management tool.
003 * Copyright (C) 2008-2012 SonarSource
004 * mailto:contact AT sonarsource DOT com
005 *
006 * Sonar is free software; you can redistribute it and/or
007 * modify it under the terms of the GNU Lesser General Public
008 * License as published by the Free Software Foundation; either
009 * version 3 of the License, or (at your option) any later version.
010 *
011 * Sonar is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014 * Lesser General Public License for more details.
015 *
016 * You should have received a copy of the GNU Lesser General Public
017 * License along with Sonar; if not, write to the Free Software
018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
019 */
020 package org.sonar.duplications.java;
021
022 import org.sonar.duplications.token.TokenChunker;
023
024 /**
025 * See <a href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html">The Java Language Specification, Third Edition: Lexical Structure</a>
026 * and <a href="http://www.jcp.org/en/jsr/detail?id=334">JSR334 (Java 7 - binary integral literals and underscores in numeric literals)</a>.
027 *
028 * <p>
029 * We decided to use dollar sign as a prefix for normalization, even if it can be a part of an identifier,
030 * because according to Java Language Specification it supposed to be used only in mechanically generated source code.
031 * Thus probability to find it within a normal code should be low.
032 * </p>
033 */
034 public final class JavaTokenProducer {
035
036 private JavaTokenProducer() {
037 }
038
039 private static final String NORMALIZED_CHARACTER_LITERAL = "$CHARS";
040 private static final String NORMALIZED_NUMERIC_LITERAL = "$NUMBER";
041
042 private static final String EXP = "([Ee][+-]?+[0-9_]++)";
043 private static final String BINARY_EXP = "([Pp][+-]?+[0-9_]++)";
044
045 private static final String FLOAT_SUFFIX = "[fFdD]";
046 private static final String INT_SUFFIX = "[lL]";
047
048 public static TokenChunker build() {
049 return TokenChunker.builder()
050 // White Space
051 .ignore("\\s")
052 // Comments
053 .ignore("//[^\\n\\r]*+")
054 .ignore("/\\*[\\s\\S]*?\\*/")
055 // String Literals
056 .token("\"([^\"\\\\]*+(\\\\[\\s\\S])?+)*+\"", NORMALIZED_CHARACTER_LITERAL)
057 // Character Literals
058 .token("'([^'\\n\\\\]*+(\\\\.)?+)*+'", NORMALIZED_CHARACTER_LITERAL)
059 // Identifiers, Keywords, Boolean Literals, The Null Literal
060 .token("\\p{javaJavaIdentifierStart}++\\p{javaJavaIdentifierPart}*+")
061 // Floating-Point Literals
062 .token("[0-9_]++\\.([0-9_]++)?+" + EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal
063 .token("\\.[0-9_]++" + EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal
064 .token("[0-9_]++" + EXP + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal
065 .token("0[xX][0-9a-fA-F_]++\\.[0-9a-fA-F_]*+" + BINARY_EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal
066 .token("0[xX][0-9a-fA-F_]++" + BINARY_EXP + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal
067 // Integer Literals
068 .token("0[xX][0-9a-fA-F_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal
069 .token("0[bB][01_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Binary (Java 7)
070 .token("[0-9_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal and Octal
071 // Any other character
072 .token(".")
073 .build();
074 }
075
076 }