001    /*
002     * Sonar, open source software quality management tool.
003     * Copyright (C) 2008-2012 SonarSource
004     * mailto:contact AT sonarsource DOT com
005     *
006     * Sonar is free software; you can redistribute it and/or
007     * modify it under the terms of the GNU Lesser General Public
008     * License as published by the Free Software Foundation; either
009     * version 3 of the License, or (at your option) any later version.
010     *
011     * Sonar is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014     * Lesser General Public License for more details.
015     *
016     * You should have received a copy of the GNU Lesser General Public
017     * License along with Sonar; if not, write to the Free Software
018     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
019     */
020    package org.sonar.duplications.java;
021    
022    import org.sonar.duplications.token.TokenChunker;
023    
024    /**
025     * See <a href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html">The Java Language Specification, Third Edition: Lexical Structure</a>
026     * and <a href="http://www.jcp.org/en/jsr/detail?id=334">JSR334 (Java 7 - binary integral literals and underscores in numeric literals)</a>.
027     * 
028     * <p>
029     * We decided to use dollar sign as a prefix for normalization, even if it can be a part of an identifier,
030     * because according to Java Language Specification it supposed to be used only in mechanically generated source code.
031     * Thus probability to find it within a normal code should be low.
032     * </p>
033     */
034    public final class JavaTokenProducer {
035    
036      private JavaTokenProducer() {
037      }
038    
039      private static final String NORMALIZED_CHARACTER_LITERAL = "$CHARS";
040      private static final String NORMALIZED_NUMERIC_LITERAL = "$NUMBER";
041    
042      private static final String EXP = "([Ee][+-]?+[0-9_]++)";
043      private static final String BINARY_EXP = "([Pp][+-]?+[0-9_]++)";
044    
045      private static final String FLOAT_SUFFIX = "[fFdD]";
046      private static final String INT_SUFFIX = "[lL]";
047    
048      public static TokenChunker build() {
049        return TokenChunker.builder()
050            // White Space
051            .ignore("\\s")
052            // Comments
053            .ignore("//[^\\n\\r]*+")
054            .ignore("/\\*[\\s\\S]*?\\*/")
055            // String Literals
056            .token("\"([^\"\\\\]*+(\\\\[\\s\\S])?+)*+\"", NORMALIZED_CHARACTER_LITERAL)
057            // Character Literals
058            .token("'([^'\\n\\\\]*+(\\\\.)?+)*+'", NORMALIZED_CHARACTER_LITERAL)
059            // Identifiers, Keywords, Boolean Literals, The Null Literal
060            .token("\\p{javaJavaIdentifierStart}++\\p{javaJavaIdentifierPart}*+")
061            // Floating-Point Literals
062            .token("[0-9_]++\\.([0-9_]++)?+" + EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal
063            .token("\\.[0-9_]++" + EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal
064            .token("[0-9_]++" + EXP + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal
065            .token("0[xX][0-9a-fA-F_]++\\.[0-9a-fA-F_]*+" + BINARY_EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal
066            .token("0[xX][0-9a-fA-F_]++" + BINARY_EXP + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal
067            // Integer Literals
068            .token("0[xX][0-9a-fA-F_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal
069            .token("0[bB][01_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Binary (Java 7)
070            .token("[0-9_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal and Octal
071            // Any other character
072            .token(".")
073            .build();
074      }
075    
076    }