001 /* 002 * Sonar, open source software quality management tool. 003 * Copyright (C) 2008-2012 SonarSource 004 * mailto:contact AT sonarsource DOT com 005 * 006 * Sonar is free software; you can redistribute it and/or 007 * modify it under the terms of the GNU Lesser General Public 008 * License as published by the Free Software Foundation; either 009 * version 3 of the License, or (at your option) any later version. 010 * 011 * Sonar is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014 * Lesser General Public License for more details. 015 * 016 * You should have received a copy of the GNU Lesser General Public 017 * License along with Sonar; if not, write to the Free Software 018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 019 */ 020 package org.sonar.duplications.java; 021 022 import org.sonar.duplications.token.TokenChunker; 023 024 /** 025 * See <a href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html">The Java Language Specification, Third Edition: Lexical Structure</a> 026 * and <a href="http://www.jcp.org/en/jsr/detail?id=334">JSR334 (Java 7 - binary integral literals and underscores in numeric literals)</a>. 027 * 028 * <p> 029 * We decided to use dollar sign as a prefix for normalization, even if it can be a part of an identifier, 030 * because according to Java Language Specification it supposed to be used only in mechanically generated source code. 031 * Thus probability to find it within a normal code should be low. 032 * </p> 033 */ 034 public final class JavaTokenProducer { 035 036 private JavaTokenProducer() { 037 } 038 039 private static final String NORMALIZED_CHARACTER_LITERAL = "$CHARS"; 040 private static final String NORMALIZED_NUMERIC_LITERAL = "$NUMBER"; 041 042 private static final String EXP = "([Ee][+-]?+[0-9_]++)"; 043 private static final String BINARY_EXP = "([Pp][+-]?+[0-9_]++)"; 044 045 private static final String FLOAT_SUFFIX = "[fFdD]"; 046 private static final String INT_SUFFIX = "[lL]"; 047 048 public static TokenChunker build() { 049 return TokenChunker.builder() 050 // White Space 051 .ignore("\\s") 052 // Comments 053 .ignore("//[^\\n\\r]*+") 054 .ignore("/\\*[\\s\\S]*?\\*/") 055 // String Literals 056 .token("\"([^\"\\\\]*+(\\\\[\\s\\S])?+)*+\"", NORMALIZED_CHARACTER_LITERAL) 057 // Character Literals 058 .token("'([^'\\n\\\\]*+(\\\\.)?+)*+'", NORMALIZED_CHARACTER_LITERAL) 059 // Identifiers, Keywords, Boolean Literals, The Null Literal 060 .token("\\p{javaJavaIdentifierStart}++\\p{javaJavaIdentifierPart}*+") 061 // Floating-Point Literals 062 .token("[0-9_]++\\.([0-9_]++)?+" + EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal 063 .token("\\.[0-9_]++" + EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal 064 .token("[0-9_]++" + EXP + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal 065 .token("0[xX][0-9a-fA-F_]++\\.[0-9a-fA-F_]*+" + BINARY_EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal 066 .token("0[xX][0-9a-fA-F_]++" + BINARY_EXP + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal 067 // Integer Literals 068 .token("0[xX][0-9a-fA-F_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Hexadecimal 069 .token("0[bB][01_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Binary (Java 7) 070 .token("[0-9_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL) // Decimal and Octal 071 // Any other character 072 .token(".") 073 .build(); 074 } 075 076 }