001 /*
002 * Sonar, open source software quality management tool.
003 * Copyright (C) 2008-2011 SonarSource
004 * mailto:contact AT sonarsource DOT com
005 *
006 * Sonar is free software; you can redistribute it and/or
007 * modify it under the terms of the GNU Lesser General Public
008 * License as published by the Free Software Foundation; either
009 * version 3 of the License, or (at your option) any later version.
010 *
011 * Sonar is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014 * Lesser General Public License for more details.
015 *
016 * You should have received a copy of the GNU Lesser General Public
017 * License along with Sonar; if not, write to the Free Software
018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
019 */
020
021 /**
022 * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
023 */
024 package net.sourceforge.pmd.cpd;
025
026 import java.io.BufferedReader;
027 import java.io.CharArrayReader;
028 import java.util.NoSuchElementException;
029 import java.util.StringTokenizer;
030
031 /**
032 * This class does a best-guess try-anything tokenization.
033 *
034 * @author jheintz
035 */
036 public class AnyTokenizer implements Tokenizer {
037 public static final String TOKENS = " \t!#$%^&*(){}-=+<>/\\`~;:";
038
039 public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
040 StringBuffer sb = sourceCode.getCodeBuffer();
041 BufferedReader reader = new BufferedReader(new CharArrayReader(sb.toString().toCharArray()));
042 try {
043 int lineNumber = 1;
044 String line = reader.readLine();
045 while (line != null) {
046 StringTokenizer tokenizer = new StringTokenizer(line, TOKENS, true);
047 try {
048 String token = tokenizer.nextToken();
049 while (token != null) {
050 if (!token.equals(" ") && !token.equals("\t")) {
051 tokenEntries.add(new TokenEntry(token, sourceCode.getFileName(), lineNumber));
052 }
053 token = tokenizer.nextToken();
054 }
055 } catch (NoSuchElementException ex) {
056 // done with tokens
057 }
058 // advance iteration variables
059 line = reader.readLine();
060 lineNumber++;
061 }
062 } catch (Exception ex) {
063 ex.printStackTrace();
064 } finally {
065 try {
066 reader.close();
067 } catch (Exception ex) {
068 }
069 tokenEntries.add(TokenEntry.getEOF());
070 }
071 }
072 }