001    /*
002     * Sonar, open source software quality management tool.
003     * Copyright (C) 2008-2011 SonarSource
004     * mailto:contact AT sonarsource DOT com
005     *
006     * Sonar is free software; you can redistribute it and/or
007     * modify it under the terms of the GNU Lesser General Public
008     * License as published by the Free Software Foundation; either
009     * version 3 of the License, or (at your option) any later version.
010     *
011     * Sonar is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014     * Lesser General Public License for more details.
015     *
016     * You should have received a copy of the GNU Lesser General Public
017     * License along with Sonar; if not, write to the Free Software
018     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
019     */
020    
021    /**
022     * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
023     */
024    package net.sourceforge.pmd.cpd;
025    
026    import java.io.BufferedReader;
027    import java.io.CharArrayReader;
028    import java.util.NoSuchElementException;
029    import java.util.StringTokenizer;
030    
031    /**
032     * This class does a best-guess try-anything tokenization.
033     *
034     * @author jheintz
035     */
036    public class AnyTokenizer implements Tokenizer {
037        public static final String TOKENS = " \t!#$%^&*(){}-=+<>/\\`~;:";
038    
039        public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
040            StringBuffer sb = sourceCode.getCodeBuffer();
041            BufferedReader reader = new BufferedReader(new CharArrayReader(sb.toString().toCharArray()));
042            try {
043                int lineNumber = 1;
044                String line = reader.readLine();
045                while (line != null) {
046                    StringTokenizer tokenizer = new StringTokenizer(line, TOKENS, true);
047                    try {
048                        String token = tokenizer.nextToken();
049                        while (token != null) {
050                            if (!token.equals(" ") && !token.equals("\t")) {
051                                tokenEntries.add(new TokenEntry(token, sourceCode.getFileName(), lineNumber));
052                            }
053                            token = tokenizer.nextToken();
054                        }
055                    } catch (NoSuchElementException ex) {
056                        // done with tokens
057                    }
058                    // advance iteration variables
059                    line = reader.readLine();
060                    lineNumber++;
061                }
062            } catch (Exception ex) {
063                ex.printStackTrace();
064            } finally {
065                try {
066                    reader.close();
067                } catch (Exception ex) {
068                }
069                tokenEntries.add(TokenEntry.getEOF());
070            }
071        }
072    }