Parse text

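There are various ways of parsing text. The usual tools include the Scanner class, the StringTokenizer class, and regular expressions (the java.util.regex package); each is demonstrated below.

Example 1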

This example uses Scanner. Here, the contents of a file containing name-value pairs are read, and each line is parsed into its constituent data.


import java.io.*;
import java.util.Scanner;

/**
* Reads a text file of name-value pairs line by line, using one {@link Scanner}
* to split the file into lines and a second one to split each line at its
* '=' sign. Each parsed pair is written to standard output.
*/
public final class ReadWithScanner {

  /**
  * Parses the hard-coded file <code>C:\Temp\test.txt</code> and logs each pair.
  *
  * @param aArgs ignored.
  * @throws FileNotFoundException if the file cannot be opened for reading.
  */
  public static void main(String... aArgs) throws FileNotFoundException {
    ReadWithScanner parser = new ReadWithScanner("C:\\Temp\\test.txt");
    parser.processLineByLine();
    log("Done.");
  }
  
  /**
  * @param aFileName full name of an existing, readable file.
  */
  public ReadWithScanner(String aFileName){
    fFile = new File(aFileName);  
  }
  
  /**
  * Template method that calls {@link #processLine(String)} once for every
  * line of the file.
  *
  * @throws FileNotFoundException if the file passed to the constructor
  * cannot be opened for reading.
  */
  public final void processLineByLine() throws FileNotFoundException {
    Scanner scanner = new Scanner(fFile);
    try {
      //first use a Scanner to get each line
      while ( scanner.hasNextLine() ){
        processLine( scanner.nextLine() );
      }
    }
    finally {
      //ensure the underlying stream is always closed
      scanner.close();
    }
  }
  
  /** 
  * Processes a single line of input. (This class is declared final, so
  * the method can only be overridden if that modifier is removed.)
  *  
  * <P>This simple implementation expects simple name-value pairs, separated by an 
  * '=' sign. Examples of valid input : 
  * <br>{@code height = 167cm}
  * <br>{@code mass =  65kg}
  * <br>{@code disposition =  "grumpy"}
  * <br>{@code this is the name = this is the value}
  *
  * <P>A line that does not yield both a name and a value (an empty line, a
  * line without an '=' sign, or a line with nothing after the '=' sign) is
  * reported as invalid instead of causing an exception.
  *
  * @param aLine a single line of text, without its line terminator.
  */
  protected void processLine(String aLine){
    //use a second Scanner to parse the content of each line 
    Scanner scanner = new Scanner(aLine);
    try {
      scanner.useDelimiter("=");
      //each token is fetched only after hasNext() confirms it exists;
      //calling next() blindly throws NoSuchElementException on lines
      //that have a name but no value
      String name = scanner.hasNext() ? scanner.next() : null;
      String value = scanner.hasNext() ? scanner.next() : null;
      if ( name != null && value != null ){
        log("Name is : " + quote(name.trim()) + ", and Value is : " + quote(value.trim()) );
      }
      else {
        log("Empty or invalid line. Unable to process.");
      }
    }
    finally {
      scanner.close();
    }
  }
  
  // PRIVATE //
  private final File fFile;
  
  /** Writes the text form of the given object to standard output. */
  private static void log(Object aObject){
    System.out.println(String.valueOf(aObject));
  }
  
  /** Returns the given text surrounded by single quotes. */
  private String quote(String aText){
    String QUOTE = "'";
    return QUOTE + aText + QUOTE;
  }
} 


For a file containing :
height = 167cm
mass =  65kg
disposition =  "grumpy"
this is the name = this is the value
the output of the above class is :
Name is : 'height', and Value is : '167cm'
Name is : 'mass', and Value is : '65kg'
Name is : 'disposition', and Value is : '"grumpy"'
Name is : 'this is the name', and Value is : 'this is the value'
Done.

Example 2

This example uses StringTokenizer. This class is used to parse the text entered into a search box on a web page. It returns a Set of tokens to be used for pattern matching. Here, any text appearing in quotes is treated as a single search token. All other text is split into tokens based simply on whitespace.

An example run:

>java -cp . SearchBoxParser
[mars, sun, milky way, venus]


import java.util.*;

/**
* The user enters text into a search box. This class is used
* to parse that text into specific search terms (or tokens).
* It eliminates common words, and allows for the quoting of text, using
* double quotes.
*/
public final class SearchBoxParser {

  /**
  * Demonstrates the parser on a fixed search string and prints the
  * resulting tokens.
  *
  * @param aArguments ignored.
  */
  public static void main(String... aArguments) {
    SearchBoxParser parser = new SearchBoxParser( "mars venus \"milky way\" sun" );
    Set<String> tokens = parser.parseSearchText();
    //display the tokens
    System.out.println(tokens);
  }

  /**
  * @param aSearchText is non-null, but may have no content,
  * and represents what the user has input in a search box.
  * @throws IllegalArgumentException if <code>aSearchText</code> is null.
  */
  public SearchBoxParser( String aSearchText ) {
    if ( aSearchText == null ) {
      throw new IllegalArgumentException("Search Text cannot be null.");
    }
    fSearchText = aSearchText;
  }

  /**
  * Parse the user's search box input into a Set of String tokens.
  *
  * @return Set of Strings, one for each word in fSearchText; here "word"
  * is defined as either a lone word surrounded by whitespace, or as a series
  * of words surrounded by double quotes, "like this"; also, very common
  * words (and, the, etc.) do not qualify as possible search targets,
  * regardless of the letter case in which they were typed. Tokens that are
  * kept retain the case entered by the user.
  */
  public Set<String> parseSearchText() {
    Set<String> result = new HashSet<String>();

    //delimiters are returned as tokens too, so that the double quotes
    //which toggle between the two delimiter sets can be detected
    boolean returnTokens = true;
    String currentDelims = fWHITESPACE_AND_QUOTES;
    StringTokenizer parser = new StringTokenizer(
      fSearchText,
      currentDelims,
      returnTokens
    );

    while ( parser.hasMoreTokens() ) {
      String token = parser.nextToken(currentDelims);
      if ( isDoubleQuote(token) ){
        //entering or leaving a quoted phrase
        currentDelims = flipDelimiters(currentDelims);
      }
      else {
        addNonTrivialWordToResult( token, result );
      }
    }
    return result;
  }

  // PRIVATE //
  private final String fSearchText;
  private static final String fDOUBLE_QUOTE = "\"";

  //the parser flips between these two sets of delimiters
  private static final String fWHITESPACE_AND_QUOTES = " \t\r\n\"";
  private static final String fQUOTES_ONLY ="\"";

  /**
  * Very common words against which searches will not be
  * performed. Stored in lower case, and never modified after class
  * initialization.
  */
  private static final Set<String> fCOMMON_WORDS = Collections.unmodifiableSet(
    new HashSet<String>( Arrays.asList(
      "a", "and", "be", "for", "from", "has", "i",
      "in", "is", "it", "of", "on", "to", "the"
    ))
  );

  /**
  * Use to determine if a particular word entered in the
  * search box should be discarded from the search. The comparison ignores
  * case, so that "The" is treated the same as "the".
  */
  private boolean isCommonWord( String aSearchTokenCandidate ) {
    //a fixed locale keeps the result independent of the default locale
    return fCOMMON_WORDS.contains( aSearchTokenCandidate.toLowerCase(Locale.ENGLISH) );
  }

  /** Returns true only if the text is non-null and has visible content. */
  private boolean textHasContent(String aText) {
    return (aText != null) && (!aText.trim().equals(""));
  }

  /**
  * Adds the trimmed token to the result, unless it is blank or is a
  * common word.
  */
  private void addNonTrivialWordToResult( String aToken, Set<String> aResult ){
    if ( !textHasContent(aToken) ) {
      return;
    }
    String word = aToken.trim();
    if ( !isCommonWord(word) ) {
      aResult.add(word);
    }
  }

  /** Returns true only if the token is exactly one double quote character. */
  private boolean isDoubleQuote( String aToken ){
    return aToken.equals(fDOUBLE_QUOTE);
  }

  /** Returns whichever of the two delimiter sets is not currently in use. */
  private String flipDelimiters( String aCurrentDelims ) {
    return aCurrentDelims.equals(fWHITESPACE_AND_QUOTES)
      ? fQUOTES_ONLY
      : fWHITESPACE_AND_QUOTES;
  }
} 


Example 3

This example demonstrates use of regular expressions, by parsing a fully-qualified type name into two parts - the package and the "simple" type name.


import java.util.regex.*;

/**
* Demonstrates regular expressions by splitting a fully-qualified type name
* into its package and its "simple" type name, once with
* {@link Matcher#find()} and once with {@link Matcher#matches()}.
*/
public final class RegularExpressions {

  /**
  * The pattern is matched to the first argument. If no argument is
  * supplied, a usage message is written to standard error and nothing
  * else is done.
  *
  * @param aArguments the first element is the text to be matched.
  */
  public static void main (String[] aArguments) {
    if ( aArguments == null || aArguments.length == 0 ) {
      System.err.println("Usage: java RegularExpressions <text-to-match>");
      return;
    }
    matchParts(aArguments[0]);
    matchAll(aArguments[0]);
  }

  /**
  * The Matcher.find method attempts to match *parts* of the input
  * to the given pattern.
  */
  private static void matchParts( String aText ){
    System.out.println(fNEW_LINE + "Match PARTS:");
    Matcher matcher = fPATTERN.matcher( aText );
    while ( matcher.find() ) {
      printGroups(matcher);
    }
  }

  /**
  * The Matcher.matches method attempts to match the *entire*
  * input to the given pattern all at once.
  */
  private static void matchAll( String aText ){
    System.out.println(fNEW_LINE + "Match ALL:");
    Matcher matcher = fPATTERN.matcher( aText );
    if( matcher.matches() ) {
      printGroups(matcher);
    }
    else {
      System.err.println("Input does not match pattern.");
    }
  }

  /**
  * Prints the number of groups, then the package and class groups of the
  * most recent successful match of the given matcher.
  */
  private static void printGroups( Matcher aMatcher ){
    System.out.println("Num groups: " + aMatcher.groupCount());
    System.out.println("Package: " + aMatcher.group(1));
    System.out.println("Class: " + aMatcher.group(2));
  }

  //PRIVATE //

  //(these static fields are initialized in textual order, and each one
  //depends on the one declared before it)

  private static final String fNEW_LINE = System.getProperty("line.separator");

  /**
  * A commented regular expression for fully-qualified type names which
  * follow the common naming conventions, for example, "com.myappBlah.Thing".
  *
  * Thus, the "dot + capital letter" is sufficient to define where the
  * package names end.
  *
  * This regular expression uses two groups, one for the package, and one
  * for the class. Groups are defined by parentheses. Note that ?: will
  * define a group as "non-contributing"; that is, it will not contribute
  * to the return values of the <code>group</code> method.
  */
  private static final String fREGEXP =
      "#Group1 - Package prefix without last dot: " + fNEW_LINE +
      "( (?:\\w|\\.)+ ) \\." + fNEW_LINE +
      "#Group2 - Class name starts with uppercase: " + fNEW_LINE +
      "( [A-Z](?:\\w)+ )";

  /**
  * Compiled once and shared by all callers.
  * (Note the necessity of the comments flag, since our regular
  * expression contains comments.)
  */
  private static final Pattern fPATTERN = Pattern.compile( fREGEXP, Pattern.COMMENTS );
} 


Some example runs:

>java -cp . RegularExpressions "java.java.Thing java.lang.Random"

Match PARTS:
Num groups: 2
Package: java.java
Class: Thing
Num groups: 2
Package: java.lang
Class: Random

Match ALL:
Input does not match pattern.

>java -cp . RegularExpressions "java.java.Thing"

Match PARTS:
Num groups: 2
Package: java.java
Class: Thing

Match ALL:
Num groups: 2
Package: java.java
Class: Thing

>java -cp . RegularExpressions "java.java.Thing "

Match PARTS:
Num groups: 2
Package: java.java
Class: Thing

Match ALL:
Input does not match pattern.


See Also :
Reading and writing text files
Pattern-match lines of a file
Compile regular expressions once
Would you use this technique?
Yes   No   Undecided   
© 2009 Hirondelle Systems | Source Code | Contact | License | Quotes | RSS
Individual classes can be used under this BSD License - Last updated on June 28, 2009.
Over 115,000 unique IPs last month - Built with WEB4J.
- In Memoriam : Bill Dirani -