Parse text

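There are various ways of parsing text. The usual tools include the Scanner class, the StringTokenizer class, and regular expressions (the Pattern and Matcher classes). Each is demonstrated below.

Example 1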

This example uses Scanner. Here, the contents of a file containing name-value pairs are read, and each line is parsed into its constituent data.


import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Scanner;

/**
 Reads a text file line by line with {@link Scanner}, and parses each line
 as a name-value pair.

 <P>Assumes UTF-8 encoding. JDK 7+.
*/
public class ReadWithScanner {

  /** Parses the hard-coded sample file and logs each name-value pair found. */
  public static void main(String... aArgs) throws IOException {
    ReadWithScanner parser = new ReadWithScanner("C:\\Temp\\test.txt");
    parser.processLineByLine();
    log("Done.");
  }
  
  /**
   Constructor.
   @param aFileName full name of an existing, readable file.
  */
  public ReadWithScanner(String aFileName){
    fFilePath = Paths.get(aFileName);
  }
  
  
  /**
   Template method that calls {@link #processLine(String)} once for each
   line in the file.

   @throws IOException if the file cannot be opened for reading.
  */
  public final void processLineByLine() throws IOException {
    //try-with-resources closes the Scanner, and with it the underlying file
    try (Scanner scanner =  new Scanner(fFilePath, ENCODING.name())){
      while (scanner.hasNextLine()){
        processLine(scanner.nextLine());
      }      
    }
  }
  
  /** 
   Overridable method for processing lines in different ways.
    
   <P>This simple default implementation expects simple name-value pairs, separated by an 
   '=' sign. Examples of valid input: 
   <tt>height = 167cm</tt>
   <tt>mass =  65kg</tt>
   <tt>disposition =  "grumpy"</tt>
   <tt>this is the name = this is the value</tt>

   <P>A line which does not supply both a name and a value (for example, an empty 
   line, or a line having no '=' sign) is logged as invalid, and is otherwise ignored.
   Any text following a second '=' sign is not used.

   @param aLine a single line of text, without its line terminator.
  */
  protected void processLine(String aLine){
    //use a second Scanner to parse the content of each line 
    try (Scanner scanner = new Scanner(aLine)) {
      scanner.useDelimiter("=");
      //check before each read: next() throws NoSuchElementException when no token remains
      String name = scanner.hasNext() ? scanner.next() : null;
      String value = scanner.hasNext() ? scanner.next() : null;
      if (name != null && value != null){
        log("Name is : " + quote(name.trim()) + ", and Value is : " + quote(value.trim()));
      }
      else {
        log("Empty or invalid line. Unable to process.");
      }
    }
  }
  
  // PRIVATE 
  private final Path fFilePath;
  private final static Charset ENCODING = StandardCharsets.UTF_8;  
  
  /** Writes the text form of the given object to standard output, on its own line. */
  private static void log(Object aObject){
    System.out.println(String.valueOf(aObject));
  }
  
  /** Returns the given text surrounded by single quotes. */
  private static String quote(String aText){
    String QUOTE = "'";
    return QUOTE + aText + QUOTE;
  }
} 


For a file containing:
height = 167cm
mass =  65kg
disposition =  "grumpy"
this is the name = this is the value
the output of the above class is:
Name is : 'height', and Value is : '167cm'
Name is : 'mass', and Value is : '65kg'
Name is : 'disposition', and Value is : '"grumpy"'
Name is : 'this is the name', and Value is : 'this is the value'
Done.

Example 2

This example uses StringTokenizer. The SearchBoxParser class below parses the text entered into a search box on a web page. It returns a Set of tokens to be used for pattern matching. Here, any text appearing in double quotes is treated as a single search token. All other text is split into tokens based simply on whitespace.

An example run:

>java -cp . SearchBoxParser
[mars, venus, milky way, sun]


import java.util.*;

/**
* The user enters text into a search box. This class is used
* to parse that text into specific search terms (or tokens).
* It eliminates common words (ignoring case), and allows for the quoting
* of text, using double quotes.
* JDK 7+.
*/
public final class SearchBoxParser {

  /** Parses a fixed example search string, and prints the resulting tokens. */
  public static void main(String... aArguments) {
    SearchBoxParser parser = new SearchBoxParser("mars venus \"milky way\" sun");
    Set<String> tokens = parser.parseSearchText();
    //display the tokens
    System.out.println(tokens);
  }

  /**
  * @param aSearchText is non-null, but may have no content,
  * and represents what the user has input in a search box.
  * @throws IllegalArgumentException if aSearchText is null.
  */
  public SearchBoxParser(String aSearchText) {
    if (aSearchText == null) {
      throw new IllegalArgumentException("Search Text cannot be null.");
    }
    fSearchText = aSearchText;
  }

  /**
  * Parse the user's search box input into a Set of String tokens.
  *
  * @return Set of Strings, one for each word in fSearchText, in the order
  * of first appearance; here "word" is defined as either a lone word
  * surrounded by whitespace, or as a series of words surrounded by double
  * quotes, "like this"; also, very common words (and, the, etc.), in any
  * mix of upper and lower case, do not qualify as possible search targets.
  */
  public Set<String> parseSearchText() {
    Set<String> result = new LinkedHashSet<>();

    //the delimiters themselves are returned as tokens, so that the
    //double quotes can be detected, and used to switch parsing mode
    boolean returnTokens = true;
    String currentDelims = fWHITESPACE_AND_QUOTES;
    StringTokenizer parser = new StringTokenizer(
      fSearchText, currentDelims, returnTokens
    );

    while (parser.hasMoreTokens()) {
      String token = parser.nextToken(currentDelims);
      if (isDoubleQuote(token)){
        //entering or leaving a quoted phrase
        currentDelims = flipDelimiters(currentDelims);
      }
      else {
        addNonTrivialWordToResult(token, result);
      }
    }
    return result;
  }

  // PRIVATE 
  private final String fSearchText;
  private static final String fDOUBLE_QUOTE = "\"";

  //the parser flips between these two sets of delimiters
  private static final String fWHITESPACE_AND_QUOTES = " \t\r\n\"";
  private static final String fQUOTES_ONLY ="\"";

  /**Very common words to be excluded from searches. All lower case; read-only.*/
  private static final Set<String> fCOMMON_WORDS = Collections.unmodifiableSet(
    new LinkedHashSet<String>(Arrays.asList(
      "a", "and", "be", "for", "from", "has", "i",
      "in", "is", "it", "of", "on", "to", "the"
    ))
  );

  /**
  * Use to determine if a particular word entered in the
  * search box should be discarded from the search.
  * The comparison ignores case, so "The" is treated the same as "the".
  */
  private boolean isCommonWord(String aSearchTokenCandidate){
    //Locale.ROOT keeps the lower-casing independent of the default locale
    return fCOMMON_WORDS.contains(aSearchTokenCandidate.toLowerCase(Locale.ROOT));
  }

  /** Return true only if aText is non-null, and contains more than whitespace. */
  private boolean textHasContent(String aText){
    return (aText != null) && (!aText.trim().equals(""));
  }

  /**
  * Add the trimmed token to aResult, unless it has no content,
  * or is a common word.
  */
  private void addNonTrivialWordToResult(String aToken, Set<String> aResult){
    if (!textHasContent(aToken)) {
      return;
    }
    String word = aToken.trim();
    if (!isCommonWord(word)) {
      aResult.add(word);
    }
  }

  /** Return true only if aToken is exactly one double quote character. */
  private boolean isDoubleQuote(String aToken){
    return aToken.equals(fDOUBLE_QUOTE);
  }

  /** Return the set of delimiters which is not currently in use. */
  private String flipDelimiters(String aCurrentDelims){
    return aCurrentDelims.equals(fWHITESPACE_AND_QUOTES)
      ? fQUOTES_ONLY
      : fWHITESPACE_AND_QUOTES;
  }
} 


Example 3

This example demonstrates use of regular expressions, by parsing a fully-qualified type name into two parts - the package and the "simple" type name.


import java.util.regex.*;

/**
* Demonstrates regular expressions, by parsing a fully-qualified type name
* into two parts - the package, and the "simple" type name.
*/
public final class RegularExpressions {

  /**
  * The pattern is matched to the first argument.
  * If no argument is supplied, a usage message is printed instead.
  */
  public static void main (String... aArguments) {
    if (aArguments.length == 0) {
      log("Usage: java RegularExpressions <text-to-match>");
      return;
    }
    matchParts(aArguments[0]);
    matchAll(aArguments[0]);
  }

  /**
  * The Matcher.find method attempts to match *parts* of the input
  * to the given pattern. Each match found is logged in turn.
  */
  private static void matchParts(String aText){
    log(fNEW_LINE + "Match PARTS:");
    Matcher matcher = fPATTERN.matcher(aText);
    while (matcher.find()) {
      logGroups(matcher);
    }
  }

  /**
  * The Matcher.matches method attempts to match the *entire*
  * input to the given pattern all at once.
  */
  private static void matchAll(String aText){
    log(fNEW_LINE + "Match ALL:");
    Matcher matcher = fPATTERN.matcher(aText);
    if(matcher.matches()) {
      logGroups(matcher);
    }
    else {
      log("Input does not match pattern.");
    }
  }

  //PRIVATE

  private static final String fNEW_LINE = System.getProperty("line.separator");
  
  /** Log the number of groups, and the two groups of the current match. */
  private static void logGroups(Matcher aMatcher){
    log("Num groups: " + aMatcher.groupCount());
    log("Package: " + aMatcher.group(1));
    log("Class: " + aMatcher.group(2));
  }

  /** Write the message to standard output, on its own line. */
  private static void log(String aMessage){
    System.out.println(aMessage);
  }

  /**
  * A commented regular expression for fully-qualified type names which
  * follow the common naming conventions, for example, "com.myappBlah.Thing".
  *
  * Thus, the "dot + capital letter" is sufficient to define where the
  * package names end.
  *
  * This regular expression uses two groups, one for the package, and one
  * for the class. Groups are defined by parentheses. Note that ?: will
  * define a group as "non-contributing"; that is, it will not contribute
  * to the return values of the <tt>group</tt> method.
  * 
  * As you can see, regular expressions are often cryptic.
  */
  private static final String fREGEXP =
    "#Group1 - Package prefix without last dot: " + fNEW_LINE +
    "( (?:\\w|\\.)+ ) \\." + fNEW_LINE +
    "#Group2 - Class name starts with uppercase: " + fNEW_LINE +
    "( [A-Z](?:\\w)+ )"
  ;

  /**
  * Compiled once, and shared by all callers. Note the necessity of the
  * comments flag, since the regular expression contains comments and
  * whitespace used only for layout. Must be declared after fREGEXP.
  */
  private static final Pattern fPATTERN = Pattern.compile(fREGEXP, Pattern.COMMENTS);
} 


Some example runs:

>java -cp . RegularExpressions "java.java.Thing java.lang.Random"

Match PARTS:
Num groups: 2
Package: java.java
Class: Thing
Num groups: 2
Package: java.lang
Class: Random

Match ALL:
Input does not match pattern.

>java -cp . RegularExpressions "java.java.Thing"

Match PARTS:
Num groups: 2
Package: java.java
Class: Thing

Match ALL:
Num groups: 2
Package: java.java
Class: Thing

>java -cp . RegularExpressions "java.java.Thing "

Match PARTS:
Num groups: 2
Package: java.java
Class: Thing

Match ALL:
Input does not match pattern.


See Also :
Reading and writing text files
Pattern-match lines of a file
Compile regular expressions once
Would you use this technique?
Yes   No   Undecided   
© 2014 Hirondelle Systems | Source Code | Contact | License | RSS
Individual code snippets can be used under this BSD license - Last updated on September 21, 2013.
Over 2,000,000 unique IPs last year - Built with WEB4J.
- In Memoriam : Bill Dirani -