Fetch web page and header

Here's an example of programmatically fetching the HTML content of a web page as simple text.

This could be used, for example, to fetch stock prices or the weather forecast from the web - the raw HTML is first fetched, then the desired content is extracted and presented in some customized manner.

Of course, if the web site publishes a true API that serves structured data (JSON, XML, or similar), then that API should be used instead of fetching and parsing hypertext.


import java.io.*;
import java.net.*;
import java.util.Scanner;

/**
 * Fetches the HTML content of a web page (or its HTTP response header) as a String.
 *
 * <p>Only {@code http} and {@code https} URLs are accepted. Failures to reach the
 * server are reported through {@link #log(Object)} rather than thrown; see the
 * individual methods for what they return in that case.
 */
public final class WebPageFetcher {

  /**
  * Demo harness.
  *
  * <ul>
  * <li>aArgs[0] : an HTTP URL
  * <li>aArgs[1] : (header | content)
  * </ul>
  *
  * Prints a usage message and returns when fewer than two arguments are given.
  *
  * @throws MalformedURLException if aArgs[0] cannot be parsed as a URL.
  */
  public static void main(String... aArgs) throws MalformedURLException {
    if (aArgs == null || aArgs.length < 2) {
      log("Usage: java WebPageFetcher <url> (header | content)");
      return;
    }
    String url = aArgs[0];
    String option = aArgs[1];
    WebPageFetcher fetcher = new WebPageFetcher(url);
    if (HEADER.equalsIgnoreCase(option)) {
      log(fetcher.getPageHeader());
    }
    else if (CONTENT.equalsIgnoreCase(option)) {
      log(fetcher.getPageContent());
    }
    else {
      log("Unknown option.");
    }
  }

  /**
  * @param aURL the page to fetch; its protocol must be {@code http} or {@code https}.
  * @throws NullPointerException if aURL is null.
  * @throws IllegalArgumentException if the protocol is neither http nor https.
  */
  public WebPageFetcher(URL aURL){
    if (aURL == null) {
      throw new NullPointerException("aURL must not be null");
    }
    String protocol = aURL.getProtocol();
    if (! HTTP.equals(protocol) && ! HTTPS.equals(protocol)) {
      throw new IllegalArgumentException("URL is not for HTTP Protocol: " + aURL);
    }
    fURL = aURL;
  }

  /**
  * @param aUrlName text of an http or https URL.
  * @throws MalformedURLException if aUrlName cannot be parsed as a URL.
  * @throws IllegalArgumentException if the protocol is neither http nor https.
  */
  public WebPageFetcher(String aUrlName) throws MalformedURLException {
    this(new URL(aUrlName));
  }

  /**
  * Fetch the HTML content of the page as simple text.
  *
  * <p>The body is decoded with the charset named in the response's Content-Type,
  * falling back to the platform default when none (or an unsupported one) is given.
  *
  * @return the page body; an empty String if the body is empty; <tt>null</tt> if
  * the page could not be read (the failure is logged).
  */
  public String getPageContent() {
    String result = null;
    InputStream input = null;
    try {
      URLConnection connection = fURL.openConnection();
      input = connection.getInputStream();
      Scanner scanner = new Scanner(input, charsetNameOf(connection));
      scanner.useDelimiter(END_OF_INPUT);
      //an empty body yields no token at all; next() would throw
      result = scanner.hasNext() ? scanner.next() : EMPTY;
    }
    catch (IOException ex) {
      log("Cannot open connection to " + fURL.toString() + " : " + ex);
    }
    finally {
      //closing the stream directly also covers a failure in the Scanner constructor
      closeQuietly(input);
    }
    return result;
  }

  /**
  * Fetch HTML headers as simple text.
  *
  * @return one line per header field, in the order reported by the connection;
  * an empty String if the connection cannot be opened (the failure is logged) or
  * if no header fields are available.
  */
  public String getPageHeader(){
    StringBuilder result = new StringBuilder();

    URLConnection connection;
    try {
      connection = fURL.openConnection();
    }
    catch (IOException ex) {
      log("Cannot open connection to URL: " + fURL + " : " + ex);
      //without a connection there is nothing to read
      return result.toString();
    }

    try {
      //not all headers come in key-value pairs - sometimes the key is
      //null or an empty String
      int headerIdx = 0;
      String headerValue = null;
      while ( (headerValue = connection.getHeaderField(headerIdx)) != null ) {
        String headerKey = connection.getHeaderFieldKey(headerIdx);
        if (headerKey != null && headerKey.length()>0) {
          result.append(headerKey);
          result.append(" : ");
        }
        result.append(headerValue);
        result.append(NEWLINE);
        headerIdx++;
      }
    }
    finally {
      //the body is never read here, so release the connection explicitly
      if (connection instanceof HttpURLConnection) {
        ((HttpURLConnection) connection).disconnect();
      }
    }
    return result.toString();
  }

  // PRIVATE
  private final URL fURL;

  private static final String HTTP = "http";
  private static final String HTTPS = "https";
  private static final String HEADER = "header";
  private static final String CONTENT = "content";
  private static final String END_OF_INPUT = "\\Z";
  private static final String NEWLINE = System.getProperty("line.separator");
  private static final String EMPTY = "";
  private static final String CHARSET_PARAM = "charset=";

  /**
  * Return the name of the charset declared in the connection's Content-Type
  * (for example <tt>text/html; charset=ISO-8859-1</tt>), or the platform default
  * charset name when none is declared or the declared one is not supported.
  */
  private static String charsetNameOf(URLConnection aConnection) {
    String fallback = java.nio.charset.Charset.defaultCharset().name();
    String contentType = aConnection.getContentType();
    if (contentType == null) {
      return fallback;
    }
    String[] parts = contentType.split(";");
    //parts[0] is the media type itself; parameters follow
    for (int idx = 1; idx < parts.length; idx++) {
      String part = parts[idx].trim();
      if (part.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
        String name = part.substring(CHARSET_PARAM.length()).trim();
        if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
          name = name.substring(1, name.length() - 1);
        }
        try {
          if (java.nio.charset.Charset.isSupported(name)) {
            return name;
          }
        }
        catch (IllegalArgumentException ex) {
          //syntactically illegal charset name : use the fallback
        }
      }
    }
    return fallback;
  }

  /** Close the given resource, if any, ignoring a failure to close. */
  private static void closeQuietly(Closeable aResource) {
    if (aResource != null) {
      try {
        aResource.close();
      }
      catch (IOException ex) {
        //nothing useful can be done if closing fails
      }
    }
  }

  private static void log(Object aObject){
    System.out.println(aObject);
  }
}


An example run, fetching an HTTP header from google.com:
>java -cp . WebPageFetcher http://www.google.com/ header
HTTP/1.1 200 OK
Cache-Control : private
Content-Type : text/html; charset=ISO-8859-1
Server : GWS/2.1
Transfer-Encoding : chunked
Date : Wed, 29 Aug 2007 13:21:40 GMT


Would you use this technique?
Yes   No   Undecided   
© 2014 Hirondelle Systems | Source Code | Contact | License | RSS
Individual code snippets can be used under this BSD license - Last updated on September 21, 2013.
Over 2,000,000 unique IPs last year - Built with WEB4J.
- In Memoriam : Bill Dirani -