Java Practices->Fetch text from the web

Fetch text from the web

Most URLs return text of some sort - HTML, JSON, XML, plain text, and so on. Here's an example of programmatically fetching the content of a URL as a single String.

If you need data from a site, but the site has no formal web API, then you usually have no choice but to fetch HTML from the site, parse it, and extract the required data.

import java.io.*;
import java.net.*;
import java.util.Objects;
import java.util.Scanner;

/** 
 Fetch the text of a web page (or its HTTP headers) as a String.
 The same sort of code can be used to fetch any kind of text: 
 HTML, JSON, XML, plain text, and so on. 

 <P>Only URLs using the <tt>http</tt> protocol are accepted. 
 Failures to open or read the connection are logged to standard out, 
 not thrown; see each method for the value returned in that case.
*/
public final class WebPageFetcher {

  /**
   Demo harness.
   <ul>
    <li>args[0] : an HTTP URL
    <li>args[1] : (header | content)
   </ul>
   If either argument is missing, a usage message is logged and nothing is fetched.

   @throws MalformedURLException if args[0] cannot be parsed as a URL.
  */
  public static void main(String... args) throws MalformedURLException {
    if (args.length < 2) {
      //fail with a readable message, instead of an ArrayIndexOutOfBoundsException
      log("Usage: java WebPageFetcher <http-url> (" + HEADER + " | " + CONTENT + ")");
      return;
    }
    String url = args[0];
    String option = args[1];
    WebPageFetcher fetcher = new WebPageFetcher(url);
    if (HEADER.equalsIgnoreCase(option)) {
      log(fetcher.getPageHeader());
    }
    else if (CONTENT.equalsIgnoreCase(option)) {
      log(fetcher.getPageContent());
    }
    else {
      log("Unknown option.");
    }
  }

  /**
   @param url the location to fetch from; its protocol must be <tt>http</tt>.
   @throws NullPointerException if <tt>url</tt> is null.
   @throws IllegalArgumentException if the protocol is not <tt>http</tt>.
  */
  public WebPageFetcher(URL url){
    Objects.requireNonNull(url, "url");
    if (! HTTP.equals(url.getProtocol())) {
      throw new IllegalArgumentException("URL is not for HTTP Protocol: " + url);
    }
    this.url = url;
  }

  /**
   @param urlName text form of the URL; its protocol must be <tt>http</tt>.
   @throws MalformedURLException if <tt>urlName</tt> cannot be parsed as a URL.
   @throws IllegalArgumentException if the protocol is not <tt>http</tt>.
  */
  public WebPageFetcher(String urlName) throws MalformedURLException {
    this(new URL(urlName));
  }

  /** 
   Fetch the content of the URL as simple text.

   <P>The bytes are decoded using the charset named in the response's 
   <tt>Content-Type</tt>, when present and supported; otherwise UTF-8 is used.

   @return the body text; an empty String if the body is empty; 
   <tt>null</tt> if the connection cannot be opened or read (the failure is logged).
  */
  public String getPageContent() {
    try {
      URLConnection connection = url.openConnection();
      String charsetName = charsetOf(connection);
      try (Scanner scanner = new Scanner(connection.getInputStream(), charsetName)) {
        scanner.useDelimiter(END_OF_INPUT);
        //an empty body has no token at all; next() would throw NoSuchElementException
        return scanner.hasNext() ? scanner.next() : "";
      }
    }
    catch (IOException ex) {
      log("Cannot open connection to " + url.toString());
      return null;
    }
  }

  /** 
   Fetch all HTTP headers as simple text.
   One header per line, as a 'key : value' pair.  

   @return the headers; an empty String if there are none, or if the 
   connection cannot be opened (the failure is logged).
  */
  public String getPageHeader(){
    StringBuilder result = new StringBuilder();

    URLConnection connection;
    try {
      connection = url.openConnection();
    }
    catch (IOException ex) {
      log("Cannot open connection to URL: " + url);
      //without a connection there is nothing to read; avoid dereferencing null below
      return result.toString();
    }

    //not all headers come in key-value pairs - sometimes the key is
    //null or an empty String
    int headerIdx = 0;
    String headerValue;
    while ( (headerValue = connection.getHeaderField(headerIdx)) != null ) {
      String headerKey = connection.getHeaderFieldKey(headerIdx);
      if (headerKey != null && headerKey.length()>0) {
        result.append(headerKey);
        result.append(" : ");
      }
      result.append(headerValue);
      result.append(NEWLINE);
      headerIdx++;
    }
    return result.toString();
  }

  // PRIVATE
  private final URL url;
  
  private static final String HTTP = "http";
  private static final String HEADER = "header";
  private static final String CONTENT = "content";
  private static final String END_OF_INPUT = "\\Z";
  private static final String NEWLINE = System.lineSeparator();
  private static final String CHARSET_PARAM = "charset=";
  private static final String DEFAULT_CHARSET = "UTF-8";

  /**
   Return the name of the charset to decode the response with: the 
   <tt>charset</tt> parameter of the Content-Type, if it names a charset 
   supported by this JVM, otherwise UTF-8.
  */
  private static String charsetOf(URLConnection connection){
    String contentType = connection.getContentType();
    if (contentType == null) {
      return DEFAULT_CHARSET;
    }
    //example value: 'text/html; charset=UTF-8'
    for (String parameter : contentType.split(";")) {
      String candidate = parameter.trim();
      if (candidate.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
        String name = candidate.substring(CHARSET_PARAM.length()).trim();
        if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
          //the value may be a quoted string
          name = name.substring(1, name.length() - 1);
        }
        if (isSupportedCharset(name)) {
          return name;
        }
      }
    }
    return DEFAULT_CHARSET;
  }

  /** Return true only if the name is a legal charset name known to this JVM. */
  private static boolean isSupportedCharset(String name){
    try {
      return java.nio.charset.Charset.isSupported(name);
    }
    catch (IllegalArgumentException ex) {
      //the name is empty or syntactically illegal
      return false;
    }
  }

  private static void log(Object thing){
    System.out.println(Objects.toString(thing));
  }
}

An example run, fetching the HTTP headers from date4j.net:

>java -cp . WebPageFetcher http://www.date4j.net/ header
HTTP/1.1 200 OK
Date : Tue, 14 Nov 2017 00:24:05 GMT
Accept-Ranges : bytes
ETag : W/"21757-1441143396000"
Last-Modified : Tue, 01 Sep 2015 21:36:36 GMT
Content-Type : text/html; charset=UTF-8
Content-Length : 21757
Vary : Accept-Encoding
Keep-Alive : timeout=3, max=100
Connection : Keep-Alive