Beware of Byte Order Marks

When dealing with text files having a Unicode encoding, some tools will prepend a special character called a byte order mark (BOM) to the file. The BOM is from 2 to 4 bytes long, according to the encoding. (In the UTF-8 encoding, for example, the byte order mark is 3 bytes long.) The BOM is often not rendered visually in an editor, so it can be difficult to detect.

Probably the most widely used Unicode encoding is UTF-8. However, the UTF-8 standard neither requires nor recommends a BOM. When a text file having UTF-8 encoding is under your control, you should usually ensure that it does not get saved with a byte order mark, because a BOM can lead to problems. (It should always be safe to remove it.)

This is particularly important in the context of a web application. HTML, JavaServer Pages, tag files, and so on, should usually be served using UTF-8 encoding, without a BOM. Otherwise, the end user may see strange-looking characters in their browser, or perhaps extra, unwanted empty lines. BOMs can also cause cascading style sheets to malfunction.

Unfortunately, on the Windows operating system, the Notepad tool will always add a byte order mark when saving text files as UTF-8, so you need to exercise care when using Notepad.

Some text editors will give you information about the presence or absence of a BOM, and some will not. To help you control your source code, here's a class which will detect and optionally remove UTF-8 BOMs from a source tree:


import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/** 
 Detect and remove Byte Order Marks (BOMs) from text files saved with a
 Unicode encoding.

 <P>Dev tool only. If you use this tool to remove BOMs, please ensure 
 you have made a backup.
 
 <P>This class assumes the UTF-8 encoding for the BOM, but 
 is easily changed to handle any encoding. 
 See http://en.wikipedia.org/wiki/Byte_order_mark for more info.
 JDK 5+.
*/ 
public final class BomDetector {
  
  /** Run the tool against a root directory.*/
  public static void main(String... aArgs) throws IOException{
    BomDetector bom = new BomDetector(
     "C:\\Temp3\\test\\", 
     ".txt", ".jsp", ".jspf", ".tag", ".html", 
     ".css", ".xml", ".js", ".sql", ".tld"
    );
  
    int count = 0;
    for(String file : bom.findBOMs()){
      log(file);
      ++count;
    }
    log("Number of files with BOM:" + count);
    
    /*
    for(String file : bom.removeBOMs()){
      log("Removed BOM from: " + file);
    }
    */
  }
  
  public BomDetector(String aRootDirectory, String... aFileExtensions){
    fRootDir = new File(aRootDirectory);
    fExtensions = Arrays.asList(aFileExtensions);
    if(!fRootDir.exists() || fRootDir.isFile() ){
      throw new RuntimeException("Root directory not valid.");
    }
  }
  
  /** Find files with BOMs under the given root directory. Return their names. */
  public List<String> findBOMs() throws IOException {
    List<String> result = new ArrayList<String>();
    for(File textFile : findTextFilesBeneath(fRootDir)){
      if(startsWithBOM(textFile)){
        result.add(textFile.getCanonicalPath());
      }
    }
    return result;
  }
  
  /** 
   Find and remove BOMs from files under the given root directory.  
   Overwrites files.
   Return the names of the affected files.
  */
  public List<String> removeBOMs() throws IOException{
    List<String> result = new ArrayList<String>();
    for(String bomFile : findBOMs()){
      stripBomFrom(bomFile);
      result.add(bomFile);
    }
    return result;
  }
  
  // PRIVATE
  private File fRootDir;
  private List<String> fExtensions;
  
  /** Different encodings will have different BOMs. This is for UTF-8. */
  private final int[] BYTE_ORDER_MARK = {239, 187, 191};
  
  private static void log(Object aThing){
    System.out.println(String.valueOf(aThing));
  }
  
  private List<File> findTextFilesBeneath(File aStartingDir) throws IOException {
    List<File> result = new ArrayList<File>();
    File[] filesAndDirs = aStartingDir.listFiles();
    List<File> filesDirs = Arrays.asList(filesAndDirs);
    for(File file : filesDirs){
      if (isTextFile(file)){
        result.add(file);
      }
      if( file.isDirectory() ) {
        //recursive call!!
        List<File> deeperList = findTextFilesBeneath(file);
        result.addAll(deeperList);
      }
    }
    return result;
  }
  
  private boolean isTextFile(File aFile) throws IOException{
    boolean result = false;
    String fileName = aFile.getCanonicalPath();
    int finalDot = fileName.lastIndexOf(".");
    if (finalDot > -1){
      String extension = fileName.substring(finalDot);
      result = fExtensions.contains(extension);
    }
    return result;
  }
  
  private boolean startsWithBOM(File aTextFile) throws IOException {
    boolean result = false;
    if(aTextFile.length() < BYTE_ORDER_MARK.length) return false;
    //open as bytes here, not characters
    int[] firstFewBytes = new int[BYTE_ORDER_MARK.length];
    InputStream input = null;
    try {
      input = new FileInputStream(aTextFile);
      for(int index = 0; index < BYTE_ORDER_MARK.length; ++index){
        firstFewBytes[index] = input.read(); //read a single byte
      }
      result = Arrays.equals(firstFewBytes, BYTE_ORDER_MARK);
    }
    finally {
      input.close();
    }
    return result;
  }
  
  private void stripBomFrom(String aTextFile) throws IOException{
    File bomFile = new File(aTextFile);
    long initialSize = bomFile.length();
    long truncatedSize = initialSize - BYTE_ORDER_MARK.length;
    byte[] memory = new byte[(int)(truncatedSize)];
    InputStream input = null;
    try {
      input = new BufferedInputStream(new FileInputStream(bomFile));
      input.skip(BYTE_ORDER_MARK.length);
      int totalBytesReadIntoMemory = 0;
      while(totalBytesReadIntoMemory < truncatedSize){
        int bytesRemaining = (int)truncatedSize - totalBytesReadIntoMemory;
        int bytesRead = input.read(memory, totalBytesReadIntoMemory, bytesRemaining);
        if(bytesRead > 0){
          totalBytesReadIntoMemory = totalBytesReadIntoMemory + bytesRead;
        }
      }
      overwriteWithoutBOM(memory, bomFile);
    }
    finally {
      input.close();
    }
    File after = new File(aTextFile);
    long finalSize = after.length();
    long changeInSize = initialSize - finalSize;
    if(changeInSize != BYTE_ORDER_MARK.length){
      throw new RuntimeException(
        "Change in file size: " + changeInSize + 
        " Expected change: " + BYTE_ORDER_MARK.length  
      );
    }
  }
  
  private void overwriteWithoutBOM(
    byte[] aBytesWithoutBOM, File aTextFile
  ) throws IOException{
    OutputStream output = null;
    try {
      output = new BufferedOutputStream(new FileOutputStream(aTextFile));
      output.write(aBytesWithoutBOM);
    }
    finally {
      output.close();
    }
  }
} 



See Also :
Prefer UTF-8 in all layers
Would you use this technique?
Yes   No   Undecided   
© 2014 Hirondelle Systems | Source Code | Contact | License | RSS
Individual code snippets can be used under this BSD license - Last updated on September 21, 2013.
Over 2,000,000 unique IPs last year - Built with WEB4J.
- In Memoriam : Bill Dirani -