204 lines
7.0 KiB
Java
204 lines
7.0 KiB
Java
|
|
/*
|
||
|
|
* The Broad Institute
|
||
|
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||
|
|
* This software and its documentation are copyright 2009 by the
|
||
|
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||
|
|
*
|
||
|
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||
|
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||
|
|
*/
|
||
|
|
package edu.mit.broad.picard.util;
|
||
|
|
|
||
|
|
import edu.mit.broad.picard.PicardException;
|
||
|
|
import edu.mit.broad.sam.util.CloseableIterator;
|
||
|
|
|
||
|
|
import java.util.Iterator;
|
||
|
|
import java.util.NoSuchElementException;
|
||
|
|
import java.io.Closeable;
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Class for parsing text files where each line consists of fields separated by whitespace.
|
||
|
|
* Code is abstracted into this class so that we can optimize its performance over time.
|
||
|
|
*
|
||
|
|
* This class assumes that every line will have the same number of whitespace-separated "words"
|
||
|
|
* and that lines that start with "#" are comments and should be ignored.
|
||
|
|
*
|
||
|
|
* Classes that extend this parser can do so simply by implementing their own constructors and the
|
||
|
|
* readNextLine(), close(), and getFileName() methods.
|
||
|
|
*
|
||
|
|
* @author Kathleen Tibbetts
|
||
|
|
*/
|
||
|
|
public abstract class AbstractTextFileParser implements Iterable<String[]>, CloseableIterator<String[]> {
|
||
|
|
|
||
|
|
private boolean treatGroupedDelimitersAsOne = true; // Whether multiple delimiters in succession should be treated as one
|
||
|
|
private byte nextLine[] = null;
|
||
|
|
private int wordCount = 0; /* The number of delimiter-separated "words" per line of the file.
|
||
|
|
We can save a little caclulation, or handle files with varying numbers of
|
||
|
|
words per line, by specifying this if known in advance */
|
||
|
|
private boolean iterating = false;
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Closes this stream and releases any system resources associated with it.
|
||
|
|
*/
|
||
|
|
public abstract void close();
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @return the next line of text from the underlying stream(s) or null if there is no next line
|
||
|
|
*/
|
||
|
|
protected abstract byte[] readNextLine();
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @return the name(s) of the file(s) being parsed, or null if no name is available
|
||
|
|
*/
|
||
|
|
protected abstract String getFileName();
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @return an iterator over a set of elements of type String[]
|
||
|
|
*/
|
||
|
|
public Iterator<String[]> iterator() {
|
||
|
|
if (iterating) {
|
||
|
|
throw new IllegalStateException("iterator() method can only be called once, before the" +
|
||
|
|
"first call to hasNext()");
|
||
|
|
}
|
||
|
|
nextLine = readNextLine();
|
||
|
|
iterating = true;
|
||
|
|
return this;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Returns true if the iteration has more elements.
|
||
|
|
*
|
||
|
|
* @return true if the iteration has more elements. Otherwise returns false.
|
||
|
|
*/
|
||
|
|
public boolean hasNext() {
|
||
|
|
// If this is the start of iteration, queue up the first item
|
||
|
|
if(!iterating) {
|
||
|
|
nextLine = readNextLine();
|
||
|
|
iterating = true;
|
||
|
|
}
|
||
|
|
return nextLine != null;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Returns the next element in the iteration.
|
||
|
|
*
|
||
|
|
* @return the next tlement in the iteration
|
||
|
|
* @throws java.util.NoSuchElementException
|
||
|
|
*/
|
||
|
|
public String[] next() {
|
||
|
|
|
||
|
|
if (!hasNext()) {
|
||
|
|
throw new NoSuchElementException("Iteration from text file(s) " +
|
||
|
|
getFileName() + " has no more elements.");
|
||
|
|
}
|
||
|
|
|
||
|
|
String[] result = parseLine(nextLine);
|
||
|
|
do {
|
||
|
|
nextLine = readNextLine();
|
||
|
|
}
|
||
|
|
while (nextLine != null && isComment(nextLine));
|
||
|
|
return result;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* This method represents the most efficient way (so far) to parse a line of whitespace-delimited text
|
||
|
|
*
|
||
|
|
* @param line the line to parse
|
||
|
|
* @return an array of all the "words"
|
||
|
|
*/
|
||
|
|
private String[] parseLine(byte line[]) {
|
||
|
|
|
||
|
|
if (getWordCount() == 0) {
|
||
|
|
calculateWordCount(line);
|
||
|
|
}
|
||
|
|
String parts[] = new String[getWordCount()];
|
||
|
|
boolean delimiter = true;
|
||
|
|
int index=0;
|
||
|
|
int start = 0;
|
||
|
|
|
||
|
|
try
|
||
|
|
{
|
||
|
|
for (int i = 0; i < line.length; i++) {
|
||
|
|
if (isDelimiter(line[i])) {
|
||
|
|
if (!delimiter) {
|
||
|
|
parts[index++] = new String(line,start,i-start);
|
||
|
|
}
|
||
|
|
else if(!isTreatGroupedDelimitersAsOne()) {
|
||
|
|
parts[index++] = null;
|
||
|
|
}
|
||
|
|
delimiter=true;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
if (delimiter) start = i;
|
||
|
|
delimiter = false;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (!delimiter) {
|
||
|
|
parts[index] = new String(line,start,line.length-start);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
catch (ArrayIndexOutOfBoundsException e) {
|
||
|
|
throw new PicardException("Unexpected number of elements found when parsing file " +
|
||
|
|
this.getFileName() + ": " + index + ". Expected a maximum of " +
|
||
|
|
this.getWordCount() + " elements per line.");
|
||
|
|
}
|
||
|
|
return parts;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Calculates the number of delimiter-separated "words" in a line and sets the value of <code>wordCount</code>
|
||
|
|
*
|
||
|
|
* @param line representative line from the file
|
||
|
|
*/
|
||
|
|
protected void calculateWordCount(byte line[]) {
|
||
|
|
int words = 0;
|
||
|
|
boolean delimiter = true;
|
||
|
|
for (byte b : line) {
|
||
|
|
if (isDelimiter(b)) {
|
||
|
|
if (delimiter && !isTreatGroupedDelimitersAsOne()) words++;
|
||
|
|
delimiter = true;
|
||
|
|
} else {
|
||
|
|
if (delimiter) words++;
|
||
|
|
delimiter = false;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
setWordCount(words);
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Required method for Iterator API.
|
||
|
|
*
|
||
|
|
* @throws UnsupportedOperationException
|
||
|
|
*/
|
||
|
|
public void remove() {
|
||
|
|
throw new UnsupportedOperationException("Remove() not supported.");
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Determines whether a given line is a comment
|
||
|
|
*
|
||
|
|
* @param line the line to evaluate
|
||
|
|
* @return true if the line is a comment (and should be ignored) otherwise false
|
||
|
|
*/
|
||
|
|
protected boolean isComment(byte line[]) {
|
||
|
|
return line[0] == '#';
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Determines whether a given character is a delimiter
|
||
|
|
*
|
||
|
|
* @param b the character to evaluate
|
||
|
|
* @return true if <code>b</code> is a delimiter; otherwise false
|
||
|
|
*/
|
||
|
|
protected boolean isDelimiter(byte b) {
|
||
|
|
return b == ' ' || b == '\t';
|
||
|
|
}
|
||
|
|
|
||
|
|
protected int getWordCount() { return wordCount; }
|
||
|
|
protected void setWordCount(int wordCount) { this.wordCount = wordCount; }
|
||
|
|
protected boolean isTreatGroupedDelimitersAsOne() { return treatGroupedDelimitersAsOne; }
|
||
|
|
protected void setTreatGroupedDelimitersAsOne(boolean treatGroupedDelimitersAsOne) {
|
||
|
|
this.treatGroupedDelimitersAsOne = treatGroupedDelimitersAsOne;
|
||
|
|
}
|
||
|
|
}
|