Tribble integration for indexing the AnnotatorInputTable format
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3385 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2f3933148d
commit
6b96f025f5
|
|
@ -0,0 +1,130 @@
|
||||||
|
package org.broadinstitute.sting.gatk.refdata.features.sampileup;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broad.tribble.FeatureCodec;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
|
||||||
|
public class AnnotatorInputTableCodec implements FeatureCodec<AnnotatorInputTableFeature> {
|
||||||
|
|
||||||
|
private static Logger logger = Logger.getLogger(AnnotatorInputTableCodec.class);
|
||||||
|
|
||||||
|
public static final String DELIMITER = "\t";
|
||||||
|
|
||||||
|
private ArrayList<String> header;
|
||||||
|
|
||||||
|
private File file;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We use this to parse out the header.
|
||||||
|
*
|
||||||
|
* @param f the file
|
||||||
|
*
|
||||||
|
* @return 0. Since we just read the header, the number of lines left to skip is 0.
|
||||||
|
*/
|
||||||
|
public int headerLineCount(File f) {
|
||||||
|
this.file = f;
|
||||||
|
|
||||||
|
int[] lineCounter = new int[1];
|
||||||
|
try {
|
||||||
|
|
||||||
|
header = readHeader(f, lineCounter);
|
||||||
|
} catch(IOException e) {
|
||||||
|
throw new IllegalArgumentException("Unable to read from file " + f, e);
|
||||||
|
}
|
||||||
|
return lineCounter[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses the line into an AnnotatorInputTableFeature object.
|
||||||
|
*
|
||||||
|
* @param line
|
||||||
|
*/
|
||||||
|
public AnnotatorInputTableFeature decode(String line) {
|
||||||
|
final ArrayList<String> header = this.header; //optimization
|
||||||
|
final ArrayList<String> values = Utils.split(line, DELIMITER, header.size());
|
||||||
|
|
||||||
|
//if ( values.size() > header.size()) {
|
||||||
|
// throw new CodecLineParsingException(String.format("Encountered a line within " + file + " that has %d columns which is > the number of columns in the header which has %d columns.\nHeader: " + header + "\nLine: " + values, values.size(), header.size()));
|
||||||
|
//}
|
||||||
|
|
||||||
|
final AnnotatorInputTableFeature feature = new AnnotatorInputTableFeature(header);
|
||||||
|
for ( int i = 0; i < header.size(); i++ ) {
|
||||||
|
feature.putColumnValue(header.get(i), values.get(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
final GenomeLoc loc = GenomeLocParser.parseGenomeLoc(values.get(0)); //GenomeLocParser.parseGenomeInterval(values.get(0)); - TODO switch to this
|
||||||
|
|
||||||
|
//parse the location
|
||||||
|
feature.setChr(loc.getContig());
|
||||||
|
feature.setStart((int) loc.getStart());
|
||||||
|
feature.setEnd((int) loc.getStop());
|
||||||
|
|
||||||
|
return feature;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the header.
|
||||||
|
* @param source
|
||||||
|
* @return
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public static ArrayList<String> readHeader(final File source) throws IOException {
|
||||||
|
return readHeader(source, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the header, and also sets the 2nd arg to the number of lines in the header.
|
||||||
|
* @param source
|
||||||
|
* @param lineCounter An array of length 1 or null. If not null, array[0] will be set to the number of lines in the header.
|
||||||
|
* @return The header fields.
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
private static ArrayList<String> readHeader(final File source, int[] lineCounter) throws IOException {
|
||||||
|
|
||||||
|
ArrayList<String> header = null;
|
||||||
|
int numLines = 0;
|
||||||
|
|
||||||
|
final XReadLines reader = new XReadLines(source);
|
||||||
|
try {
|
||||||
|
//find the 1st line that's non-empty and not a comment
|
||||||
|
for ( String line : reader ) {
|
||||||
|
numLines++;
|
||||||
|
line = line.trim();
|
||||||
|
if ( line.isEmpty() || line.startsWith("#") ) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
//parse the header
|
||||||
|
header = Utils.split(line, DELIMITER);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
// check that we found the header
|
||||||
|
if ( header == null ) {
|
||||||
|
throw new IllegalArgumentException("No header in " + source + ". All lines are either comments or empty.");
|
||||||
|
}
|
||||||
|
|
||||||
|
if(lineCounter != null) {
|
||||||
|
lineCounter[0] = numLines;
|
||||||
|
}
|
||||||
|
logger.info(String.format("Found header line containing %d columns:\n[%s]", header.size(), Utils.join("\t", header)));
|
||||||
|
|
||||||
|
return header;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,282 @@
|
||||||
|
package org.broadinstitute.sting.gatk.refdata.features.sampileup;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
|
||||||
|
import org.broad.tribble.Feature;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class represents a single record in an AnnotatorInputTable.
|
||||||
|
*/
|
||||||
|
public class AnnotatorInputTableFeature implements Feature {
|
||||||
|
|
||||||
|
private ArrayList<String> columnNames;
|
||||||
|
private HashMap<String, String> columnValues;
|
||||||
|
|
||||||
|
private String chr;
|
||||||
|
private int start;
|
||||||
|
private int end;
|
||||||
|
|
||||||
|
|
||||||
|
// Temporary attributes were added to make it easier to implement certain
|
||||||
|
// optimizations for RODs that span an interval. For example, if a Walker
|
||||||
|
// needs to do a time-consuming computation on data from a ROD, it would normally
|
||||||
|
// have to repeat this computation every time its map(..) method is called.
|
||||||
|
// If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
|
||||||
|
// However, many computations (including validation and parsing) are done per ROD rather than
|
||||||
|
// per position. Therefore, substantial optimizations are possible if the result
|
||||||
|
// of the first computation is cached and reused on subsequent map(..) calls.
|
||||||
|
// Temporary attributes provide a convenient place to store these results,
|
||||||
|
// freeing the Walkers from having to maintain their own ROD -> result hashmaps.
|
||||||
|
private Map<Object, Object> temporaryAttributes;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor.
|
||||||
|
* @param columnNames The column names as parsed out of the file header.
|
||||||
|
*/
|
||||||
|
public AnnotatorInputTableFeature(ArrayList<String> columnNames) {
|
||||||
|
this.columnNames = columnNames;
|
||||||
|
this.columnValues = new HashMap<String, String>();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the list of column names from the file header.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public ArrayList<String> getHeader() {
|
||||||
|
return columnNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the value of the given column.
|
||||||
|
*
|
||||||
|
* @param columnName The column name as it appears in the file header.
|
||||||
|
* @return The value
|
||||||
|
*/
|
||||||
|
public String getColumnValue(final Object columnName) {
|
||||||
|
return columnValues.get(columnName);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean containsColumnName(final Object columnName) {
|
||||||
|
return columnValues.containsKey(columnName);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the value for the given column.
|
||||||
|
*
|
||||||
|
* @param columnName The column name as it appears in the file header.
|
||||||
|
* @param value The value
|
||||||
|
* @return The existing value associated with the columnName, if there is one.
|
||||||
|
*/
|
||||||
|
protected String putColumnValue(final String columnName, final String value) {
|
||||||
|
return columnValues.put(columnName, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns all values in this line, hashed by their column names.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Map<String,String> getColumnValues() {
|
||||||
|
return Collections.unmodifiableMap(columnValues);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the entry set of all column name-value pairs.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Set<Entry<String, String>> getEntrySet() {
|
||||||
|
|
||||||
|
return columnValues.entrySet();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String getChr() {
|
||||||
|
return chr;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getStart() {
|
||||||
|
return start;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getEnd() {
|
||||||
|
return end;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setChr(String chr) {
|
||||||
|
this.chr = chr;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setStart(int start) {
|
||||||
|
this.start = start;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setEnd(int end) {
|
||||||
|
this.end = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// ROD accessors
|
||||||
|
//
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
public GenomeLoc getLocation() {
|
||||||
|
if ( loc != null )
|
||||||
|
return loc;
|
||||||
|
String s = get(header.get(0));
|
||||||
|
if ( s == null )
|
||||||
|
return null;
|
||||||
|
return GenomeLocParser.parseGenomeLoc(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getAttributeString() {
|
||||||
|
List<String> strings = new ArrayList<String>(columnValues.size());
|
||||||
|
for ( String key : header ) {
|
||||||
|
if ( containsKey(key) ) { // avoid the header
|
||||||
|
strings.add(this.get(key));
|
||||||
|
//System.out.printf("Adding %s%n", this.get(key));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Utils.join("\t", strings);
|
||||||
|
}
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// map functions
|
||||||
|
//
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
public int size() { return columnValues.size(); }
|
||||||
|
public boolean isEmpty() { return columnValues.isEmpty(); }
|
||||||
|
public boolean containsValue(Object o) { return columnValues.containsValue(o); }
|
||||||
|
public String remove(Object o) { return columnValues.remove(o); }
|
||||||
|
public void clear() { columnValues.clear(); }
|
||||||
|
public java.util.Set<String> keySet() { return columnValues.keySet(); }
|
||||||
|
public java.util.Collection<String> values() { return columnValues.values(); }
|
||||||
|
|
||||||
|
public void putAll(java.util.Map<? extends String, ? extends String> map) {
|
||||||
|
columnValues.putAll(map);
|
||||||
|
}
|
||||||
|
|
||||||
|
public java.util.Set<java.util.Map.Entry<String,String>> entrySet() {
|
||||||
|
return columnValues.entrySet();
|
||||||
|
}
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks whether an attribute has been set for the given key.
|
||||||
|
*
|
||||||
|
* Temporary attributes make it easier to implement certain
|
||||||
|
* optimizations for RODs that span an interval. For example, if a Walker
|
||||||
|
* needs to do a time-consuming computation on data from a ROD, it would normally
|
||||||
|
* have to repeat this computation every time its map(..) method is called.
|
||||||
|
* If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
|
||||||
|
* However, many computations (including validation and parsing) are done per ROD rather than
|
||||||
|
* per position. Therefore, substantial optimizations are possible if the result
|
||||||
|
* of the first computation is cached and reused on subsequent map(..) calls.
|
||||||
|
* Temporary attributes provide a convenient place to store these results,
|
||||||
|
* freeing the Walkers from having to maintain their own ROD -> result hashmaps.
|
||||||
|
*
|
||||||
|
* @param key key
|
||||||
|
* @return True if an attribute has been set for this key.
|
||||||
|
*/
|
||||||
|
public boolean containsTemporaryAttribute(Object key) {
|
||||||
|
if(temporaryAttributes != null) {
|
||||||
|
return temporaryAttributes.containsKey(key);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the key to the given value, replacing any previous value. The previous
|
||||||
|
* value is returned.
|
||||||
|
*
|
||||||
|
* Temporary attributes make it easier to implement certain
|
||||||
|
* optimizations for RODs that span an interval. For example, if a Walker
|
||||||
|
* needs to do a time-consuming computation on data from a ROD, it would normally
|
||||||
|
* have to repeat this computation every time its map(..) method is called.
|
||||||
|
* If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
|
||||||
|
* However, many computations (including validation and parsing) are done per ROD rather than
|
||||||
|
* per position. Therefore, substantial optimizations are possible if the result
|
||||||
|
* of the first computation is cached and reused on subsequent map(..) calls.
|
||||||
|
* Temporary attributes provide a convenient place to store these results,
|
||||||
|
* freeing the Walkers from having to maintain their own ROD -> result hashmaps.
|
||||||
|
*
|
||||||
|
* @param key key
|
||||||
|
* @param value value
|
||||||
|
* @return attribute
|
||||||
|
*/
|
||||||
|
public Object setTemporaryAttribute(Object key, Object value) {
|
||||||
|
if(temporaryAttributes == null) {
|
||||||
|
temporaryAttributes = new HashMap<Object, Object>();
|
||||||
|
}
|
||||||
|
return temporaryAttributes.put(key, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Looks up the value associated with the given key.
|
||||||
|
*
|
||||||
|
* Temporary attributes make it easier to implement certain
|
||||||
|
* optimizations for RODs that span an interval. For example, if a Walker
|
||||||
|
* needs to do a time-consuming computation on data from a ROD, it would normally
|
||||||
|
* have to repeat this computation every time its map(..) method is called.
|
||||||
|
* If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
|
||||||
|
* However, many computations (including validation and parsing) are done per ROD rather than
|
||||||
|
* per position. Therefore, substantial optimizations are possible if the result
|
||||||
|
* of the first computation is cached and reused on subsequent map(..) calls.
|
||||||
|
* Temporary attributes provide a convenient place to store these results,
|
||||||
|
* freeing the Walkers from having to maintain their own ROD -> result hashmaps.
|
||||||
|
*
|
||||||
|
* @param key key
|
||||||
|
* @return The value, or null.
|
||||||
|
*/
|
||||||
|
public Object getTemporaryAttribute(Object key) {
|
||||||
|
if(temporaryAttributes != null) {
|
||||||
|
return temporaryAttributes.get(key);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Removes the attribute that has the given key.
|
||||||
|
*
|
||||||
|
* Temporary attributes make it easier to implement certain
|
||||||
|
* optimizations for RODs that span an interval. For example, if a Walker
|
||||||
|
* needs to do a time-consuming computation on data from a ROD, it would normally
|
||||||
|
* have to repeat this computation every time its map(..) method is called.
|
||||||
|
* If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
|
||||||
|
* However, many computations (including validation and parsing) are done per ROD rather than
|
||||||
|
* per position. Therefore, substantial optimizations are possible if the result
|
||||||
|
* of the first computation is cached and reused on subsequent map(..) calls.
|
||||||
|
* Temporary attributes provide a convenient place to store these results,
|
||||||
|
* freeing the Walkers from having to maintain their own ROD -> result hashmaps.
|
||||||
|
*
|
||||||
|
* @param key key
|
||||||
|
* @return The value that was associated with this key, or null.
|
||||||
|
*/
|
||||||
|
public Object removeTemporaryAttribute(Object key) {
|
||||||
|
if(temporaryAttributes != null) {
|
||||||
|
return temporaryAttributes.remove(key);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue