Tribble integration for indexing the AnnotatorInputTable format

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3385 348d0f76-0448-11de-a6fe-93d51630548a
2010-05-19 03:37:54 +00:00 · 2010-05-19 03:37:54 +00:00 · 6b96f025f5
parent 2f3933148d
commit 6b96f025f5
2 changed files with 412 additions and 0 deletions
--- a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/AnnotatorInputTableCodec.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/AnnotatorInputTableCodec.java
@ -0,0 +1,130 @@
 package org.broadinstitute.sting.gatk.refdata.features.sampileup;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import org.apache.log4j.Logger;
 import org.broad.tribble.FeatureCodec;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.text.XReadLines;
 /**
 * Tribble codec that turns the lines of a tab-delimited AnnotatorInputTable file into
 * {@link AnnotatorInputTableFeature} objects.
 *
 * The column names come from the first line of the file that is neither empty nor a '#' comment.
 * {@link #headerLineCount(File)} must be called before {@link #decode(String)} so that the
 * header is available when data lines are parsed.
 */
 public class AnnotatorInputTableCodec implements FeatureCodec<AnnotatorInputTableFeature> {
    private static final Logger logger = Logger.getLogger(AnnotatorInputTableCodec.class);
    public static final String DELIMITER = "\t";
    // Column names parsed from the header line; null until headerLineCount(..) has been called.
    private ArrayList<String> header;
    // The file passed to headerLineCount(..); only used to make error messages more helpful.
    private File file;
    /**
     * Reads the header of the given file, remembers its column names for later calls to
     * {@link #decode(String)}, and reports how many lines the header section occupies.
     *
     * @param f the file
     *
     * @return The number of lines read from the start of the file up to and including the header line
     *         (leading empty lines and '#' comment lines are counted).
     * @throws IllegalArgumentException if the file can't be read or contains no header line.
     */
    public int headerLineCount(File f) {
        this.file = f;
        int[] lineCounter = new int[1];
        try {
            header = readHeader(f, lineCounter);
        } catch(IOException e) {
            throw new IllegalArgumentException("Unable to read from file " + f, e);
        }
        return lineCounter[0];
    }
    /**
     * Parses the line into an AnnotatorInputTableFeature object.
     *
     * Each value is stored under the column name at the same position in the header. The 1st value
     * is additionally parsed as a genomic location, which provides the feature's chr, start and end.
     * Any values beyond the header's column count are not stored.
     *
     * @param line a data line of the file
     * @return The feature representing this line.
     * @throws IllegalStateException if the header has not been read yet.
     * @throws IllegalArgumentException if the line has fewer columns than the header.
     */
    public AnnotatorInputTableFeature decode(String line) {
        final ArrayList<String> header = this.header; //optimization
        if ( header == null ) {
            throw new IllegalStateException("The header has not been read: headerLineCount(File) must be called before decode(String).");
        }
        final int numColumns = header.size();
        final ArrayList<String> values = Utils.split(line, DELIMITER, numColumns);
        // A short line would otherwise fail below with an IndexOutOfBoundsException that says nothing
        // about which file or line is at fault. Lines with extra columns are still accepted, as before.
        if ( values.size() < numColumns ) {
            throw new IllegalArgumentException(String.format(
                    "Encountered a line within %s that has %d columns which is < the number of columns in the header which has %d columns.\nHeader: %s\nLine: %s",
                    file, values.size(), numColumns, header, values));
        }
        final AnnotatorInputTableFeature feature = new AnnotatorInputTableFeature(header);
        for ( int i = 0; i < numColumns; i++ ) {
            feature.putColumnValue(header.get(i), values.get(i));
        }
        //parse the location
        final GenomeLoc loc = GenomeLocParser.parseGenomeLoc(values.get(0)); //GenomeLocParser.parseGenomeInterval(values.get(0)); - TODO switch to this
        feature.setChr(loc.getContig());
        feature.setStart((int) loc.getStart());
        feature.setEnd((int) loc.getStop());
        return feature;
    }
    /**
     * Returns the header.
     * @param source the file to read the header from
     * @return The header fields.
     * @throws IOException if the file can't be read.
     * @throws IllegalArgumentException if every line in the file is either empty or a comment.
     */
    public static ArrayList<String> readHeader(final File source) throws IOException {
        return readHeader(source, null);
    }
    /**
     * Returns the header, and also sets the 2nd arg to the number of lines in the header.
     * @param source the file to read the header from
     * @param lineCounter An array of length 1 or null. If not null, array[0] will be set to the number of lines in the header.
     * @return The header fields.
     * @throws IOException if the file can't be read.
     * @throws IllegalArgumentException if every line in the file is either empty or a comment.
     */
    private static ArrayList<String> readHeader(final File source, int[] lineCounter) throws IOException {
        ArrayList<String> header = null;
        int numLines = 0;
        final XReadLines reader = new XReadLines(source);
        try {
            //find the 1st line that's non-empty and not a comment
            for ( String line : reader ) {
                numLines++;
                line = line.trim();
                if ( line.isEmpty() || line.startsWith("#") ) {
                    continue;
                }
                //parse the header
                header = Utils.split(line, DELIMITER);
                break;
            }
        }
        finally {
            reader.close();
        }
        // check that we found the header
        if ( header == null ) {
            throw new IllegalArgumentException("No header in " + source + ". All lines are either comments or empty.");
        }
        if(lineCounter != null) {
            lineCounter[0] = numLines;
        }
        logger.info(String.format("Found header line containing %d columns:\n[%s]", header.size(), Utils.join("\t", header)));
        return header;
    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/AnnotatorInputTableFeature.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/AnnotatorInputTableFeature.java
@ -0,0 +1,282 @@
 package org.broadinstitute.sting.gatk.refdata.features.sampileup;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 import java.util.Map.Entry;
 import org.broad.tribble.Feature;
 /**
 * This class represents a single record in an AnnotatorInputTable.
 */
 public class AnnotatorInputTableFeature implements Feature {
    // The column names from the file header. This is the list instance passed to the constructor, not a copy.
    private final ArrayList<String> columnNames;
    // The values in this record, keyed by column name.
    private final HashMap<String, String> columnValues;
    private String chr;
    private int start;
    private int end;
    // Temporary attributes were added to make it easier to implement certain
    // optimizations for RODs that span an interval. For example, if a Walker
    // needs to do a time-consuming computation on data from a ROD, it would normally
    // have to repeat this computation every time its map(..) method is called.
    // If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
    // However, many computations (including validation and parsing) are done per ROD rather than
    // per position. Therefore, substantial optimizations are possible if the result
    // of the first computation is cached and reused on subsequent map(..) calls.
    // Temporary attributes provide a convenient place to store these results,
    // freeing the Walkers from having to maintain their own ROD -> result hashmaps.
    //
    // The map is created lazily by setTemporaryAttribute(..), so it is null until the 1st attribute is set.
    private Map<Object, Object> temporaryAttributes;
    /**
     * Constructor.
     * @param columnNames  The column names as parsed out of the file header.
     */
    public AnnotatorInputTableFeature(ArrayList<String> columnNames) {
        this.columnNames = columnNames;
        this.columnValues = new HashMap<String, String>();
    }
    /**
     * Returns the list of column names from the file header.
     * @return The list that was passed to the constructor (not a copy).
     */
    public ArrayList<String> getHeader() {
        return columnNames;
    }
    /**
     * Returns the value of the given column.
     *
     * @param columnName The column name as it appears in the file header.
     * @return The value, or null if no value has been set for this column.
     */
    public String getColumnValue(final Object columnName) {
        return columnValues.get(columnName);
    }
    /**
     * Checks whether a value has been set for the given column.
     *
     * @param columnName The column name as it appears in the file header.
     * @return True if a value has been set for this column.
     */
    public boolean containsColumnName(final Object columnName) {
        return columnValues.containsKey(columnName);
    }
    /**
     * Sets the value for the given column.
     *
     * @param columnName The column name as it appears in the file header.
     * @param value The value
     * @return The existing value associated with the columnName, if there is one.
     */
    protected String putColumnValue(final String columnName, final String value) {
        return columnValues.put(columnName, value);
    }
    /**
     * Returns all values in this line, hashed by their column names.
     *
     * @return An unmodifiable view of the column name -> value map.
     */
    public Map<String,String> getColumnValues() {
        return Collections.unmodifiableMap(columnValues);
    }
    /**
     * Returns the entry set of all column name-value pairs.
     *
     * The set is read-only, like the map returned by getColumnValues(). Values can only be
     * changed through putColumnValue(..).
     *
     * @return An unmodifiable view of the column name-value pairs.
     */
    public Set<Entry<String, String>> getEntrySet() {
        return Collections.unmodifiableMap(columnValues).entrySet();
    }
    /**
     * @return The contig name of this record's location.
     */
    public String getChr() {
        return chr;
    }
    /**
     * @return The start position of this record's location.
     */
    public int getStart() {
        return start;
    }
    /**
     * @return The end position of this record's location.
     */
    public int getEnd() {
        return end;
    }
    protected void setChr(String chr) {
        this.chr = chr;
    }
    protected void setStart(int start) {
        this.start = start;
    }
    protected void setEnd(int end) {
        this.end = end;
    }
    /**
     * Checks whether an attribute has been set for the given key.
     *
     * Temporary attributes give Walkers a place to cache the result of a per-ROD computation
     * (such as validation or parsing), so that it doesn't have to be repeated on every map(..)
     * call for a ROD that spans an interval.
     *
     * @param key key
     * @return True if an attribute has been set for this key.
     */
    public boolean containsTemporaryAttribute(Object key) {
        if(temporaryAttributes != null) {
            return temporaryAttributes.containsKey(key);
        }
        return false;
    }
    /**
     * Sets the key to the given value, replacing any previous value. The previous
     * value is returned.
     *
     * Temporary attributes give Walkers a place to cache the result of a per-ROD computation
     * (such as validation or parsing), so that it doesn't have to be repeated on every map(..)
     * call for a ROD that spans an interval.
     *
     * @param key    key
     * @param value  value
     * @return The value that was previously associated with this key, or null.
     */
    public Object setTemporaryAttribute(Object key, Object value) {
        if(temporaryAttributes == null) {
            temporaryAttributes = new HashMap<Object, Object>();
        }
        return temporaryAttributes.put(key, value);
    }
    /**
     * Looks up the value associated with the given key.
     *
     * Temporary attributes give Walkers a place to cache the result of a per-ROD computation
     * (such as validation or parsing), so that it doesn't have to be repeated on every map(..)
     * call for a ROD that spans an interval.
     *
     * @param key key
     * @return The value, or null.
     */
    public Object getTemporaryAttribute(Object key) {
        if(temporaryAttributes != null) {
            return temporaryAttributes.get(key);
        }
        return null;
    }
    /**
     * Removes the attribute that has the given key.
     *
     * Temporary attributes give Walkers a place to cache the result of a per-ROD computation
     * (such as validation or parsing), so that it doesn't have to be repeated on every map(..)
     * call for a ROD that spans an interval.
     *
     * @param key key
     * @return The value that was associated with this key, or null.
     */
    public Object removeTemporaryAttribute(Object key) {
        if(temporaryAttributes != null) {
            return temporaryAttributes.remove(key);
        }
        return null;
    }
 }