diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/AnnotatorInputTableCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/AnnotatorInputTableCodec.java
new file mode 100755
index 000000000..e1691a402
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/AnnotatorInputTableCodec.java
@@ -0,0 +1,130 @@
+package org.broadinstitute.sting.gatk.refdata.features.sampileup;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.log4j.Logger;
+import org.broad.tribble.FeatureCodec;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.text.XReadLines;
+
+public class AnnotatorInputTableCodec implements FeatureCodec {
+
+    private static final Logger logger = Logger.getLogger(AnnotatorInputTableCodec.class);
+
+    public static final String DELIMITER = "\t";
+
+    private ArrayList<String> header;
+
+    private File file;
+
+    /**
+     * We use this to parse out the header, which is cached (together with the file) for subsequent decode(..) calls.
+     *
+     * @param f the file
+     *
+     * @return The number of lines read from the start of the file up to and including the header line.
+     */
+    public int headerLineCount(File f) {
+        this.file = f;
+
+        int[] lineCounter = new int[1];
+        try {
+
+            header = readHeader(f, lineCounter);
+        } catch(IOException e) {
+            throw new IllegalArgumentException("Unable to read from file " + f, e);
+        }
+        return lineCounter[0];
+    }
+
+
+    /**
+     * Parses the line into an AnnotatorInputTableFeature object. The header must already have been read via headerLineCount(File).
+     *
+     * @param line A DELIMITER-separated data line with at least as many columns as the header; column 0 is parsed as the genomic location.
+     */
+    public AnnotatorInputTableFeature decode(String line) {
+        final ArrayList<String> header = this.header; //optimization
+        final ArrayList<String> values = Utils.split(line, DELIMITER, header.size());
+
+        if ( values.size() < header.size() ) { // fail with context instead of an IndexOutOfBoundsException in the loop below
+            throw new IllegalArgumentException(String.format("Encountered a line within %s that has %d columns which is < the number of columns in the header which has %d columns.\nHeader: %s\nLine: %s", file, values.size(), header.size(), header, values));
+        }
+
+        final AnnotatorInputTableFeature feature = new AnnotatorInputTableFeature(header);
+        for ( int i = 0; i < header.size(); i++ ) {
+            feature.putColumnValue(header.get(i), values.get(i));
+        }
+
+        final GenomeLoc loc = GenomeLocParser.parseGenomeLoc(values.get(0)); //GenomeLocParser.parseGenomeInterval(values.get(0)); - TODO switch to this
+
+        //parse the location
+        feature.setChr(loc.getContig());
+        feature.setStart((int) loc.getStart());
+        feature.setEnd((int) loc.getStop());
+
+        return feature;
+    }
+
+
+
+    /**
+     * Returns the header.
+     * @param source The file to read the header from.
+     * @return The fields of the first line that is neither empty nor starts with '#'.
+     * @throws IOException if reading the file fails
+     */
+    public static ArrayList<String> readHeader(final File source) throws IOException {
+        return readHeader(source, null);
+    }
+
+
+    /**
+     * Returns the header, and also sets the 2nd arg to the number of lines read up to and including the header line.
+     * @param source The file to read the header from.
+     * @param lineCounter An array of length 1 or null. If not null, array[0] will be set to the number of lines read through the header line.
+     * @return The header fields. Never null: an IllegalArgumentException is thrown when every line is empty or a comment.
+     * @throws IOException if reading the file fails
+     */
+    private static ArrayList<String> readHeader(final File source, int[] lineCounter) throws IOException {
+
+        ArrayList<String> header = null;
+        int numLines = 0;
+
+        final XReadLines reader = new XReadLines(source);
+        try {
+            //find the 1st line that's non-empty and not a comment
+            for ( String line : reader ) {
+                numLines++;
+                line = line.trim();
+                if ( line.isEmpty() || line.startsWith("#") ) {
+                    continue;
+                }
+
+                //parse the header
+                header = Utils.split(line, DELIMITER);
+                break;
+            }
+        }
+        finally {
+            reader.close();
+        }
+
+        // check that we found the header
+        if ( header == null ) {
+            throw new IllegalArgumentException("No header in " + source + ". All lines are either comments or empty.");
+        }
+
+        if(lineCounter != null) {
+            lineCounter[0] = numLines;
+        }
+        logger.info(String.format("Found header line containing %d columns:\n[%s]", header.size(), Utils.join("\t", header)));
+
+        return header;
+    }
+
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/AnnotatorInputTableFeature.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/AnnotatorInputTableFeature.java
new file mode 100755
index 000000000..b7f52848d
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/AnnotatorInputTableFeature.java
@@ -0,0 +1,282 @@
+package org.broadinstitute.sting.gatk.refdata.features.sampileup;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.broad.tribble.Feature;
+
+/**
+ * This class represents a single record in an AnnotatorInputTable.
+ */
+public class AnnotatorInputTableFeature implements Feature {
+
+    private final ArrayList<String> columnNames;
+    private final HashMap<String, String> columnValues;
+
+    private String chr;
+    private int start;
+    private int end;
+
+
+    // Temporary attributes were added to make it easier to implement certain
+    // optimizations for RODs that span an interval. For example, if a Walker
+    // needs to do a time-consuming computation on data from a ROD, it would normally
+    // have to repeat this computation every time its map(..) method is called.
+    // If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
+    // However, many computations (including validation and parsing) are done per ROD rather than
+    // per position. Therefore, substantial optimizations are possible if the result
+    // of the first computation is cached and reused on subsequent map(..) calls.
+    // Temporary attributes provide a convenient place to store these results,
+    // freeing the Walkers from having to maintain their own ROD -> result hashmaps.
+    private Map<Object, Object> temporaryAttributes;
+
+
+
+
+    /**
+     * Constructor.
+     * @param columnNames The column names as parsed out of the file header. The list is stored as-is, not copied.
+     */
+    public AnnotatorInputTableFeature(ArrayList<String> columnNames) {
+        this.columnNames = columnNames;
+        this.columnValues = new HashMap<String, String>();
+    }
+
+
+
+    /**
+     * Returns the list of column names from the file header.
+     * @return The same list instance that was passed to the constructor (not a copy).
+     */
+    public ArrayList<String> getHeader() {
+        return columnNames;
+    }
+
+
+    /**
+     * Returns the value of the given column.
+     *
+     * @param columnName The column name as it appears in the file header.
+     * @return The value, or null if no value has been stored under this column name.
+     */
+    public String getColumnValue(final Object columnName) {
+        return columnValues.get(columnName);
+    }
+
+
+    public boolean containsColumnName(final Object columnName) {
+        return columnValues.containsKey(columnName);
+    }
+
+
+    /**
+     * Sets the value for the given column.
+     *
+     * @param columnName The column name as it appears in the file header.
+     * @param value The value
+     * @return The existing value associated with the columnName, if there is one; otherwise null.
+     */
+    protected String putColumnValue(final String columnName, final String value) {
+        return columnValues.put(columnName, value);
+    }
+
+    /**
+     * Returns all values in this line, hashed by their column names.
+     *
+     * @return An unmodifiable view of the column name -> value map.
+     */
+    public Map<String, String> getColumnValues() {
+        return Collections.unmodifiableMap(columnValues);
+    }
+
+
+    /**
+     * Returns the entry set of all column name-value pairs.
+     *
+     * @return The entry set of the underlying map itself (a live view; unlike getColumnValues() it is not wrapped as unmodifiable).
+     */
+    public Set<Entry<String, String>> getEntrySet() {
+
+        return columnValues.entrySet();
+    }
+
+
+    public String getChr() {
+        return chr;
+    }
+
+    public int getStart() {
+        return start;
+    }
+
+    public int getEnd() {
+        return end;
+    }
+
+    protected void setChr(String chr) {
+        this.chr = chr;
+    }
+
+    protected void setStart(int start) {
+        this.start = start;
+    }
+
+    protected void setEnd(int end) {
+        this.end = end;
+    }
+
+    /*
+    // ----------------------------------------------------------------------
+    //
+    // ROD accessors
+    //
+    // ----------------------------------------------------------------------
+    public GenomeLoc getLocation() {
+        if ( loc != null )
+            return loc;
+        String s = get(header.get(0));
+        if ( s == null )
+            return null;
+        return GenomeLocParser.parseGenomeLoc(s);
+    }
+
+    public String getAttributeString() {
+        List strings = new ArrayList(columnValues.size());
+        for ( String key : header ) {
+            if ( containsKey(key) ) { // avoid the header
+                strings.add(this.get(key));
+                //System.out.printf("Adding %s%n", this.get(key));
+            }
+        }
+        return Utils.join("\t", strings);
+    }
+    // ----------------------------------------------------------------------
+    //
+    // map functions
+    //
+    // ----------------------------------------------------------------------
+    public int size() { return columnValues.size(); }
+    public boolean isEmpty() { return columnValues.isEmpty(); }
+    public boolean containsValue(Object o) { return columnValues.containsValue(o); }
+    public String remove(Object o) { return columnValues.remove(o); }
+    public void clear() { columnValues.clear(); }
+    public java.util.Set keySet() { return columnValues.keySet(); }
+    public java.util.Collection values() { return columnValues.values(); }
+
+    public void putAll(java.util.Map map) {
+        columnValues.putAll(map);
+    }
+
+    public java.util.Set<java.util.Map.Entry<String, String>> entrySet() {
+        return columnValues.entrySet();
+    }
+
+    */
+
+    /**
+     * Checks whether an attribute has been set for the given key.
+     *
+     * Temporary attributes make it easier to implement certain
+     * optimizations for RODs that span an interval. For example, if a Walker
+     * needs to do a time-consuming computation on data from a ROD, it would normally
+     * have to repeat this computation every time its map(..) method is called.
+     * If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
+     * However, many computations (including validation and parsing) are done per ROD rather than
+     * per position. Therefore, substantial optimizations are possible if the result
+     * of the first computation is cached and reused on subsequent map(..) calls.
+     * Temporary attributes provide a convenient place to store these results,
+     * freeing the Walkers from having to maintain their own ROD -> result hashmaps.
+     *
+     * @param key key
+     * @return True if an attribute has been set for this key.
+     */
+    public boolean containsTemporaryAttribute(Object key) {
+        if(temporaryAttributes != null) {
+            return temporaryAttributes.containsKey(key);
+        }
+        return false;
+    }
+
+    /**
+     * Sets the key to the given value, replacing any previous value. The previous
+     * value is returned. The backing map is created lazily on the first call.
+     *
+     * Temporary attributes make it easier to implement certain
+     * optimizations for RODs that span an interval. For example, if a Walker
+     * needs to do a time-consuming computation on data from a ROD, it would normally
+     * have to repeat this computation every time its map(..) method is called.
+     * If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
+     * However, many computations (including validation and parsing) are done per ROD rather than
+     * per position. Therefore, substantial optimizations are possible if the result
+     * of the first computation is cached and reused on subsequent map(..) calls.
+     * Temporary attributes provide a convenient place to store these results,
+     * freeing the Walkers from having to maintain their own ROD -> result hashmaps.
+     *
+     * @param key key
+     * @param value value
+     * @return The value previously associated with this key, or null if there was none.
+     */
+    public Object setTemporaryAttribute(Object key, Object value) {
+        if(temporaryAttributes == null) {
+            temporaryAttributes = new HashMap<Object, Object>();
+        }
+        return temporaryAttributes.put(key, value);
+    }
+
+    /**
+     * Looks up the value associated with the given key.
+     *
+     * Temporary attributes make it easier to implement certain
+     * optimizations for RODs that span an interval. For example, if a Walker
+     * needs to do a time-consuming computation on data from a ROD, it would normally
+     * have to repeat this computation every time its map(..) method is called.
+     * If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
+     * However, many computations (including validation and parsing) are done per ROD rather than
+     * per position. Therefore, substantial optimizations are possible if the result
+     * of the first computation is cached and reused on subsequent map(..) calls.
+     * Temporary attributes provide a convenient place to store these results,
+     * freeing the Walkers from having to maintain their own ROD -> result hashmaps.
+     *
+     * @param key key
+     * @return The value, or null.
+     */
+    public Object getTemporaryAttribute(Object key) {
+        if(temporaryAttributes != null) {
+            return temporaryAttributes.get(key);
+        }
+        return null;
+    }
+
+    /**
+     * Removes the attribute that has the given key.
+     *
+     * Temporary attributes make it easier to implement certain
+     * optimizations for RODs that span an interval. For example, if a Walker
+     * needs to do a time-consuming computation on data from a ROD, it would normally
+     * have to repeat this computation every time its map(..) method is called.
+     * If a ROD spans an interval, the Walker's map(..) method will be called for every position in ROD.
+     * However, many computations (including validation and parsing) are done per ROD rather than
+     * per position. Therefore, substantial optimizations are possible if the result
+     * of the first computation is cached and reused on subsequent map(..) calls.
+     * Temporary attributes provide a convenient place to store these results,
+     * freeing the Walkers from having to maintain their own ROD -> result hashmaps.
+     *
+     * @param key key
+     * @return The value that was associated with this key, or null.
+     */
+    public Object removeTemporaryAttribute(Object key) {
+        if(temporaryAttributes != null) {
+            return temporaryAttributes.remove(key);
+        }
+        return null;
+    }
+
+
+
+
+
+}