From 64ed770250ec1a5ad3dc7b8f30b40cdb1d23cb35 Mon Sep 17 00:00:00 2001
From: weisburd
Date: Mon, 24 May 2010 14:36:28 +0000
Subject: [PATCH] Moved AnnotatorInputTableFeature and Codec to org.broadinstitute.sting.gatk.refdata.features.annotator

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3424 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../annotator/AnnotatorInputTableCodec.java   | 166 +++++++++++
 .../annotator/AnnotatorInputTableFeature.java | 260 ++++++++++++++++++
 .../gatk/walkers/annotator/AminoAcid.java     |  25 ++
 .../walkers/annotator/GenomicAnnotation.java  |  27 +-
 4 files changed, 477 insertions(+), 1 deletion(-)
 create mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java
 create mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java

diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java
new file mode 100755
index 000000000..3fb89da50
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.refdata.features.annotator;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.log4j.Logger;
+import org.broad.tribble.FeatureCodec;
+import org.broad.tribble.util.AsciiLineReader;
+import org.broad.tribble.util.LineReader;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.Utils;
+
+/**
+ * Tribble codec that decodes tab-delimited annotator input tables into
+ * {@link AnnotatorInputTableFeature} records.
+ *
+ * The header is the first line that is neither empty nor a '#' comment; each
+ * data line is decoded against the column names found there.
+ */
+public class AnnotatorInputTableCodec implements FeatureCodec {
+
+    private static final Logger logger = Logger.getLogger(AnnotatorInputTableCodec.class);
+
+    public static final String DELIMITER = "\t";
+
+    /** Column names set by {@link #readHeader(LineReader)}; null until then. */
+    private ArrayList<String> header;
+
+    /**
+     * Parses the header and remembers its column names for later decode(..) calls.
+     *
+     * @param reader source of the table's lines
+     * @return The # of lines read, up to and including the header line.
+     * @throws IllegalArgumentException if reading fails or no header line is found
+     */
+    public int readHeader(LineReader reader) {
+        final int[] lineCounter = new int[1];
+        try {
+            header = readHeader(reader, lineCounter);
+        } catch (IOException e) {
+            throw new IllegalArgumentException("Unable to read from file.", e);
+        }
+        return lineCounter[0];
+    }
+
+    /**
+     * Parses the line into an AnnotatorInputTableFeature object.
+     *
+     * The first column is parsed as the record's genomic location. Columns
+     * beyond those named in the header are ignored.
+     *
+     * @param line a single data line, with columns separated by DELIMITER
+     * @return the decoded feature
+     * @throws IllegalStateException if the header has not been read yet
+     * @throws IllegalArgumentException if the line has fewer columns than the header
+     */
+    public AnnotatorInputTableFeature decode(String line) {
+        final ArrayList<String> header = this.header; // read the field once
+        if (header == null) {
+            throw new IllegalStateException("readHeader(..) must be called before decode(..).");
+        }
+
+        // Fail with a descriptive message instead of an IndexOutOfBoundsException below.
+        final ArrayList<String> values = Utils.split(line, DELIMITER, header.size());
+        if (values.size() < header.size()) {
+            throw new IllegalArgumentException(String.format(
+                    "Encountered a line that has %d columns which is < the number of columns in the header which has %d columns.\nHeader: %s\nLine: %s",
+                    values.size(), header.size(), header, values));
+        }
+
+        final AnnotatorInputTableFeature feature = new AnnotatorInputTableFeature(header);
+        for (int i = 0; i < header.size(); i++) {
+            feature.putColumnValue(header.get(i), values.get(i));
+        }
+
+        // TODO switch to GenomeLocParser.parseGenomeInterval(values.get(0))
+        final GenomeLoc loc = GenomeLocParser.parseGenomeLoc(values.get(0));
+        feature.setChr(loc.getContig());
+        feature.setStart((int) loc.getStart());
+        feature.setEnd((int) loc.getStop());
+
+        return feature;
+    }
+
+    /**
+     * Returns the header of the given table file.
+     *
+     * @param source the table file
+     * @return The header fields.
+     * @throws IOException if the file cannot be opened or read
+     */
+    public static ArrayList<String> readHeader(final File source) throws IOException {
+        final FileInputStream is = new FileInputStream(source);
+        try {
+            return readHeader(new AsciiLineReader(is), null);
+        } finally {
+            is.close();
+        }
+    }
+
+    /**
+     * Finds the header, and also reports how many lines were read to reach it.
+     *
+     * @param source line source to read from
+     * @param lineCounter An array of length 1 or null. If not null, array[0] will be set
+     *        to the number of lines read, up to and including the header line.
+     * @return The header fields.
+     * @throws IOException if reading from source fails
+     * @throws IllegalArgumentException if all lines are either comments or empty
+     */
+    private static ArrayList<String> readHeader(final LineReader source, int[] lineCounter) throws IOException {
+        ArrayList<String> header = null;
+        int numLines = 0;
+
+        // find the 1st line that's non-empty and not a comment
+        String line;
+        while ((line = source.readLine()) != null) {
+            numLines++;
+            line = line.trim();
+            if (line.isEmpty() || line.startsWith("#")) {
+                continue;
+            }
+            header = Utils.split(line, DELIMITER);
+            break;
+        }
+
+        if (header == null) {
+            throw new IllegalArgumentException("No header in " + source + ". All lines are either comments or empty.");
+        }
+
+        if (lineCounter != null) {
+            lineCounter[0] = numLines;
+        }
+        logger.info(String.format("Found header line containing %d columns:\n[%s]", header.size(), Utils.join("\t", header)));
+
+        return header;
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java
new file mode 100755
index 000000000..725014d3b
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.refdata.features.annotator;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.broad.tribble.Feature;
+
+/**
+ * This class represents a single record in an AnnotatorInputTable.
+ *
+ * A record holds the column values of one table line, keyed by column name,
+ * together with the location (chr, start, end) assigned to it.
+ *
+ * Temporary attributes make it easier to implement certain optimizations for
+ * RODs that span an interval. For example, if a Walker needs to do a
+ * time-consuming computation on data from a ROD, it would normally have to
+ * repeat this computation every time its map(..) method is called. If a ROD
+ * spans an interval, the Walker's map(..) method will be called for every
+ * position in the ROD. However, many computations (including validation and
+ * parsing) are done per ROD rather than per position. Therefore, substantial
+ * optimizations are possible if the result of the first computation is cached
+ * and reused on subsequent map(..) calls. Temporary attributes provide a
+ * convenient place to store these results, freeing the Walkers from having to
+ * maintain their own ROD -> result hashmaps.
+ */
+public class AnnotatorInputTableFeature implements Feature {
+
+    /** Column names, as passed to the constructor. */
+    private final ArrayList<String> columnNames;
+
+    /** Values of this record, keyed by column name. */
+    private final HashMap<String, String> columnValues;
+
+    /** Location of this record; see getChr(), getStart() and getEnd(). */
+    private String chr;
+    private int start;
+    private int end;
+
+    /**
+     * Cache for per-record results (see the class description). Created on the
+     * first setTemporaryAttribute(..) call; null until then.
+     */
+    private Map<Object, Object> temporaryAttributes;
+
+    /**
+     * Constructor.
+     *
+     * @param columnNames The column names as parsed out of the file header.
+     */
+    public AnnotatorInputTableFeature(ArrayList<String> columnNames) {
+        this.columnNames = columnNames;
+        this.columnValues = new HashMap<String, String>();
+    }
+
+
+    /**
+     * Returns the list of column names from the file header.
+     *
+     * @return The list that was passed to the constructor (not a copy).
+     */
+    public ArrayList<String> getHeader() {
+        return columnNames;
+    }
+
+    /**
+     * Returns the value of the given column.
+     *
+     * @param columnName The column name as it appears in the file header.
+     * @return The value, or null if this record has no value for the column.
+     */
+    public String getColumnValue(final Object columnName) {
+        return columnValues.get(columnName);
+    }
+
+    /**
+     * Checks whether a value has been set for the given column.
+     *
+     * @param columnName The column name as it appears in the file header.
+     * @return True if this record has an entry for the column.
+     */
+    public boolean containsColumnName(final Object columnName) {
+        return columnValues.containsKey(columnName);
+    }
+
+    /**
+     * Sets the value for the given column.
+     *
+     * @param columnName The column name as it appears in the file header.
+     * @param value The value
+     * @return The existing value associated with the columnName, if there is one.
+     */
+    protected String putColumnValue(final String columnName, final String value) {
+        return columnValues.put(columnName, value);
+    }
+
+    /**
+     * Returns all values in this line, hashed by their column names.
+     *
+     * @return A read-only view of the column name -> value map.
+     */
+    public Map<String, String> getColumnValues() {
+        return Collections.unmodifiableMap(columnValues);
+    }
+
+    /**
+     * Returns the entry set of all column name-value pairs.
+     *
+     * The set is read-only, like the map returned by getColumnValues(), so
+     * callers cannot change this record's values through it.
+     *
+     * @return A read-only view of the column name-value pairs.
+     */
+    public Set<Entry<String, String>> getEntrySet() {
+        return getColumnValues().entrySet();
+    }
+
+
+    /**
+     * Returns the chromosome (contig) name of this record's location.
+     *
+     * @return The value last passed to setChr(..), or null if none was set.
+     */
+    public String getChr() {
+        return chr;
+    }
+
+    /**
+     * Returns the start position of this record's location.
+     *
+     * @return The value last passed to setStart(..), or 0 if none was set.
+     */
+    public int getStart() {
+        return start;
+    }
+
+    /**
+     * Returns the end position of this record's location.
+     *
+     * @return The value last passed to setEnd(..), or 0 if none was set.
+     */
+    public int getEnd() {
+        return end;
+    }
+
+    /**
+     * Sets the chromosome (contig) name of this record's location.
+     *
+     * @param chr The name
+     */
+    protected void setChr(String chr) {
+        this.chr = chr;
+    }
+
+    /**
+     * Sets the start position of this record's location.
+     *
+     * @param start The start position
+     */
+    protected void setStart(int start) {
+        this.start = start;
+    }
+
+    /**
+     * Sets the end position of this record's location.
+     *
+     * @param end The end position
+     */
+    protected void setEnd(int end) {
+        this.end = end;
+    }
+
+
+    /**
+     * Checks whether an attribute has been set for the given key.
+     * See the class description for what temporary attributes are for.
+     *
+     * @param key key
+     * @return True if an attribute has been set for this key.
+     */
+    public boolean containsTemporaryAttribute(Object key) {
+        if (temporaryAttributes != null) {
+            return temporaryAttributes.containsKey(key);
+        }
+        return false;
+    }
+
+    /**
+     * Sets the key to the given value, replacing any previous value. The previous
+     * value is returned.
+     * See the class description for what temporary attributes are for.
+     *
+     * @param key key
+     * @param value value
+     * @return The value previously associated with this key, or null.
+     */
+    public Object setTemporaryAttribute(Object key, Object value) {
+        if (temporaryAttributes == null) {
+            // allocated on first use, so records without attributes hold no map
+            temporaryAttributes = new HashMap<Object, Object>();
+        }
+        return temporaryAttributes.put(key, value);
+    }
+
+    /**
+     * Looks up the value associated with the given key.
+     * See the class description for what temporary attributes are for.
+     *
+     * @param key key
+     * @return The value, or null.
+     */
+    public Object getTemporaryAttribute(Object key) {
+        if (temporaryAttributes != null) {
+            return temporaryAttributes.get(key);
+        }
+        return null;
+    }
+
+    /**
+     * Removes the attribute that has the given key.
+     * See the class description for what temporary attributes are for.
+     *
+     * @param key key
+     * @return The value that was associated with this key, or null.
+     */
+    public Object removeTemporaryAttribute(Object key) {
+        if (temporaryAttributes != null) {
+            return temporaryAttributes.remove(key);
+        }
+        return null;
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcid.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcid.java
index 98c9415d4..76ff3f438 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcid.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/AminoAcid.java
@@ -1,3 +1,28 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
 package org.broadinstitute.sting.playground.gatk.walkers.annotator;
 
 /**
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java
index 5290bd8d3..5cfd55d1d 100644
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java
@@ -1,3 +1,28 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
 package org.broadinstitute.sting.playground.gatk.walkers.annotator;
 
 import java.util.HashMap;
@@ -14,7 +39,7 @@ import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
 import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.sampileup.AnnotatorInputTableFeature;
+import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature;
 import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
 import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;