From a29554e56573c587a0248d726a89c20bf204fc88 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 25 Jul 2011 15:10:25 -0400 Subject: [PATCH 1/8] Removing the Genomic Annotator and its supporting classes --- .../annotator/AnnotatorInputTableCodec.java | 193 --- .../annotator/AnnotatorInputTableFeature.java | 158 --- .../walkers/annotator/VariantAnnotator.java | 2 +- .../annotator/VariantAnnotatorEngine.java | 106 +- .../genomicannotator/GenomicAnnotation.java | 299 ----- .../genomicannotator/GenomicAnnotator.java | 287 ----- .../annotator/genomicannotator/JoinTable.java | 226 ---- .../genomicannotator/JoinTableParser.java | 131 --- .../TranscriptToGenomicInfo.java | 1032 ----------------- .../genotyper/UnifiedGenotyperEngine.java | 5 +- .../walkers/phasing/AnnotateMNPsWalker.java | 890 -------------- .../genomicannotator => utils}/AminoAcid.java | 2 +- .../AminoAcidTable.java | 2 +- .../GenomicAnnotatorIntegrationTest.java | 83 -- 14 files changed, 45 insertions(+), 3371 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java rename public/java/src/org/broadinstitute/sting/{gatk/walkers/annotator/genomicannotator => utils}/AminoAcid.java (97%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/annotator/genomicannotator => utils}/AminoAcidTable.java (99%) delete mode 100755 public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java deleted file mode 100755 index 6bba754be..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.features.annotator; - -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broad.tribble.exception.CodecLineParsingException; -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.StringTokenizer; - -public class AnnotatorInputTableCodec implements ReferenceDependentFeatureCodec { - - private static Logger logger = Logger.getLogger(AnnotatorInputTableCodec.class); - - public static final String DELIMITER = "\t"; - - private ArrayList header; - - /** - * The parser to use when resolving genome-wide locations. - */ - private GenomeLocParser genomeLocParser; - - /** - * Set the parser to use when resolving genetic data. - * @param genomeLocParser The supplied parser. - */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - /** - * Parses the header. - * - * @param reader - * - * @return The # of header lines for this file. - */ - public Object readHeader(LineReader reader) - { - int[] lineCounter = new int[1]; - try { - header = readHeader(reader, lineCounter); - } catch(IOException e) { - throw new IllegalArgumentException("Unable to read from file.", e); - } - return header; - } - - public Class getFeatureType() { - return AnnotatorInputTableFeature.class; - } - - @Override - public Feature decodeLoc(String line) { - StringTokenizer st = new StringTokenizer(line, DELIMITER); - if ( st.countTokens() < 1 ) - throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line); - - GenomeLoc loc; - String chr = st.nextToken(); - if ( chr.indexOf(":") != -1 ) { - loc = genomeLocParser.parseGenomeLoc(chr); - } else { - if ( st.countTokens() < 3 ) - throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line); - loc = genomeLocParser.createGenomeLoc(chr, Integer.valueOf(st.nextToken()), Integer.valueOf(st.nextToken())); - } - return new AnnotatorInputTableFeature(loc.getContig(), loc.getStart(), loc.getStop()); - } - - - /** - * Parses the line into an AnnotatorInputTableFeature object. - * - * @param line - */ - public AnnotatorInputTableFeature decode(String line) { - final ArrayList header = this.header; //optimization - final ArrayList values = Utils.split(line, DELIMITER, header.size()); - - if ( values.size() != header.size()) { - throw new CodecLineParsingException(String.format("Encountered a line that has %d columns while the header has %d columns.\nHeader: " + header + "\nLine: " + values, values.size(), header.size())); - } - - final AnnotatorInputTableFeature feature = new AnnotatorInputTableFeature(header); - for ( int i = 0; i < header.size(); i++ ) { - feature.putColumnValue(header.get(i), values.get(i)); - } - - GenomeLoc loc; - if ( values.get(0).indexOf(":") != -1 ) - loc = genomeLocParser.parseGenomeLoc(values.get(0)); - else - loc = genomeLocParser.createGenomeLoc(values.get(0), Integer.valueOf(values.get(1)), Integer.valueOf(values.get(2))); - - //parse the location - feature.setChr(loc.getContig()); - feature.setStart((int)loc.getStart()); - feature.setEnd((int)loc.getStop()); - - return feature; - } - - /** - * Returns the header. - * @param source - * @return - * @throws IOException - */ - public static ArrayList readHeader(final File source) throws IOException { - FileInputStream is = new FileInputStream(source); - try { - return readHeader(new AsciiLineReader(is), null); - } finally { - is.close(); - } - } - - - /** - * Returns the header, and also sets the 2nd arg to the number of lines in the header. - * @param source - * @param lineCounter An array of length 1 or null. If not null, array[0] will be set to the number of lines in the header. - * @return The header fields. - * @throws IOException - */ - private static ArrayList readHeader(final LineReader source, int[] lineCounter) throws IOException { - - ArrayList header = null; - int numLines = 0; - - //find the 1st line that's non-empty and not a comment - String line = null; - while( (line = source.readLine()) != null ) { - numLines++; - if ( line.trim().isEmpty() || line.startsWith("#") ) { - continue; - } - - //parse the header - header = Utils.split(line, DELIMITER); - break; - } - - // check that we found the header - if ( header == null ) { - throw new IllegalArgumentException("No header in " + source + ". All lines are either comments or empty."); - } - - if(lineCounter != null) { - lineCounter[0] = numLines; - } - - logger.debug(String.format("Found header line containing %d columns:\n[%s]", header.size(), Utils.join("\t", header))); - - return header; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java deleted file mode 100755 index d12badd28..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.features.annotator; - -import org.broad.tribble.Feature; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -/** - * This class represents a single record in an AnnotatorInputTable. - */ -public class AnnotatorInputTableFeature implements Feature { - - private ArrayList columnNames; - private HashMap columnValues; //maps colum names to column values - - private String chr; - private int start; - private int end; - private String strRep = null; - - /** - * Constructor. - * @param chr The chromosome name. - * @param start The start position - * @param end The end position - */ - public AnnotatorInputTableFeature(String chr, int start, int end) { - this.chr = chr; - this.start = start; - this.end = end; - } - - - /** - * Constructor. - * @param columnNames The column names as parsed out of the file header. - */ - public AnnotatorInputTableFeature(ArrayList columnNames) { - this.columnNames = columnNames; - this.columnValues = new HashMap(); - } - - - - /** - * @return the list of column names from the file header. - */ - public ArrayList getHeader() { - return columnNames; - } - - - /** - * Returns the value of the given column. - * - * @param columnName The column name as it appears in the file header. - * @return The value - */ - public String getColumnValue(final String columnName) { - return columnValues.get(columnName); - } - - - public boolean containsColumnName(final String columnName) { - return columnValues.containsKey(columnName); - } - - - /** - * Sets the value for the given column. - * - * @param columnName The column name as it appears in the file header. - * @param value The value - * @return The existing value associated with the columnName, if there is one. - */ - protected String putColumnValue(final String columnName, final String value) { - return columnValues.put(columnName, value); - } - - /** - * @return all values in this line, hashed by their column names. - */ - public Map getColumnValues() { - return Collections.unmodifiableMap(columnValues); - } - - - public String getChr() { - return chr; - } - - public int getStart() { - return start; - } - - public int getEnd() { - return end; - } - - protected void setChr(String chr) { - this.chr = chr; - } - - protected void setStart(int start) { - this.start = start; - } - - protected void setEnd(int end) { - this.end = end; - } - - @Override - public String toString() { - if ( strRep == null ) { - StringBuilder sb = new StringBuilder(); - - for(String columnName : columnNames ) { - if ( sb.length() == 0 ) - sb.append("["); - else - sb.append(", "); - sb.append(columnName + "=" + columnValues.get(columnName)); - } - sb.append("]"); - - strRep = sb.toString(); - } - - return strRep; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index acbeee3b2..caaa371a6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -219,7 +219,7 @@ public class VariantAnnotator extends RodWalker { if ( stratifiedContexts != null ) { annotatedVCs = new ArrayList(VCs.size()); for ( VariantContext vc : VCs ) - annotatedVCs.addAll(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); + annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index fdf498a3d..0d1b21499 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -31,8 +31,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.GenomicAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.JoinTable; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; @@ -45,7 +43,6 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; -import java.util.Map.Entry; public class VariantAnnotatorEngine { @@ -58,19 +55,6 @@ public class VariantAnnotatorEngine { private HashMap dbAnnotations = new HashMap(); - // command-line option from GenomicAnnotator. - private Map> requestedColumnsMap; - - // command-line option from GenomicAnnotator. - private boolean oneToMany; - - // command-line option from GenomicAnnotator. - private List joinTables; - - // used by GenomicAnnotator. Maps binding name to number of output VCF records - // annotated with records from the input table with this binding name. Only used for - // printing out stats at the end. - private Map inputTableHitCounter = new HashMap(); private static class VAExpression { public String fullName, bindingName, fieldName; @@ -140,7 +124,7 @@ public class VariantAnnotatorEngine { return descriptions; } - public Collection annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); @@ -150,42 +134,18 @@ public class VariantAnnotatorEngine { // annotate expressions where available annotateExpressions(tracker, ref, infoAnnotations); - // process the info field - List> infoAnnotationOutputsList = new LinkedList>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file - infoAnnotationOutputsList.add(new LinkedHashMap(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file has 1 additional line. - infoAnnotationOutputsList.get(0).putAll(infoAnnotations); // put the DB membership info in - // go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) - { + for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { Map annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc); - if ( annotationsFromCurrentType == null ) { - continue; - } - - if(annotationType instanceof GenomicAnnotation) - { - infoAnnotationOutputsList = processGenomicAnnotation( infoAnnotationOutputsList, annotationsFromCurrentType ); - } - else - { - // add the annotations to each output line. - for(Map infoAnnotationOutput : infoAnnotationOutputsList) { - infoAnnotationOutput.putAll(annotationsFromCurrentType); - } - } + if ( annotationsFromCurrentType != null ) + infoAnnotations.putAll(annotationsFromCurrentType); } - // annotate genotypes - Map genotypes = annotateGenotypes(tracker, ref, stratifiedContexts, vc); + // generate a new annotated VC + final VariantContext annotatedVC = VariantContext.modifyAttributes(vc, infoAnnotations); - // create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList - Collection returnValue = new LinkedList(); - for(Map infoAnnotationOutput : infoAnnotationOutputsList) { - returnValue.add( new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, infoAnnotationOutput) ); - } - - return returnValue; + // annotate genotypes, creating another new VC in the process + return VariantContext.modifyGenotypes(annotatedVC, annotateGenotypes(tracker, ref, stratifiedContexts, vc)); } private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { @@ -251,6 +211,9 @@ public class VariantAnnotatorEngine { return genotypes; } + +/* + // Finish processing data from GenomicAnnotation. private List> processGenomicAnnotation( List> infoAnnotationOutputsList, Map annotationsForCurrentLocusFromAllAnnotatorInputTables) { @@ -403,12 +366,14 @@ public class VariantAnnotatorEngine { incrementStatsCounter(bindingName, infoAnnotationOutputsList.size()); } - /** + */ +/** * Records statistics that will be printed when GenomicAnnotator finishes. * * @param bindingName The table from which annotations were gotten * @param numNewRecords The number of new output VCF records created with annotations from this table - */ + *//* + private void incrementStatsCounter( final String bindingName, int numNewRecords) { //record some stats - there were infoAnnotationOutputsList.size() output VCF records annotated with data from the 'bindingName' input table. Integer counter = inputTableHitCounter.get(bindingName); @@ -453,13 +418,15 @@ public class VariantAnnotatorEngine { } - /** + */ +/** * Records statistics for the explodeInfoAnnotationOutputsList(..) calculation. * @param bindingName The table from which annotations were gotten * @param numNewVCFRecordsAnnotatedWithBindingNameData The number of new output VCF records created with annotations from this table * @param infoAnnotationOutputsList output list * @param matchingRecordsSize matching records size - */ + *//* + private void recordStats( final String bindingName, int numNewVCFRecordsAnnotatedWithBindingNameData, final List> infoAnnotationOutputsList, int matchingRecordsSize ) { //update stats for the 'bindingName' table @@ -509,13 +476,14 @@ public class VariantAnnotatorEngine { } - /** + */ +/** * Determines whether to exclude the given column from the annotations. * @param key The fully qualified columnName * @return Whether the -S arg specifies that this column should be included in the annotations. * - * TODO this function can be optimized through memoization - */ + *//* + private boolean isKeyFilteredOutBySelectArg(String key) { for(final String bindingName : requestedColumnsMap.keySet()) { @@ -536,10 +504,8 @@ public class VariantAnnotatorEngine { return false; //the -S arg doesn't have anything with the same binding name as this key, so the user implicitly requested this key } - - - - /** + */ +/** * Determines how the engine will handle the case where multiple records in a ROD file * overlap a particular single locus. If oneToMany is set to true, the output will be * one-to-many, so that each locus in the input VCF file could result in multiple @@ -551,18 +517,21 @@ public class VariantAnnotatorEngine { * See class-level comments for more details. * * @param oneToMany true if we should break out from one to many - */ + *//* + public void setOneToMany(boolean oneToMany) { this.oneToMany = oneToMany; } - /** + */ +/** * Sets the columns that will be used for the info annotation field. * Column names should be of the form bindingName.columnName (eg. dbsnp.avHet). * * @param columns An array of strings where each string is a comma-separated list * of columnNames (eg ["dbsnp.avHet,dbsnp.valid", "file2.col1,file3.col1"] ). - */ + *//* + public void setRequestedColumns(String[] columns) { if(columns == null) { throw new IllegalArgumentException("columns arg is null. Please check the -s command-line arg."); @@ -574,17 +543,20 @@ public class VariantAnnotatorEngine { } - /** + */ +/** * Passes in a pointer to the JoinTables. * * @param joinTables The list of JoinTables. There should be one JoinTable object for each -J arg. - */ + *//* + public void setJoinTables(List joinTables) { this.joinTables = joinTables; } - /** + */ +/** * Parses the columns arg and returns a Map of columns hashed by their binding name. * For example: * The command line: @@ -604,7 +576,8 @@ public class VariantAnnotatorEngine { * @param columnsArg The -s command line arg value. * * @return Map representing a parsed version of this arg - see above. - */ + *//* + private static Map> parseColumnsArg(String[] columnsArg) { Map> result = new HashMap>(); @@ -635,5 +608,6 @@ public class VariantAnnotatorEngine { return Collections.unmodifiableMap(inputTableHitCounter); } +*/ } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java deleted file mode 100644 index 0e8360484..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; -import java.util.Map.Entry; - -/** - * This plugin for {@link VariantAnnotatorEngine} serves as the core - * of the {@link GenomicAnnotator}. It finds all records in the -B input files - * that match the given variant's position and, optionally, the variant's reference and alternate alleles. - * - * For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -public class GenomicAnnotation extends InfoFieldAnnotation { - - public static final String CHR_COLUMN = "chr"; - public static final String START_COLUMN = "start"; - public static final String END_COLUMN = "end"; - public static final String HAPLOTYPE_REFERENCE_COLUMN = "haplotypeReference"; - public static final String HAPLOTYPE_ALTERNATE_COLUMN = "haplotypeAlternate"; - - public static final String NUM_MATCHES_SPECIAL_INFO_FIELD = "numMatchingRecords"; - - /** Characters that aren't allowed within VCF info field key-value pairs */ - public static final char[] ILLEGAL_INFO_FIELD_VALUES = { ' ', '=', ';' }; - /** Replacement for each character in ILLEGAL_INFO_FIELD_VALUES */ - public static final char[] ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES = { '_', '-', '!' }; - - - private void modifyAnnotationsForIndels(VariantContext vc, String featureName, Map annotationsForRecord) { - String inCodingRegionKey = featureName + ".inCodingRegion"; - String referenceCodonKey = featureName + ".referenceCodon"; - String variantCodonKey = featureName + ".variantCodon"; - String codingCoordStrKey = featureName + ".codingCoordStr"; - String proteinCoordStrKey = featureName + ".proteinCoordStr"; - String haplotypeReferenceKey = featureName + "." + HAPLOTYPE_REFERENCE_COLUMN; - String haplotypeAlternateKey = featureName + "." + HAPLOTYPE_ALTERNATE_COLUMN; - String functionalClassKey = featureName + ".functionalClass"; - String startKey = featureName + "." + START_COLUMN; - String endKey = featureName + "." + END_COLUMN; - String referenceAAKey = featureName + ".referenceAA"; - String variantAAKey = featureName + ".variantAA"; - String changesAAKey = featureName + ".changesAA"; - - annotationsForRecord.put(variantCodonKey, "unknown"); - annotationsForRecord.put(codingCoordStrKey, "unknown"); - annotationsForRecord.put(proteinCoordStrKey, "unknown"); - annotationsForRecord.put(referenceAAKey, "unknown"); - annotationsForRecord.put(variantAAKey, "unknown"); - - String refAllele = vc.getReference().getDisplayString(); - if (refAllele.length() == 0) { refAllele = "-"; } - - String altAllele = vc.getAlternateAllele(0).toString(); - if (altAllele.length() == 0) { altAllele = "-"; } - - annotationsForRecord.put(haplotypeReferenceKey, refAllele); - annotationsForRecord.put(haplotypeAlternateKey, altAllele); - annotationsForRecord.put(startKey, String.format("%d", vc.getStart())); - annotationsForRecord.put(endKey, String.format("%d", vc.getEnd())); - - boolean isCodingRegion = annotationsForRecord.containsKey(inCodingRegionKey) && annotationsForRecord.get(inCodingRegionKey).equalsIgnoreCase("true") ? true : false; - boolean isFrameshift = (vc.getIndelLengths().get(0) % 3 == 0) ? false : true; - - String functionalClass; - if (isCodingRegion) { - functionalClass = isFrameshift ? "frameshift" : "inframe"; - annotationsForRecord.put(changesAAKey, "true"); - } else { - functionalClass = "noncoding"; - } - - annotationsForRecord.put(functionalClassKey, functionalClass); - } - - /** - * For each -B input file, for each record which overlaps the current locus, generates a - * set of annotations of the form: - * - * bindingName.columnName1=columnValue, bindingName.columnName2=columnValue2, etc. - * - * For example: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc. - * - * @return The following is an explanation of this method's return value: - * - * The annotations from a matching in a particular file are stored in a Map - * where the key is bindingName.columnName and the value is the columnValue. - * Since a single input file can have multiple records that overlap the current - * locus (eg. dbSNP can have multiple entries for the same genomic position), a different - * Map is created for each matching record in a particular file. - * The set of matching records for each file is then represented as a List> - * - * The return value of this method is a Map of the form: - * rodName1 -> List> - * rodName2 -> List> - * rodName3 -> List> - * ... - * Where the rodNames are the -B binding names for each file that were specified on the command line (eg. -B bindingName,AnnotatorInputTable,/path/to/file). - * - * NOTE: The lists (List>) are guaranteed to have size > 0 - * because a rodName -> List> entry will only - * be created in Map if the List has at least one element. - */ - public Map annotate(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc) { - - //iterate over each record that overlaps the current locus, and, if it passes certain filters, - //add its values to the list of annotations for this locus. - final Map annotations = new HashMap(); - for(final GATKFeature gatkFeature : tracker.getAllRods()) - { - final String name = gatkFeature.getName(); - if( name.equals("variant") || name.equals("interval") ) { - continue; - } - - if( ! (gatkFeature.getUnderlyingObject() instanceof AnnotatorInputTableFeature) ) { - continue; //GenericAnnotation only works with TabularRODs because it needs to be able to select individual columns. - } - - final Map annotationsForRecord = convertRecordToAnnotations( gatkFeature.getName(), ((AnnotatorInputTableFeature) gatkFeature.getUnderlyingObject()).getColumnValues()); - - //If this record contains the HAPLOTYPE_REFERENCE_COLUMN and/or HAPLOTYPE_ALTERNATE_COLUMN, check whether the - //alleles specified match the the variant's reference allele and alternate allele. - //If they don't match, this record will be skipped, and its values will not be used for annotations. - // - //If one of these columns doesn't exist in the current rod, or if its value is * (star), then this is treated as an automatic match. - //Otherwise, the HAPLOTYPE_REFERENCE_COLUMN is only considered to be matching the variant's reference if the string values of the two - //are exactly equal (case-insensitive). - - //The HAPLOTYPE_REFERENCE_COLUMN matches the variant's reference allele based on a case-insensitive string comparison. - //The HAPLOTYPE_ALTERNATE_COLUMN can optionally list more than allele separated by one of these chars: ,\/:| - // only check this value for SNPs - String hapAltValue = vc.isSNP() ? annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_ALTERNATE_COLUMN) ) : null; - if ( hapAltValue != null && !hapAltValue.equals("*") ) { - Set alternateAlleles = vc.getAlternateAlleles(); - //if(alternateAlleles.isEmpty()) { - //handle a site that has been called monomorphic reference - //alternateAlleles.add(vc.getReference()); - //continue; //TODO If this site is monomorphic in the VC, and the current record specifies a particular alternate allele, skip this record. Right? - //} else - if(alternateAlleles.size() > 1) { - throw new UserException.MalformedFile("File associated with " + vc.getSource() + " contains record [" + vc + "] contains " + alternateAlleles.size() + " alternate alleles. GenomicAnnotion currently only supports annotating 1 alternate allele."); - } - - Allele vcAlt; - if(alternateAlleles.isEmpty()) { - vcAlt = vc.getReference(); - } else { - vcAlt = alternateAlleles.iterator().next(); - } - - boolean matchFound = false; - for(String hapAlt : hapAltValue.split("[,\\\\/:|]")) { - - if(!hapAlt.isEmpty() && vcAlt.basesMatch(hapAlt)) { - matchFound = true; - break; - } - } - if(!matchFound) { - continue; //skip record - none of its alternate alleles match the variant's alternate allele - } - } - - // only check this value for SNPs - String hapRefValue = vc.isSNP() ? annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_REFERENCE_COLUMN) ) : null; - if(hapRefValue != null) - { - hapRefValue = hapRefValue.trim(); - if(!hapRefValue.equals("*")) - { - //match against hapolotypeReference. - Allele vcRef = vc.getReference(); - if(!vcRef.basesMatch(hapRefValue)) { - continue; //skip record - } - } - } - - if (vc.isIndel()) { - modifyAnnotationsForIndels(vc, name, annotationsForRecord); - } - - //filters passed, so add this record. - List> listOfMatchingRecords = (List>) annotations.get( name ); - if(listOfMatchingRecords == null) { - listOfMatchingRecords = new LinkedList>(); - listOfMatchingRecords.add( annotationsForRecord ); - annotations.put(name, listOfMatchingRecords); - } else { - listOfMatchingRecords.add( annotationsForRecord ); - } - } - - return annotations; - } - - - - - /** - * Converts the given record to a set of key-value pairs of the form: - * bindingName.columnName1=column1Value, bindingName.columnName2=column2Value - * (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A) - * - * @param record AnnotatorInputTableFeature corresponding to one record in one -B input file. - * @param bindingName The binding name of the given AnnotatorInputTableFeature. - * @return The map of columnName -> columnValue pairs. - */ - public static Map convertRecordToAnnotations( String bindingName, Map record) { - final Map result = new HashMap(); - - for(final Entry entry : record.entrySet()) { - final String value = entry.getValue(); - if(!value.trim().isEmpty()) { - result.put( generateInfoFieldKey(bindingName, entry.getKey()), scrubInfoFieldValue(entry.getValue())); - } - } - - return result; - } - - /** - * Combines the 2 values into a full key. - * @param rodBindingName -B name - * @param columnName column name - * @return info field key - */ - public static String generateInfoFieldKey(String rodBindingName, String columnName ) { - return rodBindingName + '.' + columnName; - } - - - - /** - * Replaces any characters that are not allowed in the info field of a VCF file. - * - * @param value info field value - * @return the value with any illegal characters replaced by legal ones. - */ - private static String scrubInfoFieldValue(String value) { - for(int i = 0; i < GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES.length; i++) { - value = value.replace(GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES[i], GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES[i]); - } - - return value; - } - - - - public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine("GenericAnnotation", 1, VCFHeaderLineType.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position.")); - } - - public List getKeyNames() { - return Arrays.asList("GenericAnnotation"); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java deleted file mode 100644 index b42310780..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.File; -import java.io.IOException; -import java.util.*; -import java.util.Map.Entry; - -/** - * Annotates variant calls with information from user-specified tabular files. - * - * For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -@Requires(value={DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariantContext.class)) -@By(DataSource.REFERENCE) -public class GenomicAnnotator extends RodWalker implements TreeReducible { - - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter vcfWriter = null; - - @Argument(fullName="vcfOutput", shortName="vcf", doc="Please use --out instead", required=false) - @Deprecated - protected String oldOutArg; - - @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false) - protected String sampleName = null; - - @Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B:mydbsnp,AnnotatorInputTable /path/to/mydbsnp.txt -B:mytable,AnnotatorInputTable /path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false) - protected String[] SELECT_COLUMNS = {}; - - @Argument(fullName="join", shortName="J", doc="Optionally specifies a file and column within that file that should be LEFT-JOIN'ed to a column in a previously-specified file. The file provided to -J must be tab-delimited, with the first non-comment/non-empty line containing column names. (example: -B:name,AnnotatorInputTable /path/to/file1 -J name2,/path/to/file2,name.columnName=name2.columnName2 - this will join the table in file2 to the table in file1) ", required=false) - protected String[] JOIN_ARGS = {}; - - @Argument(fullName="oneToMany", shortName="m", doc="If more than one record from the same file matches a particular locus (for example, multiple dbSNP records with the same position), create multiple entries in the ouptut VCF file - one for each match. If a particular tabular file has J matches, and another tabular file has K matches for a given locus, then J*K output VCF records will be generated - one for each pair of K, J. If this flag is not provided, the multiple records are still generated, but they are stored in the INFO field of a single output VCF record, with their annotation keys differentiated by appending '_i' with i varying from 1 to K*J. ", required=false) - protected Boolean ONE_TO_MANY = false; - - @Argument(fullName="maxJoinTableSize", shortName="maxJoin", doc="The maximum allowed size (i.e. number of rows) for a table provided with the -J argument", required=false) - protected Integer MAX_JOIN_TABLE_SIZE = 500000; - - @Argument(fullName="ignoreFilteredSites", shortName="noFilt", doc="If specified, don't annotate sites marked as filtered out") - protected Boolean IGNORE_FILTERED_SITES = false; - - private VariantAnnotatorEngine engine; - - /** - * Prepare the output file and the list of available features. - */ - public void initialize() { - - //read all ROD file headers and construct a set of all column names to be used for validation of command-line args - final Set allFullyQualifiedColumnNames = new LinkedHashSet(); - final Set allBindingNames = new LinkedHashSet(); - for(ReferenceOrderedDataSource ds : getToolkit().getRodDataSources()) { - if(! ds.getType().equals(AnnotatorInputTableCodec.class)) { - continue; //skip all non-AnnotatorInputTable files. - } - final String bindingName = ds.getName(); - File file = ds.getFile(); - allBindingNames.add(bindingName); - try { - final ArrayList header = AnnotatorInputTableCodec.readHeader(file); - for(String columnName : header) { - allFullyQualifiedColumnNames.add(bindingName + "." + columnName); - } - } catch(IOException e) { - throw new UserException.CouldNotReadInputFile(file, "Failed when attempting to read file header. ", e); - } - } - - //parse the JOIN_COLUMNS args, read in the specified files, and validate column names in the = relation. This end result of this loop is to populate the List of joinTables with one entry per -J arg. - final List joinTables = new LinkedList(); - for(String joinArg : JOIN_ARGS) { - - //parse the tokens - final String[] arg = joinArg.split(","); - if(arg.length != 3) { - throw new UserException.BadArgumentValue("-J", "The following -J arg: \"" + joinArg + "\" must contain 3 comma-separated values. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - final String bindingName = arg[0]; - final String filename = arg[1]; - final String columnsToJoin = arg[2]; - - if(allBindingNames.contains(bindingName)) { - throw new UserException.BadArgumentValue("-J", "The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" has already been used in another binding."); - } - - String[] splitOnEquals = columnsToJoin.split("=+"); - if(splitOnEquals.length != 2) { - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" must specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - - String[] splitOnDot1 = splitOnEquals[0].split("\\."); - String[] splitOnDot2 = splitOnEquals[1].split("\\."); - if(splitOnDot1.length != 2 || splitOnDot2.length != 2) { - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - - final String bindingName1 = splitOnDot1[0]; - final String columnName1 = splitOnDot1[1]; - final String bindingName2 = splitOnDot2[0]; - final String columnName2 = splitOnDot2[1]; - - //figure out which of the 2 binding names within the = relation matches the -J bindingName - final String localBindingName = bindingName; //alias - final String localColumnName; - final String externalBindingName; - final String externalColumnName; - if(bindingName1.equals(bindingName)) { - localColumnName = columnName1; - externalBindingName = bindingName2; - externalColumnName = columnName2; - } else if(bindingName2.equals(bindingName)) { - localColumnName = columnName2; - externalBindingName = bindingName1; - externalColumnName = columnName1; - } else { - throw new UserException.BadArgumentValue("-J", "The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" must be specified in one the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - - //validate externalColumnName - final String fullyQualifiedExternalColumnName = externalBindingName + '.' + externalColumnName; - if( !allFullyQualifiedColumnNames.contains(fullyQualifiedExternalColumnName) ) { - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + fullyQualifiedExternalColumnName + "\""); - } - - //read in the file contents into a JoinTable object - final JoinTable joinTable = new JoinTable(MAX_JOIN_TABLE_SIZE); - joinTable.parseFromFile(filename, localBindingName, localColumnName, externalBindingName, externalColumnName); - joinTables.add(joinTable); - - //validate localColumnName, and add all column names in this file to the list of allFullyQualifiedColumnNames so that they can be referenced from subsequent -J args. - final List columnNames = joinTable.getColumnNames(); - final List fullyQualifiedColumnNames = new LinkedList(); - boolean found = false; - for ( String columnName : columnNames ) { - if ( columnName.equals(localColumnName) ) - found = true; - fullyQualifiedColumnNames.add(localBindingName + '.' + columnName); - } - if ( !found ) - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename); - - allFullyQualifiedColumnNames.addAll(fullyQualifiedColumnNames); - } - - //parse the SELECT_COLUMNS arg and validate the column names - List parsedSelectColumns = new LinkedList(); - for ( String token : SELECT_COLUMNS ) - parsedSelectColumns.addAll(Arrays.asList(token.split(","))); - SELECT_COLUMNS = parsedSelectColumns.toArray(SELECT_COLUMNS); - - for ( String columnName : SELECT_COLUMNS ) { - if ( !allFullyQualifiedColumnNames.contains(columnName) ) - throw new UserException.BadArgumentValue("-s", "The column name '" + columnName + "' provided to -s doesn't match any of the column names in any of the -B files. Here is the list of available column names: " + allFullyQualifiedColumnNames); - } - - //instantiate the VariantAnnotatorEngine - ArrayList annotationsToUse = new ArrayList(); - annotationsToUse.add("GenomicAnnotation"); - engine = new VariantAnnotatorEngine(getToolkit(), new ArrayList(), annotationsToUse); - engine.setOneToMany(ONE_TO_MANY); - engine.setRequestedColumns(SELECT_COLUMNS); - engine.setJoinTables(joinTables); - - // set up the header fields - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList("variant"))); - hInfo.addAll(engine.getVCFAnnotationDescriptions()); - - Set rodName = new HashSet(); - rodName.add("variant"); - Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); - VCFHeader vcfHeader = new VCFHeader(hInfo, samples); - vcfWriter.writeHeader(vcfHeader); - } - - /** - * Initialize the number of loci processed to zero. - * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - /** - * We want reads that span deletions - * - * @return true - */ - public boolean includeReadsWithDeletionAtLoci() { return true; } - - /** - * For each site of interest, annotate based on the requested annotation types - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return 1 if the locus was successfully processed, 0 if otherwise - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - Set results = new LinkedHashSet(); - for (VariantContext vc : tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false)) { - if ( (vc.isFiltered() && IGNORE_FILTERED_SITES) || - (vc.isVariant() && !vc.isBiallelic()) ) { - results.add(vc); - } else { - Map stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context); - if ( stratifiedContexts != null ) - results.addAll(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); - else - results.add(vc); - } - } - - for ( VariantContext vc : results ) - vcfWriter.add(vc ,ref.getBase()); - - return 1; - } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public Integer treeReduce(Integer lhs, Integer rhs) { - return lhs + rhs; - } - - public void onTraversalDone(Integer sum) { - - //out.printf("Generated %d annotated VCF records.\n", totalOutputVCFRecords); - Map inputTableHitCounter = engine.getInputTableHitCounter(); - for ( Entry e : inputTableHitCounter.entrySet() ) { - final String bindingName = e.getKey(); - final int counter = e.getValue(); - //final float percent = 100 * counter /(float) totalOutputVCFRecords; - //out.printf(" %-6.1f%% (%d) annotated with %s.\n", percent, counter, bindingName ); - System.out.printf(" %d annotated with %s.\n", counter, bindingName ); - } - } -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java deleted file mode 100755 index 714f374cf..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; - -/** - * This is a container that holds all data corresponding to a single join table as specified by one -J arg (ex: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2). - * Some terminology: - * 'bindingName' is an arbitrary label for a given table that is specified on the command line with either the -B or -J arg. - * In the example above, bindingName1 is the 'local' binding name because it is attached to the join table file provided with this -J arg. bindingName2 is the 'external' binding name because - * it corresponds to some other table specified previously with another -B or -J arg. - * - * The JoinTable object stores a map entry for each record in the join table. The entry's key is the value of the join column in a given record (eg. bindingName1.columnName in the above example), - * and the entry value is an ArrayList representing the entire join table record. - * The JoinTable object also stores some other join table parameters such as the column names that were parsed out of the file header, and the bindingNames and columnNames from the -J arg. - * - * The join operation is performed by looking up the value of the join column in the external table (the one that this table is being joined to), and then using this value to do a lookup - * on the map - if there's a hit, it will provide the record from the join table that is to be joined with the record in the external table. - * - * More information can be found here: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -public class JoinTable -{ - //the list of join table column names parsed out of the file header. - private List columnNames; //not fully-qualified - - private String localBindingName; - private String externalBindingName; - private String externalColumnName; - - //stores a map entry for each record in the join table. The entry's key is the value of the join column in a given record (eg. bindingName.columnName in the above example), - //and the entry value is an ArrayList representing the entire join table record. - private HashMap> joinColumnValueToRecords = new HashMap>(); - - private int maxSize; - private boolean parsedFromFile = false; - - public JoinTable(int maxSize) { - this.maxSize = maxSize; - } - - /** - * Parses the table from the given file using the JoinTableParser. - * - * @param filename The file containing the table. - * @param localBindingName The binding name within the given file to join on. - * @param localColumnName The column name within the given file to join on. - * @param externalBindingName The binding name of another file (previously specified with either -B or -J). - * @param externalColumnName The column name in this other file to join on. - */ - public void parseFromFile(String filename, String localBindingName, String localColumnName, String externalBindingName, String externalColumnName) { - if(parsedFromFile) { - throw new ReviewedStingException("parseFromFile(" + filename +", ..) called more than once"); - } - parsedFromFile = true; - - setLocalBindingName(localBindingName); - setExternalBindingName(externalBindingName); - setExternalColumnName(externalColumnName); - - BufferedReader br = null; - try - { - br = new BufferedReader(new FileReader(filename)); - final JoinTableParser parser = new JoinTableParser(); - - //read in the header - columnNames = parser.readHeader(br); - - //get the index of the localJoinColumnName - int localColumnNameIdx = -1; - for(int i = 0; i < columnNames.size(); i++) { - final String columnName = columnNames.get(i); - if(columnName.equals(localColumnName)) { - localColumnNameIdx = i; - break; - } - } - - if(localColumnNameIdx == -1) { - throw new UserException.BadArgumentValue("-J", "The -J arg specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename); - } - - //read in all records and create a map entry for each - String line; - while((line = br.readLine()) != null) { - final ArrayList columnValues = parser.parseLine(line); - if ( columnValues.size() < columnNames.size() ) - throw new UserException.BadInput("the file: " + filename + " is malformed as there are not a sufficient number of columns for this line: " + line); - final String joinColumnValue = columnValues.get(localColumnNameIdx); - put(joinColumnValue, columnValues, filename); - } - } - catch(IOException e) - { - throw new UserException.CouldNotReadInputFile(new File(filename), "Unable to parse file", e); - } - finally - { - try { - if(br != null) { - br.close(); - } - } catch(IOException e) { - throw new ReviewedStingException("Unable to close file: " + filename, e); - } - } - } - - /** - * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, - * this returns bindingName1. - * @return local binding name - */ - public String getLocalBindingName() { - return localBindingName; - } - - public void setLocalBindingName(String localBindingName) { - this.localBindingName = localBindingName; - } - - /** - * @return the list of join table column names parsed out of the file header. - */ - public List getColumnNames() { - return columnNames; //not fully-qualified - } - - protected void setColumnNames(List columnNames) { - this.columnNames = columnNames; - } - - /** - * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, - * this returns columnName2. - * @return external column name - */ - public String getExternalColumnName() { - return externalColumnName; - } - - protected void setExternalColumnName( - String externalColumnName) { - this.externalColumnName = externalColumnName; - } - - /** - * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, - * this returns bindingName2. - * @return external binding name - */ - public String getExternalBindingName() { - return externalBindingName; - } - - protected void setExternalBindingName( - String externalBindingName) { - this.externalBindingName = externalBindingName; - } - - /** - * Whether any join table records have the given value in the join column. - * @param joinColumnValue value - * @return true if the given name value exists in the file - */ - public boolean containsJoinColumnValue(String joinColumnValue) { - return joinColumnValueToRecords.containsKey(joinColumnValue); - } - - /** - * Returns all records in the table where the join column has the given value. - * @param joinColumnValue column value - * @return row - */ - public ArrayList get(String joinColumnValue) { - return joinColumnValueToRecords.get(joinColumnValue); - } - - /** - * Adds the given record to the map. - * @param joinColumnValue value - * @param record row - * @param filename the source file name - */ - protected void put(String joinColumnValue, ArrayList record, String filename) { - if ( joinColumnValueToRecords.containsKey(joinColumnValue) ) - throw new UserException.BadInput("the file " + filename + " contains non-unique entries for the requested column, which isn't allowed."); - joinColumnValueToRecords.put(joinColumnValue, record); - if ( joinColumnValueToRecords.size() > maxSize ) - throw new UserException.BadInput("the file " + filename + " contains more than the maximum number (" + maxSize + ") of allowed rows (see the --maxJoinTableSize argument)."); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java deleted file mode 100755 index 3b6c87f90..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.BufferedReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Used to parse files passed to the GenomicAnnotator via the -J arg. - * The files must be tab-delimited, and the first non-empty/non-commented line - * must be a header containing column names. - * - * More information can be found here: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -public class JoinTableParser -{ - public static final String DELIMITER = "\t"; - - private List header; //column names parsed out of the header line - - - /** - * Constructor. - */ - public JoinTableParser() {} - - /** - * Returns the header and returns it. - * @param br source - * @return column names - * @throws IOException on read - */ - public List readHeader(BufferedReader br) throws IOException - { - if(header != null) { - throw new ReviewedStingException("readHeader(..) called more than once. Header is currently set to: " + header); - } - - header = Collections.unmodifiableList(parseHeader(br)); - - return header; - } - - - /** - * @return A list containing the column names. - */ - public List getHeader() { - return header; - } - - - /** - * Parses the line into an ArrayList containing the values for each column. - * - * @param line to parse - * @return tokens - */ - public ArrayList parseLine(String line) { - - final ArrayList values = Utils.split(line, DELIMITER, header.size()); - - if ( values.size() != header.size() ) { - throw new UserException.MalformedFile(String.format("Encountered a row with %d columns which is different from the number or columns in the header: %d\nHeader: " + header + "\nLine: " + values, values.size(), header.size())); - } - - return values; - } - - - /** - * Returns the header. - * @param br The file to read. - * @return ArrayList containing column names from the header. - * @throws IOException on reading - */ - public static ArrayList parseHeader(final BufferedReader br) throws IOException - { - ArrayList header = null; - - //find the 1st line that's non-empty and not a comment - String line; - while( (line = br.readLine()) != null ) { - line = line.trim(); - if ( line.isEmpty() || line.startsWith("#") ) { - continue; - } - - //parse the header - header = Utils.split(line, DELIMITER); - break; - } - - // check that header was found - if ( header == null ) { - throw new IllegalArgumentException("No header in " + br + ". All lines are either comments or empty."); - } - - return header; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java deleted file mode 100755 index 0bbfa51b4..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java +++ /dev/null @@ -1,1032 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.IOException; -import java.io.PrintStream; -import java.util.*; - -/** - * Takes a table of transcripts (eg. UCSC refGene, knownGene, and CCDS tables) and generates the big table which contains - * annotations for each possible variant at each transcript position (eg. 4 variants at each genomic position). - * - * Required args: - * -B - specifies the input file (ex. -B transcripts,AnnotatorInputTable,/path/to/transcript_table_file.txt) - * -n - Specifies which column(s) from the transcript table contain the gene name(s). (ex. -n name,name2 (for the UCSC refGene table)) - * WARNING: The gene names for each record, when taken together, should provide a unique id for that record relative to all other records in the file. - * - * - * The map & reduce types are both TreeMap. - * Each TreeMap entry represents one line in the output file. The TreeMap key is a combination of a given output line's position (so that this key can be used to sort all output lines - * by reference order), as well as allele and gene names (so that its unique across all output lines). The String value is the output line itself. - */ -@Reference(window=@Window(start=-4,stop=4)) -@By(DataSource.REFERENCE) -@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name=TranscriptToGenomicInfo.ROD_NAME,type=AnnotatorInputTableFeature.class) } ) -public class TranscriptToGenomicInfo extends RodWalker { - public static final String ROD_NAME = "transcripts"; - - //@Argument(fullName="pass-through", shortName="t", doc="Optionally specifies which columns from the transcript table should be copied verbatim (aka. passed-through) to the records in the output table. For example, -B transcripts,AnnotatorInputTable,/data/refGene.txt -t id will cause the refGene id column to be copied to the output table.", required=false) - //protected String[] PASS_THROUGH_COLUMNS = {}; - - @Output - private PrintStream out; - - @Argument(fullName="unique-gene-name-columns", shortName="n", doc="Specifies which column(s) from the transcript table contains the gene name(s). For example, -B transcripts,AnnotatorInputTable,/data/refGene.txt -n name,name2 specifies that the name and name2 columns are gene names. WARNING: the gene names for each record, when taken together, should provide a unique id for that record relative to all other records in the file. If this is not the case, an error will be thrown. ", required=true) - private String[] GENE_NAME_COLUMNS = {}; - - private final char[] ALLELES = {'A','C','G','T'}; - - /** Output columns */ - private static final String[] GENOMIC_ANNOTATION_COLUMNS = { - GenomicAnnotation.CHR_COLUMN, - GenomicAnnotation.START_COLUMN, - GenomicAnnotation.END_COLUMN, - GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, - GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN}; - - private static final String OUTPUT_TRANSCRIPT_STRAND = "transcriptStrand"; //rg. +/- - private static final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true - private static final String OUTPUT_FRAME = "frame"; //eg. 0,1,2 - private static final String OUTPUT_POSITION_TYPE = "positionType"; //eg. utr5, cds, utr3, intron, intergenic - private static final String OUTPUT_MRNA_COORD = "mrnaCoord"; //1-based offset within the transcript - private static final String OUTPUT_SPLICE_DISTANCE = "spliceDist"; //eg. integer, bp to nearest exon/intron boundary - private static final String OUTPUT_CODON_NUMBER = "codonCoord"; //eg. 20 - private static final String OUTPUT_REFERENCE_CODON = "referenceCodon"; - private static final String OUTPUT_REFERENCE_AA = "referenceAA"; - private static final String OUTPUT_VARIANT_CODON = "variantCodon"; - private static final String OUTPUT_VARIANT_AA = "variantAA"; - private static final String OUTPUT_CHANGES_AMINO_ACID = "changesAA"; //eg. true - private static final String OUTPUT_FUNCTIONAL_CLASS = "functionalClass"; //eg. missense - private static final String OUTPUT_CODING_COORD_STR = "codingCoordStr"; - private static final String OUTPUT_PROTEIN_COORD_STR = "proteinCoordStr"; - private static final String OUTPUT_SPLICE_INFO = "spliceInfo"; //(eg "splice-donor -4", or "splice-acceptor 3") for the 10bp surrounding each exon/intron boundary - private static final String OUTPUT_UORF_CHANGE = "uorfChange"; // (eg +1 or -1, indicating the addition or interruption of an ATG trinucleotide in the annotated utr5) - private static final String[] TRANSCRIPT_COLUMNS = { - OUTPUT_TRANSCRIPT_STRAND, - OUTPUT_POSITION_TYPE, - OUTPUT_FRAME, - OUTPUT_MRNA_COORD, - OUTPUT_CODON_NUMBER, - OUTPUT_SPLICE_DISTANCE, - OUTPUT_REFERENCE_CODON, - OUTPUT_REFERENCE_AA, - OUTPUT_VARIANT_CODON, - OUTPUT_VARIANT_AA, - OUTPUT_CHANGES_AMINO_ACID, - OUTPUT_FUNCTIONAL_CLASS, - OUTPUT_CODING_COORD_STR, - OUTPUT_PROTEIN_COORD_STR, - OUTPUT_IN_CODING_REGION, - OUTPUT_SPLICE_INFO, - OUTPUT_UORF_CHANGE }; - - //This list specifies the order of output columns in the big table. - private final List outputColumnNames = new LinkedList(); - - private int transcriptsProcessedCounter = 0; - - private long transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter = 0; - private long transcriptsThatDontStartWithMethionineCounter = 0; - private long transcriptsThatDontEndWithStopCodonCounter = 0; - private long skippedTranscriptCounter = 0; - - private long skippedPositionsCounter = 0; - private long totalPositionsCounter = 0; - - /** Possible values for the "POSITION_TYPE" output column. */ - private enum PositionType { - intergenic, intron, utr5, CDS, utr3, non_coding_exon, non_coding_intron - } - - /** - * Store rods until we hit their ends so that we don't have to recompute - * basic information every time we see them in map(). - */ - private Map storedTranscriptInfo = new HashMap(); - - /** - * Prepare the output file and the list of available features. - */ - public void initialize() { - - //parse the GENE_NAME_COLUMNS arg and validate the column names - final List parsedGeneNameColumns = new LinkedList(); - for(String token : GENE_NAME_COLUMNS) { - parsedGeneNameColumns.addAll(Arrays.asList(token.split(","))); - } - GENE_NAME_COLUMNS = parsedGeneNameColumns.toArray(GENE_NAME_COLUMNS); - - ReferenceOrderedDataSource transcriptsDataSource = null; - for(ReferenceOrderedDataSource ds : getToolkit().getRodDataSources()) { - if(ds.getName().equals(ROD_NAME)) { - transcriptsDataSource = ds; - break; - } - } - - // sanity check - if ( transcriptsDataSource == null ) - throw new IllegalStateException("No rod bound to " + ROD_NAME + " found in rod sources"); - - final ArrayList header; - try { - header = AnnotatorInputTableCodec.readHeader(transcriptsDataSource.getFile()); - } catch(Exception e) { - throw new UserException.MalformedFile(transcriptsDataSource.getFile(), "Failed when attempting to read header from file", e); - } - - for ( String columnName : GENE_NAME_COLUMNS ) { - if ( !header.contains(columnName) ) - throw new UserException.CommandLineException("The column name '" + columnName + "' provided to -n doesn't match any of the column names in: " + transcriptsDataSource.getFile()); - } - - //init outputColumnNames list - outputColumnNames.addAll(Arrays.asList(GENOMIC_ANNOTATION_COLUMNS)); - outputColumnNames.addAll(Arrays.asList(GENE_NAME_COLUMNS)); - outputColumnNames.addAll(Arrays.asList(TRANSCRIPT_COLUMNS)); - - //init OUTPUT_HEADER_LINE - StringBuilder outputHeaderLine = new StringBuilder(); - for( final String column : outputColumnNames ) { - if(outputHeaderLine.length() != 0) { - outputHeaderLine.append( AnnotatorInputTableCodec.DELIMITER ); - } - outputHeaderLine.append(column); - } - - out.println(outputHeaderLine.toString()); - } - - public Integer reduceInit() { return 0; } - - /** - * For each site of interest, generate the appropriate fields. - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return 1 if the locus was successfully processed, 0 if otherwise - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - final Collection rods = tracker.getBoundRodTracks(); - //if there's nothing overlapping this locus, skip it. - if ( rods.size() == 0 ) - return 0; - - final List transcriptRODs = tracker.getReferenceMetaData(ROD_NAME); - - //there may be multiple transcriptRODs that overlap this locus - for ( Object transcriptRodObject : transcriptRODs ) { - //parse this ROD if it hasn't been already. - final AnnotatorInputTableFeature transcriptRod = (AnnotatorInputTableFeature) transcriptRodObject; - String featureKey = transcriptRod.toString(); - - TranscriptTableRecord parsedTranscriptRod = storedTranscriptInfo.get(featureKey); - if ( parsedTranscriptRod == null ) { - parsedTranscriptRod = new TranscriptTableRecord(transcriptRod, GENE_NAME_COLUMNS); - storedTranscriptInfo.put(featureKey, parsedTranscriptRod); - } - - //populate parsedTranscriptRod.txSequence - if(parsedTranscriptRod.positiveStrand) { - parsedTranscriptRod.txSequence.append((char)ref.getBase()); - } else { - final char complementBase = (char)BaseUtils.simpleComplement(ref.getBase()); - parsedTranscriptRod.txSequence.insert(0, complementBase); - } - - //populate parsedTranscriptRod.utr5Sequence and parsedTranscriptRod.cdsSequence - final int position = (int) ref.getLocus().getStart(); - if(parsedTranscriptRod.isProteinCodingTranscript() && parsedTranscriptRod.isWithinExon(position) ) - { - //we're within an exon of a proteinCodingTranscript - - if(parsedTranscriptRod.positiveStrand) - { - if(position < parsedTranscriptRod.cdsStart) - { - parsedTranscriptRod.utr5Sequence.append((char)ref.getBase()); //within utr5 - } - else if(position >= parsedTranscriptRod.cdsStart && position <= parsedTranscriptRod.cdsEnd) - { - parsedTranscriptRod.cdsSequence.append((char)ref.getBase()); //within CDS - } - } - else - { - final char complementBase = (char)BaseUtils.simpleComplement(ref.getBase()); - if(position > parsedTranscriptRod.cdsEnd) - { - //As we move left to right (aka. 3' to 5'), we do insert(0,..) to reverse the sequence so that it become 5' to 3' in parsedTranscriptRod.utr5Sequence. - parsedTranscriptRod.utr5Sequence.insert(0,complementBase); //within utr5. - } - else if(position >= parsedTranscriptRod.cdsStart && position <= parsedTranscriptRod.cdsEnd) - { - parsedTranscriptRod.cdsSequence.insert(0,complementBase); //within CDS - } - } - } - - if ( position == parsedTranscriptRod.txEnd ) { - //we've reached the end of the transcript - compute all data and write it out. - try { - generateOutputRecordsForROD(parsedTranscriptRod); - } - catch(IOException e) { - throw new RuntimeException(Thread.currentThread().getName() + " - Unexpected error occurred at position: [" + parsedTranscriptRod.txChrom + ":" + position + "] in transcript: " + parsedTranscriptRod, e); - } - - // remove it from the cache - storedTranscriptInfo.remove(featureKey); - - transcriptsProcessedCounter++; - if ( transcriptsProcessedCounter % 100 == 0 ) - logger.info(new Date() + ": " + transcriptsProcessedCounter + " transcripts processed"); - } - } - - return 1; - } - - private static boolean isChrM(final TranscriptTableRecord record) { - return record.txChrom.equals("chrM") || record.txChrom.equals("MT")|| record.txChrom.equals("CRS"); - } - - private void generateOutputRecordsForROD(TranscriptTableRecord parsedTranscriptRod) throws IOException { - //Transcripts that don't produce proteins are indicated in transcript by cdsStart == cdsEnd - //These will be handled by generating only one record, with haplotypeAlternate == "*". - final boolean isProteinCodingTranscript = parsedTranscriptRod.isProteinCodingTranscript(); - final boolean isMitochondrialTranscript = isChrM(parsedTranscriptRod); - - final boolean positiveStrand = parsedTranscriptRod.positiveStrand; //alias - - - if(isProteinCodingTranscript && parsedTranscriptRod.cdsSequence.length() % 3 != 0) { - if (!isMitochondrialTranscript) { - logger.error("ERROR: Transcript " + parsedTranscriptRod +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] has " + parsedTranscriptRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping..."); - //discard transcripts where CDS length is not a multiple of 3 - skippedTranscriptCounter++; - return; - } else { - - //In mitochondrial genes, the polyA tail may complete the stop codon, allowing transcript . To check for this special case: - //1. check that the CDS covers the entire transcript - //2. add 1 or 2 A's to the 3' end of the transcript (as needed to make it divisible by 3) - //3. check whether the last 3 letters now form a stop codon using the mitochondrial AA table - //4. If not, skip this gene, else incorporate the A's and process it like any other gene. - - if( parsedTranscriptRod.txSequence.length() == parsedTranscriptRod.cdsSequence.length()) { - do { //append A's until sequence length is divisible by 3 - parsedTranscriptRod.txSequence.append('*'); - parsedTranscriptRod.cdsSequence.append('a'); - if(positiveStrand) { - parsedTranscriptRod.txEnd++; - parsedTranscriptRod.cdsEnd++; - parsedTranscriptRod.exonEnds[0]++; - } else { - parsedTranscriptRod.txStart--; - parsedTranscriptRod.cdsStart--; - parsedTranscriptRod.exonStarts[0]--; - } - } while( parsedTranscriptRod.cdsSequence.length() % 3 != 0); - - } else { - logger.error("ERROR: Mitochnodrial transcript " + parsedTranscriptRod +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] has " + parsedTranscriptRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. The CDS does not cover the entire transcript, so its not possible to use A's from the polyA tail. Skipping..."); - skippedTranscriptCounter++; - return; - } - } - } - - - //warn if the first codon isn't Methionine and/or the last codon isn't a stop codon. - if(isProteinCodingTranscript) { - final int cdsSequenceLength = parsedTranscriptRod.cdsSequence.length(); - - final String firstCodon = parsedTranscriptRod.cdsSequence.substring(0, 3); - final AminoAcid firstAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA( firstCodon, true ) : AminoAcidTable.getEukaryoticAA( firstCodon ) ; - - final String lastCodon = parsedTranscriptRod.cdsSequence.substring(cdsSequenceLength - 3, cdsSequenceLength); - final AminoAcid lastAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA( lastCodon, false ) : AminoAcidTable.getEukaryoticAA( lastCodon ) ; - - if( firstAA != AminoAcidTable.METHIONINE && !lastAA.isStop()) { - transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter++; - logger.warn("WARNING: The CDS of transcript " + parsedTranscriptRod.geneNames[0] +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] does not start with Methionine or end in a stop codon. The first codon is: " + firstCodon + " (" + firstAA + "). The last codon is: " + lastCodon + " (" + lastAA + "). NOTE: This is just a warning - the transcript will be included in the output."); - } else if( firstAA != AminoAcidTable.METHIONINE) { - transcriptsThatDontStartWithMethionineCounter++; - logger.warn("WARNING: The CDS of transcript " + parsedTranscriptRod.geneNames[0] +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] does not start with Methionine. The first codon is: " + firstCodon + " (" + firstAA + "). NOTE: This is just a warning - the transcript will be included in the output."); - } else if(!lastAA.isStop()) { - transcriptsThatDontEndWithStopCodonCounter++; - logger.warn("WARNING: The CDS of transcript " + parsedTranscriptRod.geneNames[0] +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] does not end in a stop codon. The last codon is: " + lastCodon + " (" + lastAA + "). NOTE: This is just a warning - the transcript will be included in the output."); - } - } - - final int txStart_5prime = positiveStrand ? parsedTranscriptRod.txStart : parsedTranscriptRod.txEnd; //1-based, inclusive - final int txEnd_3prime = positiveStrand ? parsedTranscriptRod.txEnd : parsedTranscriptRod.txStart; //1-based, inclusive - final int increment_5to3 = positiveStrand ? 1 : -1; //whether to increment or decrement - final int strandSign = increment_5to3; //alias - - final int cdsStart_5prime = positiveStrand ? parsedTranscriptRod.cdsStart : parsedTranscriptRod.cdsEnd; //1-based, inclusive - final int cdsEnd_3prime = positiveStrand ? parsedTranscriptRod.cdsEnd : parsedTranscriptRod.cdsStart ; //1-based, inclusive - - int frame = 0; //the frame of the current position - int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand - int utr5Count_from5 = 0; - int mrnaCoord_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand, but only counts bases within exons. - char[] utr5NucBuffer_5to3 = null; //used to find uORFs - size = 5 because to hold the 3 codons that overlap any given position: [-2,-1,0], [-1,0,1], and [0,1,2] - - int codonCount_from5 = 1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons - 1-based - int codingCoord_from5 = isProteinCodingTranscript ? parsedTranscriptRod.computeInitialCodingCoord() : -1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - boolean codingCoordResetForCDS = false; - boolean codingCoordResetForUtr3 = false; - final char[] currentCodon_5to3 = isProteinCodingTranscript ? new char[3] : null; //holds the current RNA codon - 5' to 3' - - PositionType positionType = null; - boolean isWithinIntronAndFarFromSpliceJunction = false; - int intronStart_5prime = -1; - int intronEnd_5prime; - - final Map outputLineFields = new HashMap(); - - for(int txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3) - { - ++totalPositionsCounter; - - //compute certain attributes of the current position - final boolean isWithinExon = parsedTranscriptRod.isWithinExon(txCoord_5to3); //TODO if necessary, this can be sped up by keeping track of current exon/intron - - final int distanceToNearestSpliceSite = parsedTranscriptRod.computeDistanceToNearestSpliceSite(txCoord_5to3); - final boolean isWithin10bpOfSpliceJunction = Math.abs(distanceToNearestSpliceSite) <= 10; - - - //increment coding coord is necessary - if(isWithinExon) { - codingCoord_from5++; - } - - //figure out the current positionType - final PositionType prevPositionType = positionType; //save the position before it is updated - if(isProteinCodingTranscript) - { - if(isWithinExon) - { - if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) { //utr5 (multiplying by strandSign is like doing absolute value.) - positionType = PositionType.utr5; - } else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) { //utr3 (multiplying by strandSign is like doing absolute value.) - positionType = PositionType.utr3; - } else { - positionType = PositionType.CDS; - } - } else { - positionType = PositionType.intron; - } - } else { - if(isWithinExon) { - positionType = PositionType.non_coding_exon; - } else { - positionType = PositionType.non_coding_intron; - } - } - - //handle transitions - if(positionType == PositionType.CDS && prevPositionType != PositionType.CDS && !codingCoordResetForCDS) { - //transitioning from utr5 to CDS, reset the coding coord from -1 to 1. - codingCoord_from5 = 1; - codingCoordResetForCDS = true; - } else if(positionType == PositionType.utr3 && prevPositionType != PositionType.utr3 && !codingCoordResetForUtr3) { - //transitioning from CDS to utr3, reset the coding coord to 1. - codingCoord_from5 = 1; - codingCoordResetForUtr3 = true; - } - - - try - { - //handle introns - boolean wasWithinIntronAndFarFromSpliceJunction = isWithinIntronAndFarFromSpliceJunction; - isWithinIntronAndFarFromSpliceJunction = !isWithinExon && !isWithin10bpOfSpliceJunction; - - if(!wasWithinIntronAndFarFromSpliceJunction && isWithinIntronAndFarFromSpliceJunction) { - //save intron start - intronStart_5prime = txCoord_5to3; - - } else if(wasWithinIntronAndFarFromSpliceJunction && !isWithinIntronAndFarFromSpliceJunction) { - //output intron record - intronEnd_5prime = txCoord_5to3 - increment_5to3; - - final int intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ; - final int intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime); - outputLineFields.clear(); - outputLineFields.put(GenomicAnnotation.CHR_COLUMN, parsedTranscriptRod.txChrom); - outputLineFields.put(GenomicAnnotation.START_COLUMN, String.valueOf(intronStart)); - outputLineFields.put(GenomicAnnotation.END_COLUMN, String.valueOf(intronEnd)); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( '*' ) ); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( '*' ) ); - for(int i = 0; i < GENE_NAME_COLUMNS.length; i++) { - outputLineFields.put(GENE_NAME_COLUMNS[i], parsedTranscriptRod.geneNames[i] ); - } - - outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() ); - outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? "+" : "-" ); - - if ( isProteinCodingTranscript ) - outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) ); - - addThisLineToResult(outputLineFields); - } - - //when in utr5, compute the utr5NucBuffer_5to3 which is later used to compute the OUTPUT_UORF_CHANGE field - if(positionType == PositionType.utr5) - { - if(utr5Count_from5 < parsedTranscriptRod.utr5Sequence.length()) - { - if(utr5NucBuffer_5to3 == null) { - //initialize - utr5NucBuffer_5to3 = new char[5]; - utr5NucBuffer_5to3[3] = parsedTranscriptRod.utr5Sequence.charAt( utr5Count_from5 ); - - if(utr5Count_from5 + 1 < parsedTranscriptRod.utr5Sequence.length() ) { - utr5NucBuffer_5to3[4] = parsedTranscriptRod.utr5Sequence.charAt( utr5Count_from5 + 1 ); - } - } - - //as we move 5' to 3', shift nucleotides down to the 5' end, making room for the new 3' nucleotide: - utr5NucBuffer_5to3[0] = utr5NucBuffer_5to3[1]; - utr5NucBuffer_5to3[1] = utr5NucBuffer_5to3[2]; - utr5NucBuffer_5to3[2] = utr5NucBuffer_5to3[3]; - utr5NucBuffer_5to3[3] = utr5NucBuffer_5to3[4]; - - char nextRefBase = 0; - if( utr5Count_from5 + 2 < parsedTranscriptRod.utr5Sequence.length() ) - { - nextRefBase = parsedTranscriptRod.utr5Sequence.charAt( utr5Count_from5 + 2 ); - } - utr5NucBuffer_5to3[4] = nextRefBase; - - //check for bad bases - if( (utr5NucBuffer_5to3[0] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[0])) || - (utr5NucBuffer_5to3[1] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[1])) || - (utr5NucBuffer_5to3[2] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[2])) || - (utr5NucBuffer_5to3[3] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[3])) || - (utr5NucBuffer_5to3[4] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[4]))) - { - logger.debug("Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() +". utr5NucBuffer_5to3 contains irregular base:" + utr5NucBuffer_5to3[0] + utr5NucBuffer_5to3[1] + utr5NucBuffer_5to3[2] + utr5NucBuffer_5to3[3] + utr5NucBuffer_5to3[4]);// +". Transcript is: " + parsedTranscriptRod); - ++skippedPositionsCounter; - continue; - } - - } else { // if(utr5Count_from5 >= parsedTranscriptRod.utr5Sequence.length()) - //defensive programming - throw new RuntimeException("Exception: Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() +". utr5Count_from5 is now " + utr5Count_from5 + ", while parsedTranscriptRod.utr5Sequence.length() == " + parsedTranscriptRod.utr5Sequence.length() + ". This means parsedTranscriptRod.utr5Sequence isn't as long as it should be. This is a bug in handling this record: " + parsedTranscriptRod); - - } - } - - - //when in CDS, compute current codon - if(positionType == PositionType.CDS) - { - if(frame == 0) - { - currentCodon_5to3[0] = parsedTranscriptRod.cdsSequence.charAt( codingCoord_from5 - 1 ); //subtract 1 to go to zero-based coords - currentCodon_5to3[1] = parsedTranscriptRod.cdsSequence.charAt( codingCoord_from5 ); - currentCodon_5to3[2] = parsedTranscriptRod.cdsSequence.charAt( codingCoord_from5 + 1); - } - - //check for bad bases - if(!BaseUtils.isRegularBase(currentCodon_5to3[0]) || !BaseUtils.isRegularBase(currentCodon_5to3[1]) || !BaseUtils.isRegularBase(currentCodon_5to3[2])) { - logger.debug("Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() +". CDS codon contains irregular base:" + currentCodon_5to3[0] + currentCodon_5to3[1] + currentCodon_5to3[2]);// +". Transcript is: " + parsedTranscriptRod); - ++skippedPositionsCounter; - continue; - } - - } - - char haplotypeReference = parsedTranscriptRod.txSequence.charAt( txOffset_from5 - 1 ); - if(!positiveStrand) { - haplotypeReference = BaseUtils.simpleComplement(haplotypeReference); //txSequence contents depend on whether its +/- strand - } - char haplotypeReferenceStrandSpecific= positiveStrand ? haplotypeReference : BaseUtils.simpleComplement(haplotypeReference); - - - - if(!BaseUtils.isRegularBase(haplotypeReference) && haplotypeReference != '*') { //* is special case for mitochondrial genes where polyA tail completes the last codon - //check for bad bases - logger.debug("Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() + ". The reference contains an irregular base:" + haplotypeReference); // +". Transcript is: " + parsedTranscriptRod); - ++skippedPositionsCounter; - continue; - } - - - char haplotypeAlternateStrandSpecific; - for(char haplotypeAlternate : ALLELES ) - { - haplotypeAlternateStrandSpecific= positiveStrand ? haplotypeAlternate : BaseUtils.simpleComplement(haplotypeAlternate); - outputLineFields.clear(); - - if(!isProteinCodingTranscript || isWithinIntronAndFarFromSpliceJunction) { - haplotypeReference = '*'; - haplotypeAlternate = '*'; - } - - //compute simple OUTPUT fields. - outputLineFields.put(GenomicAnnotation.CHR_COLUMN, parsedTranscriptRod.txChrom); - outputLineFields.put(GenomicAnnotation.START_COLUMN, String.valueOf(txCoord_5to3)); - outputLineFields.put(GenomicAnnotation.END_COLUMN, String.valueOf(txCoord_5to3)); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( haplotypeReference ) ); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN, Character.toString( haplotypeAlternate ) ); - for(int i = 0; i < GENE_NAME_COLUMNS.length; i++) { - outputLineFields.put(GENE_NAME_COLUMNS[i], parsedTranscriptRod.geneNames[i] ); - } - - outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() ); - outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? "+" : "-" ); - if(isWithinExon) { - outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(mrnaCoord_from5) ); - } - outputLineFields.put(OUTPUT_SPLICE_DISTANCE, Integer.toString(distanceToNearestSpliceSite) ); - - //compute OUTPUT_SPLICE_INFO - final String spliceInfoString; - if(isWithin10bpOfSpliceJunction) { - if(distanceToNearestSpliceSite < 0) { - //is on the 5' side of the splice junction - if(isWithinExon) { - spliceInfoString = "splice-donor_" + distanceToNearestSpliceSite; - } else { - spliceInfoString = "splice-acceptor_" + distanceToNearestSpliceSite; - } - } else { - if(isWithinExon) { - spliceInfoString = "splice-acceptor_" + distanceToNearestSpliceSite; - } else { - spliceInfoString = "splice-donor_" + distanceToNearestSpliceSite; - } - } - outputLineFields.put(OUTPUT_SPLICE_INFO, spliceInfoString); - } - - //compute OUTPUT_IN_CODING_REGION - if(isProteinCodingTranscript) - { - outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) ); - } - - - //compute OUTPUT_UORF_CHANGE - if(positionType == PositionType.utr5) - { - String refCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + utr5NucBuffer_5to3[2]).toUpperCase(); - String refCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(utr5NucBuffer_5to3[2]) + utr5NucBuffer_5to3[3]).toUpperCase(); - String refCodon3 = (Character.toString(utr5NucBuffer_5to3[2]) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toUpperCase(); - - String varCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + haplotypeAlternateStrandSpecific).toUpperCase(); - String varCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(haplotypeAlternateStrandSpecific) + utr5NucBuffer_5to3[3]).toUpperCase(); - String varCodon3 = (Character.toString(haplotypeAlternateStrandSpecific) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toUpperCase(); - - //check for +1 (eg. addition of new ATG uORF) and -1 (eg. disruption of existing ATG uORF) - String uORFChangeStr = null; - if( (refCodon1.equals("ATG") && !varCodon1.equals("ATG")) || - (refCodon2.equals("ATG") && !varCodon2.equals("ATG")) || - (refCodon3.equals("ATG") && !varCodon3.equals("ATG"))) - { - uORFChangeStr = "-1"; - } - else if((varCodon1.equals("ATG") && !refCodon1.equals("ATG")) || - (varCodon2.equals("ATG") && !refCodon2.equals("ATG")) || - (varCodon3.equals("ATG") && !refCodon3.equals("ATG"))) - { - uORFChangeStr = "+1"; - } - - outputLineFields.put(OUTPUT_UORF_CHANGE, uORFChangeStr ); - } - //compute CDS-specific fields - else if (positionType == PositionType.CDS) { - final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; - final char temp = currentCodon_5to3[frame]; - currentCodon_5to3[frame] = haplotypeAlternateStrandSpecific; - final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; - currentCodon_5to3[frame] = temp; - - final AminoAcid refAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA(referenceCodon, codonCount_from5 == 1) : AminoAcidTable.getEukaryoticAA( referenceCodon ) ; - final AminoAcid variantAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA(variantCodon, codonCount_from5 == 1) : AminoAcidTable.getEukaryoticAA( variantCodon ) ; - - if (refAA.isUnknown() || variantAA.isUnknown()) { - logger.warn("Illegal amino acid detected: refCodon=" + referenceCodon + " altCodon=" + variantCodon); - } - outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? "+" : "-" ); - outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame)); - outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonCount_from5)); - outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon); - outputLineFields.put(OUTPUT_REFERENCE_AA, refAA.getCode()); - - outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon); - outputLineFields.put(OUTPUT_VARIANT_AA, variantAA.getCode()); - - outputLineFields.put(OUTPUT_PROTEIN_COORD_STR, "p." + refAA.getLetter() + Integer.toString(codonCount_from5) + variantAA.getLetter()); //for example: "p.K7$ - - boolean changesAA = !refAA.equals(variantAA); - outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(changesAA)); - final String functionalClass; - if (changesAA) { - if (variantAA.isStop()) { - functionalClass = "nonsense"; - } else if (refAA.isStop()) { - functionalClass = "readthrough"; - } else { - functionalClass = "missense"; - } - } else { - functionalClass = "silent"; - } - outputLineFields.put(OUTPUT_FUNCTIONAL_CLASS, functionalClass); - } - - //compute OUTPUT_CODING_COORD_STR - if(isProteinCodingTranscript) - { - //compute coding coord - final StringBuilder codingCoordStr = new StringBuilder(); - codingCoordStr.append( "c." ); - if(positionType == PositionType.utr3) { - codingCoordStr.append( '*' ); - } - - if(isWithinExon) { - codingCoordStr.append( Integer.toString(codingCoord_from5) ); - - codingCoordStr.append ( haplotypeReferenceStrandSpecific + ">" + haplotypeAlternateStrandSpecific); - } else { - //intronic coordinates - if(distanceToNearestSpliceSite < 0) { - codingCoordStr.append( Integer.toString(codingCoord_from5 + 1) ); - } else { - codingCoordStr.append( Integer.toString(codingCoord_from5 ) ); - codingCoordStr.append( "+" ); - } - - codingCoordStr.append( Integer.toString( distanceToNearestSpliceSite ) ); - } - - outputLineFields.put(OUTPUT_CODING_COORD_STR, codingCoordStr.toString()); - } - - - //generate the output line and add it to 'result' map. - if ( !isWithinIntronAndFarFromSpliceJunction ) - addThisLineToResult(outputLineFields); - - if( haplotypeAlternate == '*' ) { - //need only one record for this position with "*" for haplotypeAlternate, instead of the 4 individual alleles - break; - } - - } //ALLELE for-loop - } - finally - { - //increment coords - txOffset_from5++; - if(isWithinExon) { - mrnaCoord_from5++; - } - - if(positionType == PositionType.utr5) { - utr5Count_from5++; - } else if(positionType == PositionType.CDS) { - frame = (frame + 1) % 3; - if(frame == 0) { - codonCount_from5++; - } - } - } - } // l for-loop - - } //method close - - - /** - * Utility method. Creates a line containing the outputLineFields, and adds it to result, hashed by the sortKey. - * - * @param outputLineFields Column-name to value pairs. - */ - private void addThisLineToResult(final Map outputLineFields) { - final StringBuilder outputLine = new StringBuilder(); - for( final String column : outputColumnNames ) { - if(outputLine.length() != 0) { - outputLine.append( AnnotatorInputTableCodec.DELIMITER ); - } - final String value = outputLineFields.get(column); - if(value != null) { - outputLine.append(value); - } - } - - out.println(outputLine.toString()); - } - - public Integer reduce(Integer value, Integer sum) { return sum + value; } - - public void onTraversalDone(Integer result) { - logger.info("Skipped " + skippedPositionsCounter + " in-transcript genomic positions out of "+ totalPositionsCounter + " total (" + ( totalPositionsCounter == 0 ? 0 : (100*skippedPositionsCounter)/totalPositionsCounter) + "%)"); - logger.info("Skipped " + skippedTranscriptCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*skippedTranscriptCounter)/transcriptsProcessedCounter) + "%)"); - logger.info("Protein-coding transcripts (eg. with a CDS region) that don't start with Methionine or end in a stop codon: " + transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter)/transcriptsProcessedCounter) + "%)"); - logger.info("Protein-coding transcripts (eg. with a CDS region) that don't start with Methionine: " + transcriptsThatDontStartWithMethionineCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*transcriptsThatDontStartWithMethionineCounter)/transcriptsProcessedCounter) + "%)"); - logger.info("Protein-coding transcripts (eg. with a CDS region) that don't end in a stop codon: " + transcriptsThatDontEndWithStopCodonCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*transcriptsThatDontEndWithStopCodonCounter)/transcriptsProcessedCounter) + "%)"); - } - - - /** - * Container for all data fields from a single row of the transcript table. - */ - protected static class TranscriptTableRecord - { - public static final String STRAND_COLUMN = "strand"; //eg. + - public static final String CDS_START_COLUMN = "cdsStart"; - public static final String CDS_END_COLUMN = "cdsEnd"; - public static final String EXON_COUNT_COLUMN = "exonCount"; - public static final String EXON_STARTS_COLUMN = "exonStarts"; - public static final String EXON_ENDS_COLUMN = "exonEnds"; - //public static final String EXON_FRAMES_COLUMN = "exonFrames"; - - - /** - * This StringBuffer accumulates the entire transcript sequence. - * This buffer is used instead of using the GATK window mechanism - * because arbitrary-length look-aheads and look-behinds are needed to deal - * with codons that span splice-junctions in + & - strand transcripts. - * The window mechanism requires hard-coding the window size, which would - * translate into a limit on maximum supported intron size. To avoid this, the - * sequence is accumulated as the transcript is scanned left-to-right. - * Then, all calculations are performed at the end. - */ - public StringBuilder txSequence; //the sequence of the entire transcript in order from 5' to 3' - public StringBuilder utr5Sequence; //the protein coding sequence (with introns removed) in order from 5' to 3' - public StringBuilder cdsSequence; //the protein coding sequence (with introns removed) in order from 5' to 3' - - public boolean positiveStrand; //whether the transcript is on the + or the - strand. - public String[] geneNames; //eg. NM_021649 - - public String txChrom; //The chromosome name - public int txStart; - public int txEnd; - - public int cdsStart; - public int cdsEnd; - - public int[] exonStarts; - public int[] exonEnds; - //public int[] exonFrames; - not used for anything, frame is computed another way - - /** - * Constructor. - * - * @param transcriptRod A rod representing a single record in the transcript table. - * @param geneNameColumns name columns. - */ - public TranscriptTableRecord(final AnnotatorInputTableFeature transcriptRod, String[] geneNameColumns) { - - //String binStr = transcriptRod.get("bin"); - //String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated ) - String strandStr = transcriptRod.getColumnValue(STRAND_COLUMN); - if(strandStr == null) { - throw new IllegalArgumentException("Transcript table record doesn't contain a 'strand' column. Make sure the transcripts input file has a header and the usual columns: \"" + strandStr + "\""); - } else if(strandStr.equals("+")) { - positiveStrand = true; - } else if(strandStr.equals("-")) { - positiveStrand = false; - } else { - throw new IllegalArgumentException("Transcript table record contains unexpected value for 'strand' column: \"" + strandStr + "\""); - } - - geneNames = new String[geneNameColumns.length]; - for(int i = 0; i < geneNameColumns.length; i++) { - geneNames[i] = transcriptRod.getColumnValue(geneNameColumns[i]); - } - - //String txStartStr = transcriptRod.get(TXSTART_COLUMN); //These fields were used to generate column 1 of the ROD file (eg. they got turned into chr:txStart-txStop) - //String txEndStr = transcriptRod.get(TXEND_COLUMN); - txChrom = transcriptRod.getChr(); - txStart = transcriptRod.getStart(); - txEnd = transcriptRod.getEnd(); - - String cdsStartStr = transcriptRod.getColumnValue(CDS_START_COLUMN); - String cdsEndStr = transcriptRod.getColumnValue(CDS_END_COLUMN); - - cdsStart = Integer.parseInt(cdsStartStr); - cdsEnd = Integer.parseInt(cdsEndStr); - - txSequence = new StringBuilder( (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3' - if(isProteinCodingTranscript()) { - utr5Sequence = new StringBuilder( positiveStrand ? (cdsStart - txStart + 1) : (txEnd - cdsEnd + 1) ); //TODO reduce init size by size of introns - cdsSequence = new StringBuilder( (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns - } - - String exonCountStr = transcriptRod.getColumnValue(EXON_COUNT_COLUMN); - String exonStartsStr = transcriptRod.getColumnValue(EXON_STARTS_COLUMN); - String exonEndsStr = transcriptRod.getColumnValue(EXON_ENDS_COLUMN); - //String exonFramesStr = transcriptRod.get(EXON_FRAMES_COLUMN); - - String[] exonStartStrs = exonStartsStr.split(","); - String[] exonEndStrs = exonEndsStr.split(","); - //String[] exonFrameStrs = exonFramesStr.split(","); - - int exonCount = Integer.parseInt(exonCountStr); - if(exonCount != exonStartStrs.length || exonCount != exonEndStrs.length /* || exonCount != exonFrameStrs.length */) - { - throw new RuntimeException("exonCount != exonStarts.length || exonCount != exonEnds.length || exonCount != exonFrames.length. Exon starts: " + exonStartsStr + ", Exon ends: " + exonEndsStr + /*", Exon frames: " + exonFramesStr + */", Exon count: " + exonCountStr +". transcriptRod = " + transcriptRod); - } - - exonStarts = new int[exonCount]; - exonEnds = new int[exonCount]; - //exonFrames = new int[exonCount]; - for(int i = 0; i < exonCount; i++) { - exonStarts[i] = Integer.parseInt(exonStartStrs[i]); - exonEnds[i] = Integer.parseInt(exonEndStrs[i]); - //exonFrames[i] = Integer.parseInt(exonFrameStrs[i]); - } - } - - - /** - * Takes a genomic position on the same contig as the transcript, and - * returns true if this position falls within an exon. - */ - public boolean isWithinExon(final int genomPosition) { - for(int i = 0; i < exonStarts.length; i++) { - final int curStart = exonStarts[i]; - if(genomPosition < curStart) { - return false; - } - final int curStop = exonEnds[i]; - if(genomPosition <= curStop) { - return true; - } - } - - return false; - } - - /** - * Computes the distance to the nearest splice-site. - * The returned value is negative its on the 5' side (eg. upstream) of the juntion, and - * positive if its on the 3' side. - */ - public int computeDistanceToNearestSpliceSite(final int genomPosition) { - int prevDistance = Integer.MAX_VALUE; - for(int i = 0; i < exonStarts.length; i++) { - final int curStart = exonStarts[i]; - int curDistance = curStart - genomPosition; - if(genomPosition < curStart) { - //position is within the current intron - if(prevDistance < curDistance) { - return positiveStrand ? prevDistance : -prevDistance; - } else { - return positiveStrand ? -curDistance : curDistance; - } - } else { - prevDistance = genomPosition - curStart + 1; - } - - final int curStop = exonEnds[i]; - curDistance = curStop - genomPosition + 1; - if(genomPosition <= curStop) { - //position is within an exon - if(prevDistance < curDistance) { - return positiveStrand ? prevDistance : -prevDistance; - } else { - return positiveStrand ? -curDistance : curDistance; - } - } else { - prevDistance = genomPosition - curStop; - } - } - - throw new IllegalArgumentException("Genomic position: [" + genomPosition +"] not found within transcript: " + this +". " + - "This method should not have been called for this position. NOTE: this method assumes that all transcripts start " + - "with an exon and end with an exon (rather than an intron). Is this wrong?"); - //return prevDistance; //out of exons. return genomPosition-curStop - } - - - /** - * Returns true if this is a coding transcript (eg. is translated - * into proteins). Returns false for non-coding RNA. - */ - public boolean isProteinCodingTranscript() { - return cdsStart < cdsEnd; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("chrpos=" + txChrom + ':' + txStart + '-' + txEnd + ", strand=" + (positiveStrand ? '+':'-') + ", gene-names=" + Arrays.toString(geneNames) + ", cds="+ cdsStart + '-' + cdsEnd + ", exonStarts=" + Arrays.toString(exonStarts) + ", exonEnds=" + Arrays.toString(exonEnds)); - return sb.toString(); - } - - - - /** - * Computes the coding coord of the 1st nucleotide in the transcript. - * If the 1st nucleotide is in the 5'utr, the returned value will be negative. - * Otherwise (if the 1st nucleotide is CDS), the returned value is 1. - */ - public int computeInitialCodingCoord() { - if(!isProteinCodingTranscript()) { - throw new ReviewedStingException("This method should only be called for protein-coding transcripts"); - } - - if(positiveStrand) - { - if( cdsStart == exonStarts[0] ) { - //the 1st nucleotide of the transcript is CDS. - return 1; - } - - int result = 0; - for(int i = 0; i < exonStarts.length; i++) - { - final int exonStart = exonStarts[i]; - final int exonEnd = exonEnds[i]; - if(cdsStart <= exonEnd) { //eg. exonEnd is now on the 3' side of cdsStart - //this means cdsStart is within the current exon - result += (cdsStart - exonStart) + 1; - break; - } else { - //cdsStart is downstream of the current exon - result += (exonEnd - exonStart) + 1; - } - } - return -result; //negate because 5' UTR coding coord is negative - } - else //(negative strand) - { - final int cdsStart_5prime = cdsEnd; - if(cdsStart_5prime == exonEnds[exonEnds.length - 1]) { - //the 1st nucleotide of the transcript is CDS. - return 1; - } - - int result = 0; - for(int i = exonEnds.length - 1; i >= 0; i--) - { - final int exonStart = exonEnds[i]; //when its the negative strand, the 5' coord of the 1st exon is exonEnds[i] - final int exonEnd = exonStarts[i]; - if( exonEnd <= cdsStart_5prime ) { //eg. exonEnd is now on the 3' side of cdsStart - //this means cdsStart is within the current exon - result += -(cdsStart_5prime - exonStart) + 1; - break; - } else { - //cdsStart is downstream of the current exon - result += -(exonEnd - exonStart) + 1; - } - } - return -result; //negate because 5' UTR coding coord is negative - } - } - } - - -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index a10897172..3fcf87bd5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -255,7 +255,7 @@ public class UnifiedGenotyperEngine { pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); - vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc).iterator().next(); + vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc); } return new VariantCallContext(vc, ref.getBase(), false); @@ -436,8 +436,7 @@ public class UnifiedGenotyperEngine { pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); - Collection variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); - vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element. + vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); } VariantCallContext call = new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java deleted file mode 100755 index 9aa370d3f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java +++ /dev/null @@ -1,890 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcid; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcidTable; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.util.*; - -import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - - -/** - * Walks along all variant ROD loci, and dynamically annotates alleles at MNP records. - */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = AnnotateMNPsWalker.REFSEQ_ROD_NAME, type = AnnotatorInputTableFeature.class), @RMD(name = AnnotateMNPsWalker.VARIANT_ROD_NAME, type = ReferenceOrderedDatum.class)}) - -public class AnnotateMNPsWalker extends RodWalker { - - @Output(doc = "File to which variants should be written", required = true) - protected VCFWriter writer = null; - private ManualSortingVCFWriter sortingWriter = null; - - @Argument(fullName = "emitOnlyMNPs", shortName = "emitOnlyMNPs", doc = "Only output MNP records; [default:false]", required = false) - protected boolean emitOnlyMNPs = false; - - private LinkedList rodNames = null; - private GenomeLocParser locParser = null; - private TreeMap> MNPstartToStops = null; // Must be TreeMap sorted by START sites! - - public final static String REFSEQ_ROD_NAME = "refseq"; - public final static String VARIANT_ROD_NAME = "variant"; - - private LocusToFeatures locusToRefSeqFeatures = null; - - - protected final static String MNP_ANNOTATION_KEY_PREFIX = "MNP.refseq."; - - protected final static String REFSEQ_NAME = "name"; - protected final static String REFSEQ_NAME2 = "name2"; - - protected final static String REFSEQ_POSITION_TYPE = "positionType"; - protected final static String REFSEQ_CDS = "CDS"; - - protected final static String REFSEQ_STRAND = "transcriptStrand"; - protected final static String REFSEQ_POS_STRAND = "+"; - protected final static String REFSEQ_NEG_STRAND = "-"; - - protected final static String REFSEQ_CODON_COORD = "codonCoord"; - protected final static String REFSEQ_CODING_FRAME = "frame"; - - protected final static String REFSEQ_REF_CODON = "referenceCodon"; - protected final static String REFSEQ_REF_AA = "referenceAA"; - - protected final static String REFSEQ_ALT_BASE = "haplotypeAlternate"; - - protected final static String REFSEQ_VARIANT_CODON = "variantCodon"; - protected final static String REFSEQ_VARIANT_AA = "variantAA"; - protected final static String REFSEQ_CHANGES_AA = "changesAA"; - protected final static String REFSEQ_FUNCTIONAL_CLASS = "functionalClass"; - protected final static String REFSEQ_PROTEIN_COORD_DESCRIPTION = "proteinCoordStr"; - - protected final static String REFSEQ_CODING_ANNOTATIONS = "codingVariants"; - protected final static String REFSEQ_NUM_AA_CHANGES = "numAAchanges"; - protected final static String REFSEQ_HAS_MULT_AA_CHANGES = "alleleHasMultAAchanges"; - - public void initialize() { - rodNames = new LinkedList(); - rodNames.add(VARIANT_ROD_NAME); - - locParser = getToolkit().getGenomeLocParser(); - MNPstartToStops = new TreeMap>(); // sorted by start sites - - initializeVcfWriter(); - - locusToRefSeqFeatures = new LocusToFeatures(); - } - - private void initializeVcfWriter() { - sortingWriter = new ManualSortingVCFWriter(writer); - writer = sortingWriter; - - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames); - writer.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples()))); - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * For each site of interest, annotate it if it's a MNP. - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return count of MNPs observed - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - int numMNPsObserved = 0; - GenomeLoc curLocus = ref.getLocus(); - clearOldLocusFeatures(curLocus); - - boolean requireStartHere = false; // see EVERY site of the MNP - boolean takeFirstOnly = false; // take as many entries as the VCF file has - for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { - GenomeLoc vcLoc = VariantContextUtils.getLocation(locParser, vc); - boolean atStartOfVc = curLocus.getStart() == vcLoc.getStart(); - boolean atEndOfVc = curLocus.getStart() == vcLoc.getStop(); - - if (vc.isMNP()) { - logger.debug("Observed MNP at " + vcLoc); - - if (isChrM(vc)) { - if (atStartOfVc) { - logger.warn("Skipping mitochondrial MNP at " + vcLoc + " due to complexity of coding table [need to know if first codon, etc.]..."); - writeVCF(vc); - } - continue; - } - - GenomeLoc stopLoc = locParser.createGenomeLoc(curLocus.getContig(), vcLoc.getStop()); - final List refSeqRODs = tracker.getReferenceMetaData(REFSEQ_ROD_NAME); - for (Object refSeqObject : refSeqRODs) { - AnnotatorInputTableFeature refSeqAnnotation = (AnnotatorInputTableFeature) refSeqObject; - locusToRefSeqFeatures.putLocusFeatures(curLocus, refSeqAnnotation, stopLoc); - } - - if (atStartOfVc) { // MNP is starting here, so register that we're waiting for it - Set stopLocs = MNPstartToStops.get(curLocus); - if (stopLocs == null) { - stopLocs = new HashSet(); - MNPstartToStops.put(curLocus, stopLocs); - } - stopLocs.add(stopLoc); - } - - if (atEndOfVc) { - numMNPsObserved++; // only count a MNP at its stop site - logger.debug("Observed end of MNP at " + curLocus); - logger.debug("Current list of per-locus features\n" + locusToRefSeqFeatures); - - Map MNPannotations = annotateMNP(vc); - MNPannotations.putAll(RefSeqDataParser.removeRefSeqAttributes(vc.getAttributes())); // remove any RefSeq INFO, since adding it in more thoroughly here - vc = VariantContext.modifyAttributes(vc, MNPannotations); - writeVCF(vc); - - GenomeLoc startLoc = locParser.createGenomeLoc(curLocus.getContig(), vcLoc.getStart()); - Set stopLocs = MNPstartToStops.get(startLoc); - if (stopLocs != null) { // otherwise, just removed stopLocs due to another MNP that has the same (start, stop) - stopLocs.remove(stopLoc); - if (stopLocs.isEmpty()) // no longer waiting for startLoc - MNPstartToStops.remove(startLoc); - } - } - } - else if (atStartOfVc && !emitOnlyMNPs) {// only want to write other VariantContexts records once (where they start): - writeVCF(vc); - } - } - - Integer mostUpstreamWritableLoc = null; - if (!MNPstartToStops.isEmpty()) { - GenomeLoc waitingForLoc = MNPstartToStops.entrySet().iterator().next().getKey(); - mostUpstreamWritableLoc = waitingForLoc.getStart() - 1; - } - sortingWriter.setmostUpstreamWritableLocus(mostUpstreamWritableLoc); - - return numMNPsObserved; - } - - private static boolean isChrM(final VariantContext vc) { - return vc.getChr().equals("chrM") || vc.getChr().equals("MT"); - } - - private Map annotateMNP(VariantContext vc) { - Map annotations = new HashMap(); - - RefSeqNameToFeatures nameToPositionalFeatures = new RefSeqNameToFeatures(vc); - MNPannotationKeyBuilder kb = new MNPannotationKeyBuilder(nameToPositionalFeatures); - - for (Map.Entry nameToFeatureEntry : nameToPositionalFeatures.entrySet()) { - String featureName = nameToFeatureEntry.getKey(); - RefSeqFeatureList feature = nameToFeatureEntry.getValue(); - CodonAnnotationsForAltAlleles codonAnnotationsForAlleles = new CodonAnnotationsForAltAlleles(vc, feature); - - annotations.put(kb.getKey(REFSEQ_CODING_ANNOTATIONS), codonAnnotationsForAlleles.getCodonAnnotationsString()); - annotations.put(kb.getKey(REFSEQ_NUM_AA_CHANGES), codonAnnotationsForAlleles.getNumAAchangesString()); - annotations.put(kb.getKey(REFSEQ_HAS_MULT_AA_CHANGES), codonAnnotationsForAlleles.hasAlleleWithMultipleAAchanges); - annotations.put(kb.getKey(REFSEQ_NAME), featureName); - annotations.put(kb.getKey(REFSEQ_NAME2), feature.name2); - annotations.put(kb.getKey(REFSEQ_POSITION_TYPE), REFSEQ_CDS); - annotations.put(kb.getKey(REFSEQ_STRAND), (feature.positiveStrand ? REFSEQ_POS_STRAND : REFSEQ_NEG_STRAND)); - annotations.put(kb.getKey(REFSEQ_CODON_COORD), feature.getCodonCoordString()); - - kb.incrementFeatureIndex(); - } - - return annotations; - } - - private static class MNPannotationKeyBuilder { - private int featureIndex; - private boolean multipleEntries; - - public MNPannotationKeyBuilder(RefSeqNameToFeatures nameToPositionalFeatures) { - this.featureIndex = 1; - this.multipleEntries = nameToPositionalFeatures.nameToFeatures.size() > 1; - } - - public void incrementFeatureIndex() { - featureIndex++; - } - - public String getKey(String type) { - String annotationKey = MNP_ANNOTATION_KEY_PREFIX + type; - if (multipleEntries) - annotationKey += "_" + featureIndex; - return annotationKey; - } - } - - private static byte[] ByteArrayToPrimitive(Byte[] nonNullArray) { - byte[] primArray = new byte[nonNullArray.length]; - - for (int i = 0; i < nonNullArray.length; i++) { - if (nonNullArray[i] == null) - throw new ReviewedStingException("nonNullArray[i] == null"); - primArray[i] = nonNullArray[i]; - } - - return primArray; - } - - private void clearOldLocusFeatures(GenomeLoc curLoc) { - Iterator> locusFeaturesIt = locusToRefSeqFeatures.entrySet().iterator(); - while (locusFeaturesIt.hasNext()) { - Map.Entry locusFeaturesEntry = locusFeaturesIt.next(); - if (curLoc.isPast(locusFeaturesEntry.getValue().getFurthestLocusUsingFeatures())) - locusFeaturesIt.remove(); - } - } - - public Integer reduce(Integer count, Integer total) { - if (count != null) - total = total + count; - - return total; - } - - /** - * @param result the number of MNPs processed. - */ - public void onTraversalDone(Integer result) { - System.out.println("Number of MNPs observed: " + result); - writer.close(); - } - - private void writeVCF(VariantContext vc) { - WriteVCF.writeVCF(vc, writer, logger); - } - - /* - Inner classes: - */ - - // Maps: RefSeq entry name -> features for ALL positions of a particular VariantContext MNP: - - private class RefSeqNameToFeatures { - private Map nameToFeatures; - - public RefSeqNameToFeatures(VariantContext vc) { - this.nameToFeatures = new HashMap(); - - int MNPstart = vc.getStart(); - int MNPstop = vc.getEnd(); - int MNPlength = MNPstop - MNPstart + 1; - - for (int i = 0; i < MNPlength; i++) { - int genomicPosition = MNPstart + i; - GenomeLoc posLoc = locParser.createGenomeLoc(vc.getChr(), genomicPosition); - - PositionalRefSeqFeatures locFeatures = locusToRefSeqFeatures.getLocusFeatures(posLoc); - if (locFeatures == null) // no features for posLoc - continue; - - for (Map.Entry nameToFeatureEntry : locFeatures.entrySet()) { - String name = nameToFeatureEntry.getKey(); - PositionalRefSeqFeature posFeature = nameToFeatureEntry.getValue(); - - RefSeqFeatureList featureList = nameToFeatures.get(name); - if (featureList == null) { - featureList = new RefSeqFeatureList(MNPlength); - nameToFeatures.put(name, featureList); - } - featureList.updateFeatureAtPosition(i, posFeature); - } - } - } - - public Set> entrySet() { - return nameToFeatures.entrySet(); - } - } - - // For a particular RefSeq entry, contains the features for ALL positions of a particular VariantContext MNP - - private static class RefSeqFeatureList { - private final static String CODON_FRAME_START = "("; - private final static String CODON_FRAME_END = ")"; - private final static String CODON_DELIM = "|"; - - private CodingRefSeqFeature[] refSeqFeatures; - private String name2; - private Boolean positiveStrand; - - private Map> codonToIndices; // Map of: codon index -> MNP indices that refer to codon - - public RefSeqFeatureList(int MNPlength) { - this.refSeqFeatures = new CodingRefSeqFeature[MNPlength]; - for (int i = 0; i < MNPlength; i++) - this.refSeqFeatures[i] = null; - - this.name2 = null; - this.positiveStrand = null; - this.codonToIndices = new TreeMap>(); - } - - public void updateFeatureAtPosition(int index, PositionalRefSeqFeature feature) { - if (name2 == null) { - name2 = feature.name2; - positiveStrand = feature.positiveStrand; - } - else if (!name2.equals(feature.name2) || positiveStrand != feature.positiveStrand) { - throw new UserException("Inconsistency between previous RefSeq entry and: " + feature); - } - - CodingRefSeqFeature crsf = new CodingRefSeqFeature(feature); - refSeqFeatures[index] = crsf; - - List indicesWithCodon = codonToIndices.get(crsf.codonCoord); - if (indicesWithCodon == null) { - indicesWithCodon = new LinkedList(); - codonToIndices.put(crsf.codonCoord, indicesWithCodon); - } - indicesWithCodon.add(index); - } - - public Set>> codonIndicesEntrySet() { - return codonToIndices.entrySet(); - } - - public String getCodonCoordString() { - StringBuilder sb = new StringBuilder(); - - for (int i = 0; i < refSeqFeatures.length; i++) { - CodingRefSeqFeature crsf = refSeqFeatures[i]; - if (crsf != null) - sb.append(crsf.codonCoord).append(CODON_FRAME_START).append(crsf.codingFrame).append(CODON_FRAME_END); - if (i < refSeqFeatures.length - 1) - sb.append(CODON_DELIM); - } - - return sb.toString(); - } - } - - private static class CodingRefSeqFeature { - protected int codonCoord; - protected int codingFrame; - protected String referenceCodon; - protected String referenceAA; - - public CodingRefSeqFeature(PositionalRefSeqFeature feature) { - this.codonCoord = feature.codonCoord; - this.codingFrame = feature.codingFrame; - this.referenceCodon = feature.referenceCodon.toUpperCase(); - this.referenceAA = feature.referenceAA; - } - } - - private static class CodonAnnotationsForAltAlleles { - protected final static int MIN_CODON_INDEX = 0; - protected final static int NUM_CODON_INDICES = 3; - private final static String CODON_ANNOTATION_DELIM = ","; - - private List alleleAnnotations; - private int[] alleleToNumAAchanges; - private boolean hasAlleleWithMultipleAAchanges; - - public CodonAnnotationsForAltAlleles(VariantContext vc, RefSeqFeatureList feature) { - this.alleleAnnotations = new LinkedList(); - - Set altAlleles = vc.getAlternateAlleles(); - int numAltAlleles = altAlleles.size(); - this.alleleToNumAAchanges = new int[numAltAlleles]; - for (int i = 0; i < numAltAlleles; i++) - this.alleleToNumAAchanges[i] = 0; - - int MNPstart = vc.getStart(); - int MNPstop = vc.getEnd(); - int MNPlength = MNPstop - MNPstart + 1; - - for (Map.Entry> codonToIndicesEntry : feature.codonIndicesEntrySet()) { - int codonIndex = codonToIndicesEntry.getKey(); - List indices = codonToIndicesEntry.getValue(); - if (indices.isEmpty()) - throw new ReviewedStingException("indices should not exist if it's empty!"); - - for (int index : indices) { - int frame = feature.refSeqFeatures[index].codingFrame; - if (feature.refSeqFeatures[index].codonCoord != codonIndex) - throw new ReviewedStingException("LOGICAL ERROR: feature.refSeqFeatures[index].codonCoord != codonIndex"); - if (frame < MIN_CODON_INDEX || frame >= NUM_CODON_INDICES) - throw new UserException("RefSeq codon frame not one of {0,1,2}"); - } - CodingRefSeqFeature firstFeatureForCodon = feature.refSeqFeatures[indices.get(0)]; - String refCodon = firstFeatureForCodon.referenceCodon; - - SingleCodonAnnotationsForAlleles codonAnnotation = new SingleCodonAnnotationsForAlleles(codonIndex, altAlleles, MNPlength, refCodon, firstFeatureForCodon, indices, feature); - alleleAnnotations.add(codonAnnotation); - - // From a single codon, summarize the data for ALL alleles: - for (int i = 0; i < numAltAlleles; i++) { - if (codonAnnotation.annotationsForAlleles[i].codonFunc.changesAA) { - alleleToNumAAchanges[i]++; - if (alleleToNumAAchanges[i] > 1) - this.hasAlleleWithMultipleAAchanges = true; - } - } - } - } - - public String getCodonAnnotationsString() { - StringBuilder sb = new StringBuilder(); - - int index = 0; - for (SingleCodonAnnotationsForAlleles codonToAlleles : alleleAnnotations) { - sb.append(codonToAlleles); - if (index < alleleAnnotations.size() - 1) - sb.append(CODON_ANNOTATION_DELIM); - index++; - } - - return sb.toString(); - } - - public String getNumAAchangesString() { - StringBuilder sb = new StringBuilder(); - - for (int index = 0; index < alleleToNumAAchanges.length; index++) { - sb.append(alleleToNumAAchanges[index]); - if (index < alleleToNumAAchanges.length - 1) - sb.append(SingleCodonAnnotationsForAlleles.ALLELE_ANNOTATION_DELIM); - } - - return sb.toString(); - } - } - - private static class SingleCodonAnnotationsForAlleles { - private final static String CODON_MAP_SYMBOL = "->"; - private final static String CODON_ANNOTATION_START = "["; - private final static String CODON_ANNOTATION_END = "]"; - private final static String REF_CODON_INFO_DELIM = "|"; - private final static String ALLELE_ANNOTATION_DELIM = ","; - private final static String ASSIGNMENT = ":"; - - private int codonIndex; - private String refCodon; - private String refAA; - - private SingleCodonAnnotationsForAllele[] annotationsForAlleles; - - public SingleCodonAnnotationsForAlleles(int codonIndex, Collection altAlleles, int MNPlength, String refCodon, CodingRefSeqFeature firstFeatureForCodon, List indices, RefSeqFeatureList feature) { - if (refCodon.length() != CodonAnnotationsForAltAlleles.NUM_CODON_INDICES) - throw new UserException("RefSeq reference codon " + refCodon + " is not of length " + CodonAnnotationsForAltAlleles.NUM_CODON_INDICES); - - AminoAcid refAA = AminoAcidTable.getEukaryoticAA(refCodon); - if (!refAA.getCode().equals(firstFeatureForCodon.referenceAA)) - throw new UserException("RefSeq: translated reference codon= " + refAA + " != " + firstFeatureForCodon.referenceAA + " = reference AA"); - - this.codonIndex = codonIndex; - this.refCodon = refCodon; - this.refAA = refAA.getCode(); - this.annotationsForAlleles = new SingleCodonAnnotationsForAllele[altAlleles.size()]; - - int altInd = 0; - for (Allele altAllele : altAlleles) { - if (altAllele.length() != MNPlength) - throw new ReviewedStingException("length(altAllele) != length(MNP)"); - byte[] altBases = altAllele.getBases(); - - Byte[] variantCodonArr = new Byte[CodonAnnotationsForAltAlleles.NUM_CODON_INDICES]; - for (int i = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; i < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; i++) - variantCodonArr[i] = null; - - for (int index : indices) { - int frame = feature.refSeqFeatures[index].codingFrame; - if (variantCodonArr[frame] != null) - throw new UserException("RefSeq assigns codon " + codonIndex + " twice at same frame: " + frame); - - byte base = altBases[index]; - if (!feature.positiveStrand) // negative strand codon - base = BaseUtils.simpleComplement(base); - - variantCodonArr[frame] = base; - } - - /* For missing frames, there MUST exist AT LEAST one index that refers to this codon, - so use it to derive the missing bases [ALREADY complemented if on the negative strand]: - */ - for (int frame = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; frame < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; frame++) { - if (variantCodonArr[frame] == null) - variantCodonArr[frame] = (byte) refCodon.charAt(frame); - } - String variantCodon = new String(ByteArrayToPrimitive(variantCodonArr)).toUpperCase(); - - SingleCodonAnnotationsForAllele alleleAnnotation = new SingleCodonAnnotationsForAllele(variantCodon, refCodon, refAA, codonIndex); - annotationsForAlleles[altInd] = alleleAnnotation; - altInd++; - } - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append(codonIndex).append(CODON_MAP_SYMBOL).append(CODON_ANNOTATION_START); - sb.append(REFSEQ_REF_CODON).append(ASSIGNMENT).append(refCodon).append(REF_CODON_INFO_DELIM); - sb.append(REFSEQ_REF_AA).append(ASSIGNMENT).append(refAA).append(REF_CODON_INFO_DELIM); - - int index = 0; - for (SingleCodonAnnotationsForAllele annotation : annotationsForAlleles) { - sb.append(annotation); - if (index < annotationsForAlleles.length - 1) - sb.append(ALLELE_ANNOTATION_DELIM); - index++; - } - sb.append(CODON_ANNOTATION_END); - - return sb.toString(); - } - } - - private static class SingleCodonAnnotationsForAllele { - private final static String ALLELE_START = "{"; - private final static String ALLELE_END = "}"; - private final static String CODON_INFO_DELIM = "|"; - private final static String ASSIGNMENT = ":"; - private final static String MNP_DEPENDENT_AA = "MNPdependentAA"; - - private CodonFunction codonFunc; - private String proteinCoordStr; - private boolean MNPdependentAA; - private String originalAA; - - public SingleCodonAnnotationsForAllele(String variantCodon, String refCodon, AminoAcid refAA, int codonIndex) { - this.codonFunc = new CodonFunction(variantCodon, refCodon, refAA); - this.proteinCoordStr = "p." + refAA.getLetter() + codonIndex + codonFunc.variantAA.getLetter(); - - int refCodonLength = refCodon.length(); - if (codonFunc.variantCodon.length() != refCodonLength) - throw new ReviewedStingException("codonFunc.variantCodon.length() != refCodonLength, but ALREADY checked that they're both 3"); - - this.MNPdependentAA = true; - this.originalAA = "("; - for (int i = 0; i < refCodonLength; i++) { - // Take [0,i-1] and [i+1, end] from refCodon, and i from variantCodon: - String singleBaseChangeCodon = refCodon.substring(0, i) + variantCodon.substring(i, i+1) + refCodon.substring(i+1, refCodonLength); - CodonFunction singleBaseChangeCodonFunc = new CodonFunction(singleBaseChangeCodon, refCodon, refAA); - if (singleBaseChangeCodonFunc.variantAA.equals(codonFunc.variantAA)) { - this.MNPdependentAA = false; - this.originalAA = ""; - break; - } - - this.originalAA = this.originalAA + "" + singleBaseChangeCodonFunc.variantAA.getLetter(); - if (i < refCodonLength - 1) - this.originalAA = this.originalAA + ","; - } - - if (this.MNPdependentAA) - this.originalAA = this.originalAA + ")"; - } - - private static class CodonFunction { - private String variantCodon; - private AminoAcid variantAA; - private boolean changesAA; - private String functionalClass; - - public CodonFunction(String variantCodon, String refCodon, AminoAcid refAA) { - this.variantCodon = variantCodon; - this.variantAA = AminoAcidTable.getEukaryoticAA(this.variantCodon); - this.changesAA = !refAA.equals(variantAA); - - if (!this.variantCodon.equals(refCodon)) { - if (changesAA) { - if (variantAA.isStop()) { - functionalClass = "nonsense"; - } - else if (refAA.isStop()) { - functionalClass = "readthrough"; - } - else { - functionalClass = "missense"; - } - } - else { // the same aa: - functionalClass = "silent"; - } - } - else { // the same codon: - functionalClass = "no_change"; - } - } - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append(ALLELE_START); - sb.append(REFSEQ_VARIANT_CODON).append(ASSIGNMENT).append(codonFunc.variantCodon).append(CODON_INFO_DELIM); - sb.append(REFSEQ_VARIANT_AA).append(ASSIGNMENT).append(codonFunc.variantAA.getCode()).append(CODON_INFO_DELIM); - sb.append(REFSEQ_CHANGES_AA).append(ASSIGNMENT).append(codonFunc.changesAA).append(CODON_INFO_DELIM); - sb.append(REFSEQ_FUNCTIONAL_CLASS).append(ASSIGNMENT).append(codonFunc.functionalClass).append(CODON_INFO_DELIM); - sb.append(REFSEQ_PROTEIN_COORD_DESCRIPTION).append(ASSIGNMENT).append(proteinCoordStr).append(CODON_INFO_DELIM); - sb.append(MNP_DEPENDENT_AA).append(ASSIGNMENT).append(MNPdependentAA).append(originalAA); - sb.append(ALLELE_END); - - return sb.toString(); - } - } -} - - -// External classes: - -class LocusToFeatures { - private Map locusToFeatures; - - public LocusToFeatures() { - this.locusToFeatures = new TreeMap(); - } - - public PositionalRefSeqFeatures getLocusFeatures(GenomeLoc loc) { - return locusToFeatures.get(loc); - } - - public void putLocusFeatures(GenomeLoc loc, AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) { - PositionalRefSeqFeatures locFeatures = locusToFeatures.get(loc); - if (locFeatures == null) { - locFeatures = new PositionalRefSeqFeatures(locusUsingThis); - locusToFeatures.put(loc, locFeatures); - } - locFeatures.putFeature(refSeqAnnotation, locusUsingThis); - } - - public Set> entrySet() { - return locusToFeatures.entrySet(); - } - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - for (Map.Entry locFeatures : entrySet()) { - GenomeLoc loc = locFeatures.getKey(); - PositionalRefSeqFeatures features = locFeatures.getValue(); - sb.append("Locus: ").append(loc).append("\n").append(features); - } - - return sb.toString(); - } -} - -class PositionalRefSeqFeatures { - private final static String[] REQUIRE_COLUMNS = - {AnnotateMNPsWalker.REFSEQ_NAME, AnnotateMNPsWalker.REFSEQ_POSITION_TYPE}; - - private Map nameToFeature; - private GenomeLoc furthestLocusUsingFeatures; - - public PositionalRefSeqFeatures(GenomeLoc locusUsingThis) { - this.nameToFeature = new HashMap(); - this.furthestLocusUsingFeatures = locusUsingThis; - } - - public void putFeature(AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) { - for (String column : REQUIRE_COLUMNS) { - if (!refSeqAnnotation.containsColumnName(column)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column); - } - - if (locusUsingThis.isPast(furthestLocusUsingFeatures)) - furthestLocusUsingFeatures = locusUsingThis; - - String posType = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_POSITION_TYPE); - if (!posType.equals(AnnotateMNPsWalker.REFSEQ_CDS)) // only interested in coding sequence annotations - return; - - PositionalRefSeqFeature newLocusFeature = new PositionalRefSeqFeature(refSeqAnnotation); - - String refSeqName = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME); - PositionalRefSeqFeature locusFeature = nameToFeature.get(refSeqName); - if (locusFeature == null) { - locusFeature = newLocusFeature; - nameToFeature.put(refSeqName, locusFeature); - } - else if (!locusFeature.equals(newLocusFeature)) { - throw new UserException("Inconsistency between previous RefSeq entry and: " + refSeqAnnotation); - } - - locusFeature.updateFeature(refSeqAnnotation); - } - - public GenomeLoc getFurthestLocusUsingFeatures() { - return furthestLocusUsingFeatures; - } - - public Set> entrySet() { - return nameToFeature.entrySet(); - } - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - for (Map.Entry nameFeatureEntry : entrySet()) { - String name = nameFeatureEntry.getKey(); - PositionalRefSeqFeature feature = nameFeatureEntry.getValue(); - sb.append(name).append(" -> [").append(feature).append("]\n"); - } - - return sb.toString(); - } -} - -class PositionalRefSeqFeature { - private final static String[] REQUIRE_COLUMNS = - {AnnotateMNPsWalker.REFSEQ_NAME2, AnnotateMNPsWalker.REFSEQ_STRAND, - AnnotateMNPsWalker.REFSEQ_CODON_COORD, AnnotateMNPsWalker.REFSEQ_CODING_FRAME, - AnnotateMNPsWalker.REFSEQ_REF_CODON, AnnotateMNPsWalker.REFSEQ_REF_AA}; - - protected String name2; - protected boolean positiveStrand; - protected int codonCoord; - protected int codingFrame; - protected String referenceCodon; - protected String referenceAA; - - private Map baseToAnnotations; - - public PositionalRefSeqFeature(AnnotatorInputTableFeature refSeqAnnotation) { - for (String column : REQUIRE_COLUMNS) { - if (!refSeqAnnotation.containsColumnName(column)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column); - } - this.name2 = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME2); - this.positiveStrand = (refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_STRAND).equals(AnnotateMNPsWalker.REFSEQ_POS_STRAND)); - this.codonCoord = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODON_COORD)); - this.codingFrame = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODING_FRAME)); - this.referenceCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_CODON); - this.referenceAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_AA); - - this.baseToAnnotations = new HashMap(); - } - - public boolean equals(PositionalRefSeqFeature that) { - return this.name2.equals(that.name2) && this.positiveStrand == that.positiveStrand && this.codonCoord == that.codonCoord && this.codingFrame == that.codingFrame - && this.referenceCodon.equals(that.referenceCodon) && this.referenceAA.equals(that.referenceAA); - } - - public void updateFeature(AnnotatorInputTableFeature refSeqAnnotation) { - if (!refSeqAnnotation.containsColumnName(AnnotateMNPsWalker.REFSEQ_ALT_BASE)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + AnnotateMNPsWalker.REFSEQ_ALT_BASE); - String base = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_ALT_BASE); - - baseToAnnotations.put(base, new BaseAnnotations(refSeqAnnotation)); - } - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - sb.append("name2= ").append(name2); - sb.append(", positiveStrand= ").append(positiveStrand); - sb.append(", codonCoord= ").append(codonCoord); - sb.append(", codingFrame= ").append(codingFrame); - sb.append(", referenceCodon= ").append(referenceCodon); - sb.append(", referenceAA= ").append(referenceAA); - - sb.append(", baseAnnotations= {"); - for (Map.Entry baseToAnnotationsEntry : baseToAnnotations.entrySet()) { - String base = baseToAnnotationsEntry.getKey(); - BaseAnnotations annotations = baseToAnnotationsEntry.getValue(); - sb.append(" ").append(base).append(" -> {").append(annotations).append("}"); - } - sb.append(" }"); - - return sb.toString(); - } -} - -class BaseAnnotations { - private final static String[] REQUIRE_COLUMNS = - {AnnotateMNPsWalker.REFSEQ_VARIANT_CODON, AnnotateMNPsWalker.REFSEQ_VARIANT_AA, - AnnotateMNPsWalker.REFSEQ_CHANGES_AA, AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS, - AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION}; - - protected String variantCodon; - protected String variantAA; - protected boolean changesAA; - protected String functionalClass; - protected String proteinCoordStr; - - public BaseAnnotations(AnnotatorInputTableFeature refSeqAnnotation) { - for (String column : REQUIRE_COLUMNS) { - if (!refSeqAnnotation.containsColumnName(column)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column); - } - this.variantCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_CODON); - this.variantAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_AA); - this.changesAA = Boolean.parseBoolean(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CHANGES_AA)); - this.functionalClass = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS); - this.proteinCoordStr = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION); - } - - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - sb.append("variantCodon= ").append(variantCodon); - sb.append(", variantAA= ").append(variantAA); - sb.append(", changesAA= ").append(changesAA); - sb.append(", functionalClass= ").append(functionalClass); - sb.append(", proteinCoordStr= ").append(proteinCoordStr); - - return sb.toString(); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java b/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java rename to public/java/src/org/broadinstitute/sting/utils/AminoAcid.java index 0d0b906e0..0b47093fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java +++ b/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; +package org.broadinstitute.sting.utils; /** * Represents a single amino acid. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java b/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java similarity index 99% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java rename to public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java index c10eb5dd7..1ae28ffb3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java +++ b/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; +package org.broadinstitute.sting.utils; import java.util.HashMap; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java deleted file mode 100755 index c75a5b2dc..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java +++ /dev/null @@ -1,83 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - - -import java.util.Arrays; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -public class GenomicAnnotatorIntegrationTest extends WalkerTest { - String testFileWithIndels = validationDataLocation + "/GenomicAnnotatorValidation/1KGBroadWEx.cleaned.indels.vcf"; - String testFileWithSNPsAndIndels = validationDataLocation + "/GenomicAnnotatorValidation/1KGBroadWEx.variants.vcf"; - - @Test - public void testGenomicAnnotatorOnDbSNP() { - - /* - TODO put this test back in once it gets faster. - String[] md5 = {"d19d6d1eb52fb09e7493653dc645d92a"}; - WalkerTestSpec spec = new WalkerTestSpec( - "-T GenomicAnnotator -R " + b36KGReference + " " + - "-B:variant,vcf /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf " + - "-B:dbsnp,AnnotatorInputTable /humgen/gsa-hpprojects/GATK/data/Annotations/dbsnp/b130/snp130-b36-only-the-SNPs.txt " + - "-m " + //generate many records from one input record if necessary - "-o %s " + - "-BTI variant", - 1, - Arrays.asList(md5)); - executeTest("test with dbSNP", spec); - */ - - - String[] md5WithDashSArg = {"efba4ce1641cfa2ef88a64395f2ebce8"}; - WalkerTestSpec specWithSArg = new WalkerTestSpec( - "-T GenomicAnnotator -R " + b36KGReference + - " -B:variant,vcf3 /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf" + - " -B:dbsnp,AnnotatorInputTable /humgen/gsa-hpprojects/GATK/data/Annotations/dbsnp/b130/snp130-b36-only-the-SNPs.txt" + - " -m" + //generate many records from one input record if necessary - " -o %s" + - " -BTI variant" + - " -s dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet" + - " -NO_HEADER", - 1, - Arrays.asList(md5WithDashSArg)); - executeTest("test with dbSNP and -s arg", specWithSArg); - - } - - @Test - public void testGenomicAnnotatorOnIndels() { - WalkerTestSpec testOnIndels = new WalkerTestSpec( - buildCommandLine( - "-T GenomicAnnotator", - "-R " + b37KGReference, - "-L 22:10000000-20000000", - "-B:refseq,AnnotatorInputTable " + b37Refseq, - "-B:variant,VCF " + testFileWithIndels, - "-NO_HEADER", - "-o %s" - ), - 1, - Arrays.asList("772fc3f43b70770ec6c6acbb8bbbd4c0") - ); - executeTest("testGenomicAnnotatorOnIndels", testOnIndels); - } - - @Test - public void testGenomicAnnotatorOnSNPsAndIndels() { - WalkerTestSpec testOnSNPsAndIndels = new WalkerTestSpec( - buildCommandLine( - "-T GenomicAnnotator", - "-R " + b37KGReference, - "-L 22:10000000-20000000", - "-B:refseq,AnnotatorInputTable " + b37Refseq, - "-B:variant,VCF " + testFileWithSNPsAndIndels, - "-NO_HEADER", - "-o %s" - ), - 1, - Arrays.asList("081ade7f3d2d3c5f19cb1e8651a626f3") - ); - executeTest("testGenomicAnnotatorOnSNPsAndIndels", testOnSNPsAndIndels); - } -} From 5626199bb69f7d9788b5112922d88d845dda1721 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 2 Aug 2011 10:14:21 -0400 Subject: [PATCH 3/8] The Unified Genotyper now does NOT emit SLOD/SB by default; to compute SB use --computeSLOD --- .../genotyper/UnifiedArgumentCollection.java | 6 +- .../walkers/genotyper/UnifiedGenotyper.java | 2 +- .../genotyper/UnifiedGenotyperEngine.java | 12 ++-- .../UnifiedGenotyperIntegrationTest.java | 61 ++++++++----------- 4 files changed, 34 insertions(+), 47 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 2b25df4aa..52bf3f715 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -58,8 +58,8 @@ public class UnifiedArgumentCollection { @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; - @Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false) - public boolean NO_SLOD = false; + @Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false) + public boolean COMPUTE_SLOD = false; // control the error modes @@ -154,7 +154,7 @@ public class UnifiedArgumentCollection { uac.PCR_error = PCR_error; uac.GenotypingMode = GenotypingMode; uac.OutputMode = OutputMode; - uac.NO_SLOD = NO_SLOD; + uac.COMPUTE_SLOD = COMPUTE_SLOD; uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE; uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 2a0338bca..c673f7b3b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -144,7 +144,7 @@ public class UnifiedGenotyper extends LocusWalker e = new HashMap(); - e.put( "--min_base_quality_score 26", "5043c9a101e691602eb7a3f9704bdf20" ); - e.put( "--min_mapping_quality_score 26", "71a833eb8fd93ee62ae0d5a430f27940" ); - e.put( "--p_nonref_model GRID_SEARCH", "ddf443e9dcadef367476b26b4d52c134" ); + e.put( "--min_base_quality_score 26", "6d3aa9f783ca63f37c952f83eeda593c" ); + e.put( "--min_mapping_quality_score 26", "51bfdf777123bf49de5d92ffde5c74e7" ); + e.put( "--p_nonref_model GRID_SEARCH", "333328ab2c8da2875fade599e80a271f" ); + e.put( "--computeSLOD", "226caa28a4fa9fe34f3beb8a23f3d53d" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -153,9 +154,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameter() { HashMap e = new HashMap(); - e.put( "-sites_only", "eaad6ceb71ab94290650a70bea5ab951" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "05bf7db8a3d19ef4a3d14772c90b732f" ); - e.put( "--output_mode EMIT_ALL_SITES", "e4b86740468d7369f0156550855586c7" ); + e.put( "-sites_only", "5f659dee408710d3709ed72005cd863a" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "55d09bf13149bddc06cc36be0801507b" ); + e.put( "--output_mode EMIT_ALL_SITES", "727f49dcb2439b18446829efc3b1561c" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -169,12 +170,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("71a833eb8fd93ee62ae0d5a430f27940")); + Arrays.asList("51bfdf777123bf49de5d92ffde5c74e7")); executeTest("test confidence 1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("79968844dc3ddecb97748c1acf2984c7")); + Arrays.asList("c67c285e70fd4457c9f9ce7bd878ddca")); executeTest("test confidence 2", spec2); } @@ -186,8 +187,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "4e878664f61d2d800146d3762303fde1" ); - e.put( 1.0 / 1850, "9204caec095ff5e63ca21a10b6fab453" ); + e.put( 0.01, "7ecc564d4db97d5932cef2e558550ed2" ); + e.put( 1.0 / 1850, "aa9e101bb9f9e111fe292fec467d915a" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -211,7 +212,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("1a58ec52df545f946f80cc16c5736a91")); + Arrays.asList("2efd686186b2c5129be4cf89274a24dd")); executeTest(String.format("test multiple technologies"), spec); } @@ -230,25 +231,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("62d0f6d9de344ce68ce121c13b1e78b1")); + Arrays.asList("2892d35331fe9fc141ba19269ec7caed")); executeTest(String.format("test calling with BAQ"), spec); } - @Test - public void testCallingWithBAQOff() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + - " -o %s" + - " -L 1:10,000,000-10,100,000" + - " -baq OFF", - 1, - Arrays.asList("1a58ec52df545f946f80cc16c5736a91")); - - executeTest(String.format("test calling with BAQ OFF"), spec); - } - // -------------------------------------------------------------------------------------------------------------- // // testing indel caller @@ -263,7 +250,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("631ae1f1eb6bc4c1a4136b8495250536")); + Arrays.asList("8c2afb4289ed44521933d1a74c8d6c7f")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -278,7 +265,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("fd556585c79e2b892a5976668f45aa43")); + Arrays.asList("b6fb70590a10e1c27fb611732916f27d")); executeTest(String.format("test indel caller in SLX witn low min allele count"), spec); } @@ -291,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("9cd56feedd2787919e571383889fde70")); + Arrays.asList("61642502bd08cc03cdaaeb83a5426b46")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -301,14 +288,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("315e1b78d7a403d7fcbcf0caa8c496b8")); + Arrays.asList("69b0b3f089c80b9864294d838a061336")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("cf89e0c54f14482a23c105b73a333d8a")); + Arrays.asList("c90174cfd7dd68bdef36fe2c60145e10")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } From 2c5e526eb7d9a4bf30d9fd0a715874e1517736e0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 2 Aug 2011 10:34:46 -0400 Subject: [PATCH 4/8] Don't use the mismatch fraction by default in the RealignerTargetCreator (since it's only useful when using SW in the indel realigner). Also, no more use of -D but instead move over to using VCFs. One integration test is temporarily commented out while I wait for a VCF file to get fixed. --- .../sting/gatk/walkers/indels/RealignerTargetCreator.java | 2 +- .../indels/RealignerTargetCreatorIntegrationTest.java | 6 +++--- .../indels/RealignerTargetCreatorPerformanceTest.java | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index 488e37f26..6453ce8de 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -64,7 +64,7 @@ public class RealignerTargetCreator extends RodWalker Date: Tue, 2 Aug 2011 10:39:50 -0400 Subject: [PATCH 5/8] No more use of -D in the integration tests but instead stick with VCFs only. Since all of these tests were duplicated (one each for dbSNP format and for VCF), we don't actually lose coverage in the integration tests. --- .../indels/IndelRealignerIntegrationTest.java | 19 ------------------- .../indels/IndelRealignerPerformanceTest.java | 4 ++-- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index 2676f7067..19dc99682 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -32,13 +32,6 @@ public class IndelRealignerIntegrationTest extends WalkerTest { 1, Arrays.asList(base_md5_with_SW_or_VCF)); executeTest("test realigner defaults with VCF", spec2); - - WalkerTestSpec spec3 = new WalkerTestSpec( - baseCommand + "-D " + GATKDataLocation + "dbsnp_129_b36.rod", - 1, - Arrays.asList(base_md5)); - executeTest("realigner defaults with dbsnp", spec3); - } @Test @@ -48,12 +41,6 @@ public class IndelRealignerIntegrationTest extends WalkerTest { 1, Arrays.asList("3dd5d2c9931b375455af0bff1a2c4888")); executeTest("realigner known indels only from VCF", spec1); - - WalkerTestSpec spec2 = new WalkerTestSpec( - baseCommand + "--consensusDeterminationModel KNOWNS_ONLY -D " + GATKDataLocation + "dbsnp_129_b36.rod", - 1, - Arrays.asList("05a114623c126b0398fbc1703437461e")); - executeTest("realigner known indels only from dbsnp", spec2); } @Test @@ -63,12 +50,6 @@ public class IndelRealignerIntegrationTest extends WalkerTest { 1, Arrays.asList(base_md5_with_SW_or_VCF)); executeTest("realigner use SW from VCF", spec1); - - WalkerTestSpec spec2 = new WalkerTestSpec( - baseCommand + "--consensusDeterminationModel USE_SW -D " + GATKDataLocation + "dbsnp_129_b36.rod", - 1, - Arrays.asList(base_md5_with_SW_or_VCF)); - executeTest("realigner use SW from dbsnp", spec2); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java index fd5ad0b22..e8b5033cf 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java @@ -30,7 +30,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest { " -LOD 5" + " -maxConsensuses 100" + " -greedy 100" + - " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -o /dev/null" + " -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" + " -L chr1:1-5,650,000" + @@ -45,7 +45,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest { " -LOD 5" + " -maxConsensuses 100" + " -greedy 100" + - " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -o /dev/null" + " -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" + " -L chr1:1-150,000,000" + From b9d0d2af223ea5537fac2d5af6d4ecf79702487a Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 2 Aug 2011 12:39:11 -0400 Subject: [PATCH 7/8] Adding back temporarily removed integration test now that the file permissions have been fixed. --- .../walkers/indels/RealignerTargetCreatorIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java index cc67c354a..f5ed69476 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java @@ -19,8 +19,8 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( "-T RealignerTargetCreator -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.b36.excluding_sites_after_129.vcf -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("f23ba17ee0f9573dd307708175d90cd2")); - //executeTest("test dbsnp", spec2); + Arrays.asList("0367d39a122c8ac0899fb868a82ef728")); + executeTest("test dbsnp", spec2); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( "-T RealignerTargetCreator -R " + b36KGReference + " -B:indels,VCF " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -BTI indels -o %s", From 65c5d55b724bffd1d895e9cadd7d4acfa1d276dd Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 2 Aug 2011 12:48:36 -0400 Subject: [PATCH 8/8] Not sure how I missed these. These lines are now superfluous. --- .../sting/gatk/walkers/genotyper/UGCallVariants.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java index 68d8f9b54..a3b9f379e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java @@ -62,7 +62,6 @@ public class UGCallVariants extends RodWalker { private Set trackNames = new HashSet(); public void initialize() { - UAC.NO_SLOD = true; for ( ReferenceOrderedDataSource d : getToolkit().getRodDataSources() ) { if ( d.getName().startsWith("variant") )