diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index c307d4cc0..b9b97e154 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -27,8 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants; import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature; @@ -38,9 +38,22 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; -public class SnpEff extends InfoFieldAnnotation implements StandardAnnotation { +/** + * A set of genomic annotations based on the output of the SnpEff variant effect predictor tool + * (http://snpeff.sourceforge.net/). + * + * For each variant, chooses one of the effects of highest biological impact from the SnpEff + * output file (which must be bound to an RMD track named "SnpEff"), and adds annotations + * on that effect. + * + * The possible biological effects and their associated impacts are defined in the class: + * org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants + * + * @author David Roazen + */ +public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation { - // SnpEff field keys: + // SnpEff annotation key names: public static final String GENE_ID_KEY = "GENE_ID"; public static final String GENE_NAME_KEY = "GENE_NAME"; public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID"; @@ -55,11 +68,14 @@ public class SnpEff extends InfoFieldAnnotation implements StandardAnnotation { public static final String CODON_NUM_KEY = "CODON_NUM"; public static final String CDS_SIZE_KEY = "CDS_SIZE"; + // Name of the RMD track bound to the raw SnpEff-generated output file: public static final String RMD_TRACK_NAME = "SnpEff"; public Map annotate ( RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) { List snpEffFeatures = tracker.getReferenceMetaData(RMD_TRACK_NAME); + // Add only annotations for one of the most biologically-significant effects as defined in + // the SnpEffConstants class: SnpEffFeature mostSignificantEffect = getMostSignificantEffect(snpEffFeatures); return generateAnnotations(mostSignificantEffect); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java index dfe1f5f1a..827df16bb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java @@ -34,6 +34,40 @@ import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygos import java.io.IOException; +/** + * Codec for decoding the output format of the SnpEff variant effect predictor tool + * (http://snpeff.sourceforge.net/). + * + * This format has 23 tab-delimited fields: + * + * Chromosome + * Position + * Reference + * Change + * Change Type: {SNP, MNP, INS, DEL} + * Zygosity: {Hom, Het} + * Quality + * Coverage + * Warnings + * Gene ID + * Gene Name + * Bio Type + * Transcript ID + * Exon ID + * Exon Rank + * Effect + * Old/New Amino Acid + * Old/New Codon + * Codon Num + * CDS Size + * Codons Around + * Amino Acids Around + * Custom Interval ID + * + * We treat all except the Chromosome, Position, and Effect fields as optional. + * + * @author David Roazen + */ public class SnpEffCodec implements FeatureCodec { public static final int EXPECTED_NUMBER_OF_FIELDS = 23; @@ -64,9 +98,13 @@ public class SnpEffCodec implements FeatureCodec { "AAs around", "Custom_interval_ID" }; + + // The "Chromo", "Position", and "Effect" fields are required to be non-empty in every SnpEff output line: public static final int[] REQUIRED_FIELDS = { 0, 1, 15 }; + public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE"; + public Feature decodeLoc ( String line ) { return decode(line); } @@ -101,6 +139,11 @@ public class SnpEffCodec implements FeatureCodec { Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]); boolean isNonCodingGene = isNonCodingGene(tokens[15]); + + // Split the effect field into three subfields if the WITHIN_NON_CODING_GENE flag is present, + // otherwise split it into two subfields. We need this limit to prevent the extra effect-related information + // in the final field (when present) from being inappropriately tokenized: + int effectFieldTokenLimit = isNonCodingGene ? 3 : 2; String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit); EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene); @@ -150,6 +193,9 @@ public class SnpEffCodec implements FeatureCodec { private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) { String effectName = ""; + // If there's a WITHIN_NON_CODING_GENE flag, the effect name will be in the second subfield, + // otherwise it will be in the first subfield: + if ( effectFieldTokens.length > 1 && isNonCodingGene ) { effectName = effectFieldTokens[1].trim(); } @@ -161,6 +207,9 @@ public class SnpEffCodec implements FeatureCodec { } private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) { + + // The extra effect-related information, if present, will always be the last subfield: + if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) { return effectFieldTokens[effectFieldTokens.length - 1].trim(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java index f226c3523..270db470f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java @@ -24,8 +24,14 @@ package org.broadinstitute.sting.utils.codecs.snpEff; +/** + * A set of constants associated with the SnpEff codec. + * + * @author David Roazen + */ public class SnpEffConstants { + // Possible SnpEff biological effects and their associated impacts: public enum EffectType { START_GAINED (EffectImpact.HIGH), START_LOST (EffectImpact.HIGH), @@ -93,6 +99,7 @@ public class SnpEffConstants { } } + // The kinds of variants supported by the SnpEff output format: public enum ChangeType { SNP, MNP, @@ -100,6 +107,7 @@ public class SnpEffConstants { DEL } + // Possible zygosities of SnpEff variants: public enum Zygosity { Hom, Het diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java index 4a68d7cf1..2f120b7d2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java @@ -26,15 +26,26 @@ package org.broadinstitute.sting.utils.codecs.snpEff; import org.broad.tribble.Feature; +import java.util.NoSuchElementException; + import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType; import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact; import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType; import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity; +/** + * Feature returned by the SnpEff codec -- stores the parsed field values from a line of SnpEff output. + * + * Many fields are optional, and missing values are represented by nulls. You should always call the + * hasX() method before calling the corresponding getX() method. Required fields can never be null + * and do not have a hasX() method. + * + * @author David Roazen + */ public class SnpEffFeature implements Feature { - private String contig; - private long position; + private String contig; // REQUIRED FIELD + private long position; // REQUIRED FIELD private String reference; private String change; private ChangeType changeType; @@ -48,8 +59,8 @@ public class SnpEffFeature implements Feature { private String transcriptID; private String exonID; private Integer exonRank; - private boolean isNonCodingGene; - private EffectType effect; + private boolean isNonCodingGene; // REQUIRED FIELD + private EffectType effect; // REQUIRED FIELD private String effectExtraInformation; private String oldAndNewAA; private String oldAndNewCodon; @@ -85,6 +96,10 @@ public class SnpEffFeature implements Feature { String aasAround, String customIntervalID ) { + if ( contig == null || effect == null ) { + throw new IllegalArgumentException("contig and effect cannot be null, as they are required fields"); + } + this.contig = contig; this.position = position; this.reference = reference; @@ -113,6 +128,10 @@ public class SnpEffFeature implements Feature { } public boolean isHigherImpactThan ( SnpEffFeature other ) { + + // If one effect is in a non-coding gene and the other is not, the effect NOT in the + // non-coding gene has higher impact: + if ( ! isNonCodingGene() && other.isNonCodingGene() ) { return true; } @@ -120,6 +139,9 @@ public class SnpEffFeature implements Feature { return false; } + // Otherwise, both effects are either in or not in a non-coding gene, so we compare the impacts + // of the effects themselves as defined in the SnpEffConstants class: + return getEffectImpact().isHigherImpactThan(other.getEffectImpact()); } @@ -140,6 +162,7 @@ public class SnpEffFeature implements Feature { } public String getReference() { + if ( reference == null ) throw new NoSuchElementException("This feature has no reference field"); return reference; } @@ -148,6 +171,7 @@ public class SnpEffFeature implements Feature { } public String getChange() { + if ( change == null ) throw new NoSuchElementException("This feature has no change field"); return change; } @@ -156,6 +180,7 @@ public class SnpEffFeature implements Feature { } public ChangeType getChangeType() { + if ( changeType == null ) throw new NoSuchElementException("This feature has no changeType field"); return changeType; } @@ -164,6 +189,7 @@ public class SnpEffFeature implements Feature { } public Zygosity getZygosity() { + if ( zygosity == null ) throw new NoSuchElementException("This feature has no zygosity field"); return zygosity; } @@ -172,6 +198,7 @@ public class SnpEffFeature implements Feature { } public Double getQuality() { + if ( quality == null ) throw new NoSuchElementException("This feature has no quality field"); return quality; } @@ -180,6 +207,7 @@ public class SnpEffFeature implements Feature { } public Long getCoverage() { + if ( coverage == null ) throw new NoSuchElementException("This feature has no coverage field"); return coverage; } @@ -188,6 +216,7 @@ public class SnpEffFeature implements Feature { } public String getWarnings() { + if ( warnings == null ) throw new NoSuchElementException("This feature has no warnings field"); return warnings; } @@ -196,6 +225,7 @@ public class SnpEffFeature implements Feature { } public String getGeneID() { + if ( geneID == null ) throw new NoSuchElementException("This feature has no geneID field"); return geneID; } @@ -204,6 +234,7 @@ public class SnpEffFeature implements Feature { } public String getGeneName() { + if ( geneName == null ) throw new NoSuchElementException("This feature has no geneName field"); return geneName; } @@ -212,6 +243,7 @@ public class SnpEffFeature implements Feature { } public String getBioType() { + if ( bioType == null ) throw new NoSuchElementException("This feature has no bioType field"); return bioType; } @@ -220,6 +252,7 @@ public class SnpEffFeature implements Feature { } public String getTranscriptID() { + if ( transcriptID == null ) throw new NoSuchElementException("This feature has no transcriptID field"); return transcriptID; } @@ -228,6 +261,7 @@ public class SnpEffFeature implements Feature { } public String getExonID() { + if ( exonID == null ) throw new NoSuchElementException("This feature has no exonID field"); return exonID; } @@ -236,6 +270,7 @@ public class SnpEffFeature implements Feature { } public Integer getExonRank() { + if ( exonRank == null ) throw new NoSuchElementException("This feature has no exonRank field"); return exonRank; } @@ -256,6 +291,7 @@ public class SnpEffFeature implements Feature { } public String getEffectExtraInformation() { + if ( effectExtraInformation == null ) throw new NoSuchElementException("This feature has no effectExtraInformation field"); return effectExtraInformation; } @@ -264,6 +300,7 @@ public class SnpEffFeature implements Feature { } public String getOldAndNewAA() { + if ( oldAndNewAA == null ) throw new NoSuchElementException("This feature has no oldAndNewAA field"); return oldAndNewAA; } @@ -272,6 +309,7 @@ public class SnpEffFeature implements Feature { } public String getOldAndNewCodon() { + if ( oldAndNewCodon == null ) throw new NoSuchElementException("This feature has no oldAndNewCodon field"); return oldAndNewCodon; } @@ -280,6 +318,7 @@ public class SnpEffFeature implements Feature { } public Integer getCodonNum() { + if ( codonNum == null ) throw new NoSuchElementException("This feature has no codonNum field"); return codonNum; } @@ -288,6 +327,7 @@ public class SnpEffFeature implements Feature { } public Integer getCdsSize() { + if ( cdsSize == null ) throw new NoSuchElementException("This feature has no cdsSize field"); return cdsSize; } @@ -296,6 +336,7 @@ public class SnpEffFeature implements Feature { } public String getCodonsAround() { + if ( codonsAround == null ) throw new NoSuchElementException("This feature has no codonsAround field"); return codonsAround; } @@ -304,6 +345,7 @@ public class SnpEffFeature implements Feature { } public String getAasAround() { + if ( aasAround == null ) throw new NoSuchElementException("This feature has no aasAround field"); return aasAround; } @@ -312,6 +354,7 @@ public class SnpEffFeature implements Feature { } public String getCustomIntervalID() { + if ( customIntervalID == null ) throw new NoSuchElementException("This feature has no customIntervalID field"); return customIntervalID; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index e6300e6c9..5dc7299a9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -125,4 +125,15 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { executeTest("Testing lookup vcf tabix vs. vcf tribble", spec); } } + + @Test + public void testSnpEffAnnotations() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " -o %s -A SnpEff -B:variant,VCF " + validationDataLocation + "/1000G.exomes.vcf " + + "-B:SnpEff,SnpEff " + validationDataLocation + "/snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out" + " -L 1", + 1, + Arrays.asList("5fe3644744d3c084a179c3d204555333") + ); + executeTest("Testing SnpEff annotations", spec); + } }