Added an integration test for the SnpEff annotation support, as well as some extra safety checks and comments.

This commit is contained in:
David Roazen 2011-08-08 20:01:24 -04:00
parent 5e288136e0
commit a13bc7b929
5 changed files with 134 additions and 7 deletions

View File

@ -27,8 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
@ -38,9 +38,22 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
public class SnpEff extends InfoFieldAnnotation implements StandardAnnotation {
/**
* A set of genomic annotations based on the output of the SnpEff variant effect predictor tool
* (http://snpeff.sourceforge.net/).
*
* For each variant, chooses one of the effects of highest biological impact from the SnpEff
* output file (which must be bound to an RMD track named "SnpEff"), and adds annotations
* on that effect.
*
* The possible biological effects and their associated impacts are defined in the class:
* org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants
*
* @author David Roazen
*/
public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation {
// SnpEff field keys:
// SnpEff annotation key names:
public static final String GENE_ID_KEY = "GENE_ID";
public static final String GENE_NAME_KEY = "GENE_NAME";
public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID";
@ -55,11 +68,14 @@ public class SnpEff extends InfoFieldAnnotation implements StandardAnnotation {
public static final String CODON_NUM_KEY = "CODON_NUM";
public static final String CDS_SIZE_KEY = "CDS_SIZE";
// Name of the RMD track bound to the raw SnpEff-generated output file:
public static final String RMD_TRACK_NAME = "SnpEff";
public Map<String, Object> annotate ( RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc ) {
List<Object> snpEffFeatures = tracker.getReferenceMetaData(RMD_TRACK_NAME);
// Add only annotations for one of the most biologically-significant effects as defined in
// the SnpEffConstants class:
SnpEffFeature mostSignificantEffect = getMostSignificantEffect(snpEffFeatures);
return generateAnnotations(mostSignificantEffect);
}

View File

@ -34,6 +34,40 @@ import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygos
import java.io.IOException;
/**
* Codec for decoding the output format of the SnpEff variant effect predictor tool
* (http://snpeff.sourceforge.net/).
*
* This format has 23 tab-delimited fields:
*
* Chromosome
* Position
* Reference
* Change
* Change Type: {SNP, MNP, INS, DEL}
* Zygosity: {Hom, Het}
* Quality
* Coverage
* Warnings
* Gene ID
* Gene Name
* Bio Type
* Transcript ID
* Exon ID
* Exon Rank
* Effect
* Old/New Amino Acid
* Old/New Codon
* Codon Num
* CDS Size
* Codons Around
* Amino Acids Around
* Custom Interval ID
*
* We treat all except the Chromosome, Position, and Effect fields as optional.
*
* @author David Roazen
*/
public class SnpEffCodec implements FeatureCodec {
public static final int EXPECTED_NUMBER_OF_FIELDS = 23;
@ -64,9 +98,13 @@ public class SnpEffCodec implements FeatureCodec {
"AAs around",
"Custom_interval_ID"
};
// The "Chromo", "Position", and "Effect" fields are required to be non-empty in every SnpEff output line:
public static final int[] REQUIRED_FIELDS = { 0, 1, 15 };
public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE";
public Feature decodeLoc ( String line ) {
return decode(line);
}
@ -101,6 +139,11 @@ public class SnpEffCodec implements FeatureCodec {
Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]);
boolean isNonCodingGene = isNonCodingGene(tokens[15]);
// Split the effect field into three subfields if the WITHIN_NON_CODING_GENE flag is present,
// otherwise split it into two subfields. We need this limit to prevent the extra effect-related information
// in the final field (when present) from being inappropriately tokenized:
int effectFieldTokenLimit = isNonCodingGene ? 3 : 2;
String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit);
EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene);
@ -150,6 +193,9 @@ public class SnpEffCodec implements FeatureCodec {
private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) {
String effectName = "";
// If there's a WITHIN_NON_CODING_GENE flag, the effect name will be in the second subfield,
// otherwise it will be in the first subfield:
if ( effectFieldTokens.length > 1 && isNonCodingGene ) {
effectName = effectFieldTokens[1].trim();
}
@ -161,6 +207,9 @@ public class SnpEffCodec implements FeatureCodec {
}
private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) {
// The extra effect-related information, if present, will always be the last subfield:
if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) {
return effectFieldTokens[effectFieldTokens.length - 1].trim();
}

View File

@ -24,8 +24,14 @@
package org.broadinstitute.sting.utils.codecs.snpEff;
/**
* A set of constants associated with the SnpEff codec.
*
* @author David Roazen
*/
public class SnpEffConstants {
// Possible SnpEff biological effects and their associated impacts:
public enum EffectType {
START_GAINED (EffectImpact.HIGH),
START_LOST (EffectImpact.HIGH),
@ -93,6 +99,7 @@ public class SnpEffConstants {
}
}
// The kinds of variants supported by the SnpEff output format:
public enum ChangeType {
SNP,
MNP,
@ -100,6 +107,7 @@ public class SnpEffConstants {
DEL
}
// Possible zygosities of SnpEff variants:
public enum Zygosity {
Hom,
Het

View File

@ -26,15 +26,26 @@ package org.broadinstitute.sting.utils.codecs.snpEff;
import org.broad.tribble.Feature;
import java.util.NoSuchElementException;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
/**
* Feature returned by the SnpEff codec -- stores the parsed field values from a line of SnpEff output.
*
* Many fields are optional, and missing values are represented by nulls. You should always call the
* hasX() method before calling the corresponding getX() method. Required fields can never be null
* and do not have a hasX() method.
*
* @author David Roazen
*/
public class SnpEffFeature implements Feature {
private String contig;
private long position;
private String contig; // REQUIRED FIELD
private long position; // REQUIRED FIELD
private String reference;
private String change;
private ChangeType changeType;
@ -48,8 +59,8 @@ public class SnpEffFeature implements Feature {
private String transcriptID;
private String exonID;
private Integer exonRank;
private boolean isNonCodingGene;
private EffectType effect;
private boolean isNonCodingGene; // REQUIRED FIELD
private EffectType effect; // REQUIRED FIELD
private String effectExtraInformation;
private String oldAndNewAA;
private String oldAndNewCodon;
@ -85,6 +96,10 @@ public class SnpEffFeature implements Feature {
String aasAround,
String customIntervalID ) {
if ( contig == null || effect == null ) {
throw new IllegalArgumentException("contig and effect cannot be null, as they are required fields");
}
this.contig = contig;
this.position = position;
this.reference = reference;
@ -113,6 +128,10 @@ public class SnpEffFeature implements Feature {
}
public boolean isHigherImpactThan ( SnpEffFeature other ) {
// If one effect is in a non-coding gene and the other is not, the effect NOT in the
// non-coding gene has higher impact:
if ( ! isNonCodingGene() && other.isNonCodingGene() ) {
return true;
}
@ -120,6 +139,9 @@ public class SnpEffFeature implements Feature {
return false;
}
// Otherwise, both effects are either in or not in a non-coding gene, so we compare the impacts
// of the effects themselves as defined in the SnpEffConstants class:
return getEffectImpact().isHigherImpactThan(other.getEffectImpact());
}
@ -140,6 +162,7 @@ public class SnpEffFeature implements Feature {
}
public String getReference() {
if ( reference == null ) throw new NoSuchElementException("This feature has no reference field");
return reference;
}
@ -148,6 +171,7 @@ public class SnpEffFeature implements Feature {
}
public String getChange() {
if ( change == null ) throw new NoSuchElementException("This feature has no change field");
return change;
}
@ -156,6 +180,7 @@ public class SnpEffFeature implements Feature {
}
public ChangeType getChangeType() {
if ( changeType == null ) throw new NoSuchElementException("This feature has no changeType field");
return changeType;
}
@ -164,6 +189,7 @@ public class SnpEffFeature implements Feature {
}
public Zygosity getZygosity() {
if ( zygosity == null ) throw new NoSuchElementException("This feature has no zygosity field");
return zygosity;
}
@ -172,6 +198,7 @@ public class SnpEffFeature implements Feature {
}
public Double getQuality() {
if ( quality == null ) throw new NoSuchElementException("This feature has no quality field");
return quality;
}
@ -180,6 +207,7 @@ public class SnpEffFeature implements Feature {
}
public Long getCoverage() {
if ( coverage == null ) throw new NoSuchElementException("This feature has no coverage field");
return coverage;
}
@ -188,6 +216,7 @@ public class SnpEffFeature implements Feature {
}
public String getWarnings() {
if ( warnings == null ) throw new NoSuchElementException("This feature has no warnings field");
return warnings;
}
@ -196,6 +225,7 @@ public class SnpEffFeature implements Feature {
}
public String getGeneID() {
if ( geneID == null ) throw new NoSuchElementException("This feature has no geneID field");
return geneID;
}
@ -204,6 +234,7 @@ public class SnpEffFeature implements Feature {
}
public String getGeneName() {
if ( geneName == null ) throw new NoSuchElementException("This feature has no geneName field");
return geneName;
}
@ -212,6 +243,7 @@ public class SnpEffFeature implements Feature {
}
public String getBioType() {
if ( bioType == null ) throw new NoSuchElementException("This feature has no bioType field");
return bioType;
}
@ -220,6 +252,7 @@ public class SnpEffFeature implements Feature {
}
public String getTranscriptID() {
if ( transcriptID == null ) throw new NoSuchElementException("This feature has no transcriptID field");
return transcriptID;
}
@ -228,6 +261,7 @@ public class SnpEffFeature implements Feature {
}
public String getExonID() {
if ( exonID == null ) throw new NoSuchElementException("This feature has no exonID field");
return exonID;
}
@ -236,6 +270,7 @@ public class SnpEffFeature implements Feature {
}
public Integer getExonRank() {
if ( exonRank == null ) throw new NoSuchElementException("This feature has no exonRank field");
return exonRank;
}
@ -256,6 +291,7 @@ public class SnpEffFeature implements Feature {
}
public String getEffectExtraInformation() {
if ( effectExtraInformation == null ) throw new NoSuchElementException("This feature has no effectExtraInformation field");
return effectExtraInformation;
}
@ -264,6 +300,7 @@ public class SnpEffFeature implements Feature {
}
public String getOldAndNewAA() {
if ( oldAndNewAA == null ) throw new NoSuchElementException("This feature has no oldAndNewAA field");
return oldAndNewAA;
}
@ -272,6 +309,7 @@ public class SnpEffFeature implements Feature {
}
public String getOldAndNewCodon() {
if ( oldAndNewCodon == null ) throw new NoSuchElementException("This feature has no oldAndNewCodon field");
return oldAndNewCodon;
}
@ -280,6 +318,7 @@ public class SnpEffFeature implements Feature {
}
public Integer getCodonNum() {
if ( codonNum == null ) throw new NoSuchElementException("This feature has no codonNum field");
return codonNum;
}
@ -288,6 +327,7 @@ public class SnpEffFeature implements Feature {
}
public Integer getCdsSize() {
if ( cdsSize == null ) throw new NoSuchElementException("This feature has no cdsSize field");
return cdsSize;
}
@ -296,6 +336,7 @@ public class SnpEffFeature implements Feature {
}
public String getCodonsAround() {
if ( codonsAround == null ) throw new NoSuchElementException("This feature has no codonsAround field");
return codonsAround;
}
@ -304,6 +345,7 @@ public class SnpEffFeature implements Feature {
}
public String getAasAround() {
if ( aasAround == null ) throw new NoSuchElementException("This feature has no aasAround field");
return aasAround;
}
@ -312,6 +354,7 @@ public class SnpEffFeature implements Feature {
}
public String getCustomIntervalID() {
if ( customIntervalID == null ) throw new NoSuchElementException("This feature has no customIntervalID field");
return customIntervalID;
}

View File

@ -125,4 +125,15 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("Testing lookup vcf tabix vs. vcf tribble", spec);
}
}
@Test
public void testSnpEffAnnotations() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantAnnotator -R " + b37KGReference + " -o %s -A SnpEff -B:variant,VCF " + validationDataLocation + "/1000G.exomes.vcf " +
"-B:SnpEff,SnpEff " + validationDataLocation + "/snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out" + " -L 1",
1,
Arrays.asList("5fe3644744d3c084a179c3d204555333")
);
executeTest("Testing SnpEff annotations", spec);
}
}