Added an integration test for the SnpEff annotation support, as well as some extra safety checks and comments.

This commit is contained in:
David Roazen 2011-08-08 20:01:24 -04:00
parent 5e288136e0
commit a13bc7b929
5 changed files with 134 additions and 7 deletions

View File

@ -27,8 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants; import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature; import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
@ -38,9 +38,22 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*; import java.util.*;
public class SnpEff extends InfoFieldAnnotation implements StandardAnnotation { /**
* A set of genomic annotations based on the output of the SnpEff variant effect predictor tool
* (http://snpeff.sourceforge.net/).
*
* For each variant, chooses one of the effects of highest biological impact from the SnpEff
* output file (which must be bound to an RMD track named "SnpEff"), and adds annotations
* on that effect.
*
* The possible biological effects and their associated impacts are defined in the class:
* org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants
*
* @author David Roazen
*/
public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation {
// SnpEff field keys: // SnpEff annotation key names:
public static final String GENE_ID_KEY = "GENE_ID"; public static final String GENE_ID_KEY = "GENE_ID";
public static final String GENE_NAME_KEY = "GENE_NAME"; public static final String GENE_NAME_KEY = "GENE_NAME";
public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID"; public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID";
@ -55,11 +68,14 @@ public class SnpEff extends InfoFieldAnnotation implements StandardAnnotation {
public static final String CODON_NUM_KEY = "CODON_NUM"; public static final String CODON_NUM_KEY = "CODON_NUM";
public static final String CDS_SIZE_KEY = "CDS_SIZE"; public static final String CDS_SIZE_KEY = "CDS_SIZE";
// Name of the RMD track bound to the raw SnpEff-generated output file:
public static final String RMD_TRACK_NAME = "SnpEff"; public static final String RMD_TRACK_NAME = "SnpEff";
public Map<String, Object> annotate ( RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc ) { public Map<String, Object> annotate ( RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc ) {
List<Object> snpEffFeatures = tracker.getReferenceMetaData(RMD_TRACK_NAME); List<Object> snpEffFeatures = tracker.getReferenceMetaData(RMD_TRACK_NAME);
// Add only annotations for one of the most biologically-significant effects as defined in
// the SnpEffConstants class:
SnpEffFeature mostSignificantEffect = getMostSignificantEffect(snpEffFeatures); SnpEffFeature mostSignificantEffect = getMostSignificantEffect(snpEffFeatures);
return generateAnnotations(mostSignificantEffect); return generateAnnotations(mostSignificantEffect);
} }

View File

@ -34,6 +34,40 @@ import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygos
import java.io.IOException; import java.io.IOException;
/**
* Codec for decoding the output format of the SnpEff variant effect predictor tool
* (http://snpeff.sourceforge.net/).
*
* This format has 23 tab-delimited fields:
*
* Chromosome
* Position
* Reference
* Change
* Change Type: {SNP, MNP, INS, DEL}
* Zygosity: {Hom, Het}
* Quality
* Coverage
* Warnings
* Gene ID
* Gene Name
* Bio Type
* Transcript ID
* Exon ID
* Exon Rank
* Effect
* Old/New Amino Acid
* Old/New Codon
* Codon Num
* CDS Size
* Codons Around
* Amino Acids Around
* Custom Interval ID
*
* We treat all except the Chromosome, Position, and Effect fields as optional.
*
* @author David Roazen
*/
public class SnpEffCodec implements FeatureCodec { public class SnpEffCodec implements FeatureCodec {
public static final int EXPECTED_NUMBER_OF_FIELDS = 23; public static final int EXPECTED_NUMBER_OF_FIELDS = 23;
@ -64,9 +98,13 @@ public class SnpEffCodec implements FeatureCodec {
"AAs around", "AAs around",
"Custom_interval_ID" "Custom_interval_ID"
}; };
// The "Chromo", "Position", and "Effect" fields are required to be non-empty in every SnpEff output line:
public static final int[] REQUIRED_FIELDS = { 0, 1, 15 }; public static final int[] REQUIRED_FIELDS = { 0, 1, 15 };
public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE"; public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE";
public Feature decodeLoc ( String line ) { public Feature decodeLoc ( String line ) {
return decode(line); return decode(line);
} }
@ -101,6 +139,11 @@ public class SnpEffCodec implements FeatureCodec {
Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]); Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]);
boolean isNonCodingGene = isNonCodingGene(tokens[15]); boolean isNonCodingGene = isNonCodingGene(tokens[15]);
// Split the effect field into three subfields if the WITHIN_NON_CODING_GENE flag is present,
// otherwise split it into two subfields. We need this limit to prevent the extra effect-related information
// in the final field (when present) from being inappropriately tokenized:
int effectFieldTokenLimit = isNonCodingGene ? 3 : 2; int effectFieldTokenLimit = isNonCodingGene ? 3 : 2;
String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit); String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit);
EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene); EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene);
@ -150,6 +193,9 @@ public class SnpEffCodec implements FeatureCodec {
private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) { private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) {
String effectName = ""; String effectName = "";
// If there's a WITHIN_NON_CODING_GENE flag, the effect name will be in the second subfield,
// otherwise it will be in the first subfield:
if ( effectFieldTokens.length > 1 && isNonCodingGene ) { if ( effectFieldTokens.length > 1 && isNonCodingGene ) {
effectName = effectFieldTokens[1].trim(); effectName = effectFieldTokens[1].trim();
} }
@ -161,6 +207,9 @@ public class SnpEffCodec implements FeatureCodec {
} }
private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) { private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) {
// The extra effect-related information, if present, will always be the last subfield:
if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) { if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) {
return effectFieldTokens[effectFieldTokens.length - 1].trim(); return effectFieldTokens[effectFieldTokens.length - 1].trim();
} }

View File

@ -24,8 +24,14 @@
package org.broadinstitute.sting.utils.codecs.snpEff; package org.broadinstitute.sting.utils.codecs.snpEff;
/**
* A set of constants associated with the SnpEff codec.
*
* @author David Roazen
*/
public class SnpEffConstants { public class SnpEffConstants {
// Possible SnpEff biological effects and their associated impacts:
public enum EffectType { public enum EffectType {
START_GAINED (EffectImpact.HIGH), START_GAINED (EffectImpact.HIGH),
START_LOST (EffectImpact.HIGH), START_LOST (EffectImpact.HIGH),
@ -93,6 +99,7 @@ public class SnpEffConstants {
} }
} }
// The kinds of variants supported by the SnpEff output format:
public enum ChangeType { public enum ChangeType {
SNP, SNP,
MNP, MNP,
@ -100,6 +107,7 @@ public class SnpEffConstants {
DEL DEL
} }
// Possible zygosities of SnpEff variants:
public enum Zygosity { public enum Zygosity {
Hom, Hom,
Het Het

View File

@ -26,15 +26,26 @@ package org.broadinstitute.sting.utils.codecs.snpEff;
import org.broad.tribble.Feature; import org.broad.tribble.Feature;
import java.util.NoSuchElementException;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType; import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact; import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType; import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity; import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
/**
* Feature returned by the SnpEff codec -- stores the parsed field values from a line of SnpEff output.
*
* Many fields are optional, and missing values are represented by nulls. You should always call the
* hasX() method before calling the corresponding getX() method. Required fields can never be null
* and do not have a hasX() method.
*
* @author David Roazen
*/
public class SnpEffFeature implements Feature { public class SnpEffFeature implements Feature {
private String contig; private String contig; // REQUIRED FIELD
private long position; private long position; // REQUIRED FIELD
private String reference; private String reference;
private String change; private String change;
private ChangeType changeType; private ChangeType changeType;
@ -48,8 +59,8 @@ public class SnpEffFeature implements Feature {
private String transcriptID; private String transcriptID;
private String exonID; private String exonID;
private Integer exonRank; private Integer exonRank;
private boolean isNonCodingGene; private boolean isNonCodingGene; // REQUIRED FIELD
private EffectType effect; private EffectType effect; // REQUIRED FIELD
private String effectExtraInformation; private String effectExtraInformation;
private String oldAndNewAA; private String oldAndNewAA;
private String oldAndNewCodon; private String oldAndNewCodon;
@ -85,6 +96,10 @@ public class SnpEffFeature implements Feature {
String aasAround, String aasAround,
String customIntervalID ) { String customIntervalID ) {
if ( contig == null || effect == null ) {
throw new IllegalArgumentException("contig and effect cannot be null, as they are required fields");
}
this.contig = contig; this.contig = contig;
this.position = position; this.position = position;
this.reference = reference; this.reference = reference;
@ -113,6 +128,10 @@ public class SnpEffFeature implements Feature {
} }
public boolean isHigherImpactThan ( SnpEffFeature other ) { public boolean isHigherImpactThan ( SnpEffFeature other ) {
// If one effect is in a non-coding gene and the other is not, the effect NOT in the
// non-coding gene has higher impact:
if ( ! isNonCodingGene() && other.isNonCodingGene() ) { if ( ! isNonCodingGene() && other.isNonCodingGene() ) {
return true; return true;
} }
@ -120,6 +139,9 @@ public class SnpEffFeature implements Feature {
return false; return false;
} }
// Otherwise, both effects are either in or not in a non-coding gene, so we compare the impacts
// of the effects themselves as defined in the SnpEffConstants class:
return getEffectImpact().isHigherImpactThan(other.getEffectImpact()); return getEffectImpact().isHigherImpactThan(other.getEffectImpact());
} }
@ -140,6 +162,7 @@ public class SnpEffFeature implements Feature {
} }
public String getReference() { public String getReference() {
if ( reference == null ) throw new NoSuchElementException("This feature has no reference field");
return reference; return reference;
} }
@ -148,6 +171,7 @@ public class SnpEffFeature implements Feature {
} }
public String getChange() { public String getChange() {
if ( change == null ) throw new NoSuchElementException("This feature has no change field");
return change; return change;
} }
@ -156,6 +180,7 @@ public class SnpEffFeature implements Feature {
} }
public ChangeType getChangeType() { public ChangeType getChangeType() {
if ( changeType == null ) throw new NoSuchElementException("This feature has no changeType field");
return changeType; return changeType;
} }
@ -164,6 +189,7 @@ public class SnpEffFeature implements Feature {
} }
public Zygosity getZygosity() { public Zygosity getZygosity() {
if ( zygosity == null ) throw new NoSuchElementException("This feature has no zygosity field");
return zygosity; return zygosity;
} }
@ -172,6 +198,7 @@ public class SnpEffFeature implements Feature {
} }
public Double getQuality() { public Double getQuality() {
if ( quality == null ) throw new NoSuchElementException("This feature has no quality field");
return quality; return quality;
} }
@ -180,6 +207,7 @@ public class SnpEffFeature implements Feature {
} }
public Long getCoverage() { public Long getCoverage() {
if ( coverage == null ) throw new NoSuchElementException("This feature has no coverage field");
return coverage; return coverage;
} }
@ -188,6 +216,7 @@ public class SnpEffFeature implements Feature {
} }
public String getWarnings() { public String getWarnings() {
if ( warnings == null ) throw new NoSuchElementException("This feature has no warnings field");
return warnings; return warnings;
} }
@ -196,6 +225,7 @@ public class SnpEffFeature implements Feature {
} }
public String getGeneID() { public String getGeneID() {
if ( geneID == null ) throw new NoSuchElementException("This feature has no geneID field");
return geneID; return geneID;
} }
@ -204,6 +234,7 @@ public class SnpEffFeature implements Feature {
} }
public String getGeneName() { public String getGeneName() {
if ( geneName == null ) throw new NoSuchElementException("This feature has no geneName field");
return geneName; return geneName;
} }
@ -212,6 +243,7 @@ public class SnpEffFeature implements Feature {
} }
public String getBioType() { public String getBioType() {
if ( bioType == null ) throw new NoSuchElementException("This feature has no bioType field");
return bioType; return bioType;
} }
@ -220,6 +252,7 @@ public class SnpEffFeature implements Feature {
} }
public String getTranscriptID() { public String getTranscriptID() {
if ( transcriptID == null ) throw new NoSuchElementException("This feature has no transcriptID field");
return transcriptID; return transcriptID;
} }
@ -228,6 +261,7 @@ public class SnpEffFeature implements Feature {
} }
public String getExonID() { public String getExonID() {
if ( exonID == null ) throw new NoSuchElementException("This feature has no exonID field");
return exonID; return exonID;
} }
@ -236,6 +270,7 @@ public class SnpEffFeature implements Feature {
} }
public Integer getExonRank() { public Integer getExonRank() {
if ( exonRank == null ) throw new NoSuchElementException("This feature has no exonRank field");
return exonRank; return exonRank;
} }
@ -256,6 +291,7 @@ public class SnpEffFeature implements Feature {
} }
public String getEffectExtraInformation() { public String getEffectExtraInformation() {
if ( effectExtraInformation == null ) throw new NoSuchElementException("This feature has no effectExtraInformation field");
return effectExtraInformation; return effectExtraInformation;
} }
@ -264,6 +300,7 @@ public class SnpEffFeature implements Feature {
} }
public String getOldAndNewAA() { public String getOldAndNewAA() {
if ( oldAndNewAA == null ) throw new NoSuchElementException("This feature has no oldAndNewAA field");
return oldAndNewAA; return oldAndNewAA;
} }
@ -272,6 +309,7 @@ public class SnpEffFeature implements Feature {
} }
public String getOldAndNewCodon() { public String getOldAndNewCodon() {
if ( oldAndNewCodon == null ) throw new NoSuchElementException("This feature has no oldAndNewCodon field");
return oldAndNewCodon; return oldAndNewCodon;
} }
@ -280,6 +318,7 @@ public class SnpEffFeature implements Feature {
} }
public Integer getCodonNum() { public Integer getCodonNum() {
if ( codonNum == null ) throw new NoSuchElementException("This feature has no codonNum field");
return codonNum; return codonNum;
} }
@ -288,6 +327,7 @@ public class SnpEffFeature implements Feature {
} }
public Integer getCdsSize() { public Integer getCdsSize() {
if ( cdsSize == null ) throw new NoSuchElementException("This feature has no cdsSize field");
return cdsSize; return cdsSize;
} }
@ -296,6 +336,7 @@ public class SnpEffFeature implements Feature {
} }
public String getCodonsAround() { public String getCodonsAround() {
if ( codonsAround == null ) throw new NoSuchElementException("This feature has no codonsAround field");
return codonsAround; return codonsAround;
} }
@ -304,6 +345,7 @@ public class SnpEffFeature implements Feature {
} }
public String getAasAround() { public String getAasAround() {
if ( aasAround == null ) throw new NoSuchElementException("This feature has no aasAround field");
return aasAround; return aasAround;
} }
@ -312,6 +354,7 @@ public class SnpEffFeature implements Feature {
} }
public String getCustomIntervalID() { public String getCustomIntervalID() {
if ( customIntervalID == null ) throw new NoSuchElementException("This feature has no customIntervalID field");
return customIntervalID; return customIntervalID;
} }

View File

@ -125,4 +125,15 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("Testing lookup vcf tabix vs. vcf tribble", spec); executeTest("Testing lookup vcf tabix vs. vcf tribble", spec);
} }
} }
@Test
public void testSnpEffAnnotations() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantAnnotator -R " + b37KGReference + " -o %s -A SnpEff -B:variant,VCF " + validationDataLocation + "/1000G.exomes.vcf " +
"-B:SnpEff,SnpEff " + validationDataLocation + "/snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out" + " -L 1",
1,
Arrays.asList("5fe3644744d3c084a179c3d204555333")
);
executeTest("Testing SnpEff annotations", spec);
}
} }