Merge branch 'snpEff'

This commit is contained in:
David Roazen 2011-08-08 22:12:14 -04:00
commit b180a1311a
6 changed files with 1233 additions and 0 deletions

View File

@ -0,0 +1,166 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants;
import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* A set of genomic annotations based on the output of the SnpEff variant effect predictor tool
* (http://snpeff.sourceforge.net/).
*
* For each variant, chooses one of the effects of highest biological impact from the SnpEff
* output file (which must be bound to an RMD track named "SnpEff"), and adds annotations
* on that effect.
*
* The possible biological effects and their associated impacts are defined in the class:
* org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants
*
* @author David Roazen
*/
public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation {
// SnpEff annotation key names:
public static final String GENE_ID_KEY = "GENE_ID";
public static final String GENE_NAME_KEY = "GENE_NAME";
public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID";
public static final String EXON_ID_KEY = "EXON_ID";
public static final String EXON_RANK_KEY = "EXON_RANK";
public static final String WITHIN_NON_CODING_GENE_KEY = "WITHIN_NON_CODING_GENE";
public static final String EFFECT_KEY = "EFFECT";
public static final String EFFECT_IMPACT_KEY = "EFFECT_IMPACT";
public static final String EFFECT_EXTRA_INFORMATION_KEY = "EFFECT_EXTRA_INFORMATION";
public static final String OLD_NEW_AA_KEY = "OLD_NEW_AA";
public static final String OLD_NEW_CODON_KEY = "OLD_NEW_CODON";
public static final String CODON_NUM_KEY = "CODON_NUM";
public static final String CDS_SIZE_KEY = "CDS_SIZE";
// Name of the RMD track bound to the raw SnpEff-generated output file:
public static final String RMD_TRACK_NAME = "SnpEff";
public Map<String, Object> annotate ( RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc ) {
List<Object> snpEffFeatures = tracker.getReferenceMetaData(RMD_TRACK_NAME);
// Add only annotations for one of the most biologically-significant effects as defined in
// the SnpEffConstants class:
SnpEffFeature mostSignificantEffect = getMostSignificantEffect(snpEffFeatures);
return generateAnnotations(mostSignificantEffect);
}
private SnpEffFeature getMostSignificantEffect ( List<Object> snpEffFeatures ) {
SnpEffFeature mostSignificantEffect = null;
for ( Object feature : snpEffFeatures ) {
SnpEffFeature snpEffFeature = (SnpEffFeature)feature;
if ( mostSignificantEffect == null ||
snpEffFeature.isHigherImpactThan(mostSignificantEffect) ) {
mostSignificantEffect = snpEffFeature;
}
}
return mostSignificantEffect;
}
private Map<String, Object> generateAnnotations ( SnpEffFeature mostSignificantEffect ) {
Map<String, Object> annotations = new LinkedHashMap<String, Object>(Utils.optimumHashSize(getKeyNames().size()));
if ( mostSignificantEffect.hasGeneID() )
annotations.put(GENE_ID_KEY, mostSignificantEffect.getGeneID());
if ( mostSignificantEffect.hasGeneName() )
annotations.put(GENE_NAME_KEY, mostSignificantEffect.getGeneName());
if ( mostSignificantEffect.hasTranscriptID() )
annotations.put(TRANSCRIPT_ID_KEY, mostSignificantEffect.getTranscriptID());
if ( mostSignificantEffect.hasExonID() )
annotations.put(EXON_ID_KEY, mostSignificantEffect.getExonID());
if ( mostSignificantEffect.hasExonRank() )
annotations.put(EXON_RANK_KEY, Integer.toString(mostSignificantEffect.getExonRank()));
if ( mostSignificantEffect.isNonCodingGene() )
annotations.put(WITHIN_NON_CODING_GENE_KEY, null);
annotations.put(EFFECT_KEY, mostSignificantEffect.getEffect().toString());
annotations.put(EFFECT_IMPACT_KEY, mostSignificantEffect.getEffectImpact().toString());
if ( mostSignificantEffect.hasEffectExtraInformation() )
annotations.put(EFFECT_EXTRA_INFORMATION_KEY, mostSignificantEffect.getEffectExtraInformation());
if ( mostSignificantEffect.hasOldAndNewAA() )
annotations.put(OLD_NEW_AA_KEY, mostSignificantEffect.getOldAndNewAA());
if ( mostSignificantEffect.hasOldAndNewCodon() )
annotations.put(OLD_NEW_CODON_KEY, mostSignificantEffect.getOldAndNewCodon());
if ( mostSignificantEffect.hasCodonNum() )
annotations.put(CODON_NUM_KEY, Integer.toString(mostSignificantEffect.getCodonNum()));
if ( mostSignificantEffect.hasCdsSize() )
annotations.put(CDS_SIZE_KEY, Integer.toString(mostSignificantEffect.getCdsSize()));
return annotations;
}
public List<String> getKeyNames() {
return Arrays.asList( GENE_ID_KEY,
GENE_NAME_KEY,
TRANSCRIPT_ID_KEY,
EXON_ID_KEY,
EXON_RANK_KEY,
WITHIN_NON_CODING_GENE_KEY,
EFFECT_KEY,
EFFECT_IMPACT_KEY,
EFFECT_EXTRA_INFORMATION_KEY,
OLD_NEW_AA_KEY,
OLD_NEW_CODON_KEY,
CODON_NUM_KEY,
CDS_SIZE_KEY
);
}
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(
new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID"),
new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name"),
new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID"),
new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID"),
new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank"),
new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If present, gene is non-coding"),
new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "One of the most high-impact effects across all transcripts at this site"),
new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the effect " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the effect"),
new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid"),
new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon"),
new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number"),
new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size")
);
}
}

View File

@ -0,0 +1,258 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.snpEff;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.LineReader;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
import java.io.IOException;
/**
* Codec for decoding the output format of the SnpEff variant effect predictor tool
* (http://snpeff.sourceforge.net/).
*
* This format has 23 tab-delimited fields:
*
* Chromosome
* Position
* Reference
* Change
* Change Type: {SNP, MNP, INS, DEL}
* Zygosity: {Hom, Het}
* Quality
* Coverage
* Warnings
* Gene ID
* Gene Name
* Bio Type
* Transcript ID
* Exon ID
* Exon Rank
* Effect
* Old/New Amino Acid
* Old/New Codon
* Codon Num
* CDS Size
* Codons Around
* Amino Acids Around
* Custom Interval ID
*
* We treat all except the Chromosome, Position, and Effect fields as optional.
*
* @author David Roazen
*/
public class SnpEffCodec implements FeatureCodec {
public static final int EXPECTED_NUMBER_OF_FIELDS = 23;
public static final String FIELD_DELIMITER_PATTERN = "\\t";
public static final String EFFECT_FIELD_DELIMITER_PATTERN = "[,:]";
public static final String HEADER_LINE_START = "# ";
public static final String[] HEADER_FIELD_NAMES = { "Chromo",
"Position",
"Reference",
"Change",
"Change type",
"Homozygous",
"Quality",
"Coverage",
"Warnings",
"Gene_ID",
"Gene_name",
"Bio_type",
"Trancript_ID", // yes, this is how it's spelled in the SnpEff output
"Exon_ID",
"Exon_Rank",
"Effect",
"old_AA/new_AA",
"Old_codon/New_codon",
"Codon_Num(CDS)",
"CDS_size",
"Codons around",
"AAs around",
"Custom_interval_ID"
};
// The "Chromo", "Position", and "Effect" fields are required to be non-empty in every SnpEff output line:
public static final int[] REQUIRED_FIELDS = { 0, 1, 15 };
public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE";
public Feature decodeLoc ( String line ) {
return decode(line);
}
public Feature decode ( String line ) {
String[] tokens = line.split(FIELD_DELIMITER_PATTERN, -1);
if ( tokens.length != EXPECTED_NUMBER_OF_FIELDS ) {
throw new TribbleException.InvalidDecodeLine("Line does not have the expected (" + EXPECTED_NUMBER_OF_FIELDS +
") number of fields: found " + tokens.length + " fields.", line);
}
try {
trimAllFields(tokens);
checkForRequiredFields(tokens, line);
String contig = tokens[0];
long position = Long.parseLong(tokens[1]);
String reference = tokens[2].isEmpty() ? null : tokens[2];
String change = tokens[3].isEmpty() ? null : tokens[3];
ChangeType changeType = tokens[4].isEmpty() ? null : ChangeType.valueOf(tokens[4]);
Zygosity zygosity = tokens[5].isEmpty() ? null : Zygosity.valueOf(tokens[5]);
Double quality = tokens[6].isEmpty() ? null : Double.parseDouble(tokens[6]);
Long coverage = tokens[7].isEmpty() ? null : Long.parseLong(tokens[7]);
String warnings = tokens[8].isEmpty() ? null : tokens[8];
String geneID = tokens[9].isEmpty() ? null : tokens[9];
String geneName = tokens[10].isEmpty() ? null : tokens[10];
String bioType = tokens[11].isEmpty() ? null : tokens[11];
String transcriptID = tokens[12].isEmpty() ? null : tokens[12];
String exonID = tokens[13].isEmpty() ? null : tokens[13];
Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]);
boolean isNonCodingGene = isNonCodingGene(tokens[15]);
// Split the effect field into three subfields if the WITHIN_NON_CODING_GENE flag is present,
// otherwise split it into two subfields. We need this limit to prevent the extra effect-related information
// in the final field (when present) from being inappropriately tokenized:
int effectFieldTokenLimit = isNonCodingGene ? 3 : 2;
String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit);
EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene);
String effectExtraInformation = parseEffectExtraInformation(effectFieldTokens, isNonCodingGene);
String oldAndNewAA = tokens[16].isEmpty() ? null : tokens[16];
String oldAndNewCodon = tokens[17].isEmpty() ? null : tokens[17];
Integer codonNum = tokens[18].isEmpty() ? null : Integer.parseInt(tokens[18]);
Integer cdsSize = tokens[19].isEmpty() ? null : Integer.parseInt(tokens[19]);
String codonsAround = tokens[20].isEmpty() ? null : tokens[20];
String aasAround = tokens[21].isEmpty() ? null : tokens[21];
String customIntervalID = tokens[22].isEmpty() ? null : tokens[22];
return new SnpEffFeature(contig, position, reference, change, changeType, zygosity, quality, coverage,
warnings, geneID, geneName, bioType, transcriptID, exonID, exonRank, isNonCodingGene,
effect, effectExtraInformation, oldAndNewAA, oldAndNewCodon, codonNum, cdsSize,
codonsAround, aasAround, customIntervalID);
}
catch ( NumberFormatException e ) {
throw new TribbleException.InvalidDecodeLine("Error parsing a numeric field : " + e.getMessage(), line);
}
catch ( IllegalArgumentException e ) {
throw new TribbleException.InvalidDecodeLine("Illegal value in field: " + e.getMessage(), line);
}
}
private void trimAllFields ( String[] tokens ) {
for ( int i = 0; i < tokens.length; i++ ) {
tokens[i] = tokens[i].trim();
}
}
private void checkForRequiredFields ( String[] tokens, String line ) {
for ( int requiredFieldIndex : REQUIRED_FIELDS ) {
if ( tokens[requiredFieldIndex].isEmpty() ) {
throw new TribbleException.InvalidDecodeLine("Line is missing required field \"" +
HEADER_FIELD_NAMES[requiredFieldIndex] + "\"",
line);
}
}
}
private boolean isNonCodingGene ( String effectField ) {
return effectField.startsWith(NON_CODING_GENE_FLAG);
}
private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) {
String effectName = "";
// If there's a WITHIN_NON_CODING_GENE flag, the effect name will be in the second subfield,
// otherwise it will be in the first subfield:
if ( effectFieldTokens.length > 1 && isNonCodingGene ) {
effectName = effectFieldTokens[1].trim();
}
else {
effectName = effectFieldTokens[0].trim();
}
return EffectType.valueOf(effectName);
}
private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) {
// The extra effect-related information, if present, will always be the last subfield:
if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) {
return effectFieldTokens[effectFieldTokens.length - 1].trim();
}
return null;
}
public Class getFeatureType() {
return SnpEffFeature.class;
}
public Object readHeader ( LineReader reader ) {
String headerLine = "";
try {
headerLine = reader.readLine();
}
catch ( IOException e ) {
throw new TribbleException("Unable to read header line from input file.");
}
validateHeaderLine(headerLine);
return headerLine;
}
private void validateHeaderLine ( String headerLine ) {
if ( headerLine == null || ! headerLine.startsWith(HEADER_LINE_START) ) {
throw new TribbleException.InvalidHeader("Header line does not start with " + HEADER_LINE_START);
}
String[] headerTokens = headerLine.substring(HEADER_LINE_START.length()).split(FIELD_DELIMITER_PATTERN);
if ( headerTokens.length != EXPECTED_NUMBER_OF_FIELDS ) {
throw new TribbleException.InvalidHeader("Header line does not contain headings for the expected number (" +
EXPECTED_NUMBER_OF_FIELDS + ") of columns.");
}
for ( int columnIndex = 0; columnIndex < headerTokens.length; columnIndex++ ) {
if ( ! HEADER_FIELD_NAMES[columnIndex].equals(headerTokens[columnIndex]) ) {
throw new TribbleException.InvalidHeader("Header field #" + columnIndex + ": Expected \"" +
HEADER_FIELD_NAMES[columnIndex] + "\" but found \"" +
headerTokens[columnIndex] + "\"");
}
}
}
}

View File

@ -0,0 +1,115 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.snpEff;
/**
* A set of constants associated with the SnpEff codec.
*
* @author David Roazen
*/
public class SnpEffConstants {
// Possible SnpEff biological effects and their associated impacts:
public enum EffectType {
START_GAINED (EffectImpact.HIGH),
START_LOST (EffectImpact.HIGH),
EXON_DELETED (EffectImpact.HIGH),
FRAME_SHIFT (EffectImpact.HIGH),
STOP_GAINED (EffectImpact.HIGH),
STOP_LOST (EffectImpact.HIGH),
SPLICE_SITE_ACCEPTOR (EffectImpact.HIGH),
SPLICE_SITE_DONOR (EffectImpact.HIGH),
NON_SYNONYMOUS_CODING (EffectImpact.MODERATE),
UTR_5_DELETED (EffectImpact.MODERATE),
UTR_3_DELETED (EffectImpact.MODERATE),
CODON_INSERTION (EffectImpact.MODERATE),
CODON_CHANGE_PLUS_CODON_INSERTION (EffectImpact.MODERATE),
CODON_DELETION (EffectImpact.MODERATE),
CODON_CHANGE_PLUS_CODON_DELETION (EffectImpact.MODERATE),
NONE (EffectImpact.LOW),
CHROMOSOME (EffectImpact.LOW),
INTERGENIC (EffectImpact.LOW),
UPSTREAM (EffectImpact.LOW),
UTR_5_PRIME (EffectImpact.LOW),
SYNONYMOUS_START (EffectImpact.LOW),
NON_SYNONYMOUS_START (EffectImpact.LOW),
CDS (EffectImpact.LOW),
GENE (EffectImpact.LOW),
TRANSCRIPT (EffectImpact.LOW),
EXON (EffectImpact.LOW),
SYNONYMOUS_CODING (EffectImpact.LOW),
CODON_CHANGE (EffectImpact.LOW),
SYNONYMOUS_STOP (EffectImpact.LOW),
NON_SYNONYMOUS_STOP (EffectImpact.LOW),
INTRON (EffectImpact.LOW),
UTR_3_PRIME (EffectImpact.LOW),
DOWNSTREAM (EffectImpact.LOW),
INTRON_CONSERVED (EffectImpact.LOW),
INTERGENIC_CONSERVED (EffectImpact.LOW),
CUSTOM (EffectImpact.LOW);
private final EffectImpact impact;
EffectType ( EffectImpact impact ) {
this.impact = impact;
}
public EffectImpact getImpact() {
return impact;
}
}
public enum EffectImpact {
LOW (1),
MODERATE (2),
HIGH (3);
private final int severityRating;
EffectImpact ( int severityRating ) {
this.severityRating = severityRating;
}
public boolean isHigherImpactThan ( EffectImpact other ) {
return this.severityRating > other.severityRating;
}
}
// The kinds of variants supported by the SnpEff output format:
public enum ChangeType {
SNP,
MNP,
INS,
DEL
}
// Possible zygosities of SnpEff variants:
public enum Zygosity {
Hom,
Het
}
}

View File

@ -0,0 +1,423 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.snpEff;
import org.broad.tribble.Feature;
import java.util.NoSuchElementException;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
/**
* Feature returned by the SnpEff codec -- stores the parsed field values from a line of SnpEff output.
*
* Many fields are optional, and missing values are represented by nulls. You should always call the
* hasX() method before calling the corresponding getX() method. Required fields can never be null
* and do not have a hasX() method.
*
* @author David Roazen
*/
public class SnpEffFeature implements Feature {
private String contig; // REQUIRED FIELD
private long position; // REQUIRED FIELD
private String reference;
private String change;
private ChangeType changeType;
private Zygosity zygosity;
private Double quality;
private Long coverage;
private String warnings;
private String geneID;
private String geneName;
private String bioType;
private String transcriptID;
private String exonID;
private Integer exonRank;
private boolean isNonCodingGene; // REQUIRED FIELD
private EffectType effect; // REQUIRED FIELD
private String effectExtraInformation;
private String oldAndNewAA;
private String oldAndNewCodon;
private Integer codonNum;
private Integer cdsSize;
private String codonsAround;
private String aasAround;
private String customIntervalID;
public SnpEffFeature ( String contig,
long position,
String reference,
String change,
ChangeType changeType,
Zygosity zygosity,
Double quality,
Long coverage,
String warnings,
String geneID,
String geneName,
String bioType,
String transcriptID,
String exonID,
Integer exonRank,
boolean isNonCodingGene,
EffectType effect,
String effectExtraInformation,
String oldAndNewAA,
String oldAndNewCodon,
Integer codonNum,
Integer cdsSize,
String codonsAround,
String aasAround,
String customIntervalID ) {
if ( contig == null || effect == null ) {
throw new IllegalArgumentException("contig and effect cannot be null, as they are required fields");
}
this.contig = contig;
this.position = position;
this.reference = reference;
this.change = change;
this.changeType = changeType;
this.zygosity = zygosity;
this.quality = quality;
this.coverage = coverage;
this.warnings = warnings;
this.geneID = geneID;
this.geneName = geneName;
this.bioType = bioType;
this.transcriptID = transcriptID;
this.exonID = exonID;
this.exonRank = exonRank;
this.isNonCodingGene = isNonCodingGene;
this.effect = effect;
this.effectExtraInformation = effectExtraInformation;
this.oldAndNewAA = oldAndNewAA;
this.oldAndNewCodon = oldAndNewCodon;
this.codonNum = codonNum;
this.cdsSize = cdsSize;
this.codonsAround = codonsAround;
this.aasAround = aasAround;
this.customIntervalID = customIntervalID;
}
public boolean isHigherImpactThan ( SnpEffFeature other ) {
// If one effect is in a non-coding gene and the other is not, the effect NOT in the
// non-coding gene has higher impact:
if ( ! isNonCodingGene() && other.isNonCodingGene() ) {
return true;
}
else if ( isNonCodingGene() && ! other.isNonCodingGene() ) {
return false;
}
// Otherwise, both effects are either in or not in a non-coding gene, so we compare the impacts
// of the effects themselves as defined in the SnpEffConstants class:
return getEffectImpact().isHigherImpactThan(other.getEffectImpact());
}
public String getChr() {
return contig;
}
public int getStart() {
return (int)position;
}
public int getEnd() {
return (int)position;
}
public boolean hasReference() {
return reference != null;
}
public String getReference() {
if ( reference == null ) throw new NoSuchElementException("This feature has no reference field");
return reference;
}
public boolean hasChange() {
return change != null;
}
public String getChange() {
if ( change == null ) throw new NoSuchElementException("This feature has no change field");
return change;
}
public boolean hasChangeType() {
return changeType != null;
}
public ChangeType getChangeType() {
if ( changeType == null ) throw new NoSuchElementException("This feature has no changeType field");
return changeType;
}
public boolean hasZygosity() {
return zygosity != null;
}
public Zygosity getZygosity() {
if ( zygosity == null ) throw new NoSuchElementException("This feature has no zygosity field");
return zygosity;
}
public boolean hasQuality() {
return quality != null;
}
public Double getQuality() {
if ( quality == null ) throw new NoSuchElementException("This feature has no quality field");
return quality;
}
public boolean hasCoverage() {
return coverage != null;
}
public Long getCoverage() {
if ( coverage == null ) throw new NoSuchElementException("This feature has no coverage field");
return coverage;
}
public boolean hasWarnings() {
return warnings != null;
}
public String getWarnings() {
if ( warnings == null ) throw new NoSuchElementException("This feature has no warnings field");
return warnings;
}
public boolean hasGeneID() {
return geneID != null;
}
public String getGeneID() {
if ( geneID == null ) throw new NoSuchElementException("This feature has no geneID field");
return geneID;
}
public boolean hasGeneName() {
return geneName != null;
}
public String getGeneName() {
if ( geneName == null ) throw new NoSuchElementException("This feature has no geneName field");
return geneName;
}
public boolean hasBioType() {
return bioType != null;
}
public String getBioType() {
if ( bioType == null ) throw new NoSuchElementException("This feature has no bioType field");
return bioType;
}
public boolean hasTranscriptID() {
return transcriptID != null;
}
public String getTranscriptID() {
if ( transcriptID == null ) throw new NoSuchElementException("This feature has no transcriptID field");
return transcriptID;
}
public boolean hasExonID() {
return exonID != null;
}
public String getExonID() {
if ( exonID == null ) throw new NoSuchElementException("This feature has no exonID field");
return exonID;
}
public boolean hasExonRank() {
return exonRank != null;
}
public Integer getExonRank() {
if ( exonRank == null ) throw new NoSuchElementException("This feature has no exonRank field");
return exonRank;
}
public boolean isNonCodingGene() {
return isNonCodingGene;
}
public EffectType getEffect() {
return effect;
}
public EffectImpact getEffectImpact() {
return effect.getImpact();
}
public boolean hasEffectExtraInformation() {
return effectExtraInformation != null;
}
public String getEffectExtraInformation() {
if ( effectExtraInformation == null ) throw new NoSuchElementException("This feature has no effectExtraInformation field");
return effectExtraInformation;
}
public boolean hasOldAndNewAA() {
return oldAndNewAA != null;
}
public String getOldAndNewAA() {
if ( oldAndNewAA == null ) throw new NoSuchElementException("This feature has no oldAndNewAA field");
return oldAndNewAA;
}
public boolean hasOldAndNewCodon() {
return oldAndNewCodon != null;
}
public String getOldAndNewCodon() {
if ( oldAndNewCodon == null ) throw new NoSuchElementException("This feature has no oldAndNewCodon field");
return oldAndNewCodon;
}
public boolean hasCodonNum() {
return codonNum != null;
}
public Integer getCodonNum() {
if ( codonNum == null ) throw new NoSuchElementException("This feature has no codonNum field");
return codonNum;
}
public boolean hasCdsSize() {
return cdsSize != null;
}
public Integer getCdsSize() {
if ( cdsSize == null ) throw new NoSuchElementException("This feature has no cdsSize field");
return cdsSize;
}
public boolean hasCodonsAround() {
return codonsAround != null;
}
public String getCodonsAround() {
if ( codonsAround == null ) throw new NoSuchElementException("This feature has no codonsAround field");
return codonsAround;
}
public boolean hadAasAround() {
return aasAround != null;
}
public String getAasAround() {
if ( aasAround == null ) throw new NoSuchElementException("This feature has no aasAround field");
return aasAround;
}
public boolean hasCustomIntervalID() {
return customIntervalID != null;
}
public String getCustomIntervalID() {
if ( customIntervalID == null ) throw new NoSuchElementException("This feature has no customIntervalID field");
return customIntervalID;
}
public boolean equals ( Object o ) {
if ( o == null || ! (o instanceof SnpEffFeature) ) {
return false;
}
SnpEffFeature other = (SnpEffFeature)o;
return contig.equals(other.contig) &&
position == other.position &&
(reference == null ? other.reference == null : reference.equals(other.reference)) &&
(change == null ? other.change == null : change.equals(other.change)) &&
changeType == other.changeType &&
zygosity == other.zygosity &&
(quality == null ? other.quality == null : quality.equals(other.quality)) &&
(coverage == null ? other.coverage == null : coverage.equals(other.coverage)) &&
(warnings == null ? other.warnings == null : warnings.equals(other.warnings)) &&
(geneID == null ? other.geneID == null : geneID.equals(other.geneID)) &&
(geneName == null ? other.geneName == null : geneName.equals(other.geneName)) &&
(bioType == null ? other.bioType == null : bioType.equals(other.bioType)) &&
(transcriptID == null ? other.transcriptID == null : transcriptID.equals(other.transcriptID)) &&
(exonID == null ? other.exonID == null : exonID.equals(other.exonID)) &&
(exonRank == null ? other.exonRank == null : exonRank.equals(other.exonRank)) &&
isNonCodingGene == other.isNonCodingGene &&
effect == other.effect &&
(effectExtraInformation == null ? other.effectExtraInformation == null : effectExtraInformation.equals(other.effectExtraInformation)) &&
(oldAndNewAA == null ? other.oldAndNewAA == null : oldAndNewAA.equals(other.oldAndNewAA)) &&
(oldAndNewCodon == null ? other.oldAndNewCodon == null : oldAndNewCodon.equals(other.oldAndNewCodon)) &&
(codonNum == null ? other.codonNum == null : codonNum.equals(other.codonNum)) &&
(cdsSize == null ? other.cdsSize == null : cdsSize.equals(other.cdsSize)) &&
(codonsAround == null ? other.codonsAround == null : codonsAround.equals(other.codonsAround)) &&
(aasAround == null ? other.aasAround == null : aasAround.equals(other.aasAround)) &&
(customIntervalID == null ? other.customIntervalID == null : customIntervalID.equals(other.customIntervalID));
}
public String toString() {
return "[Contig: " + contig +
" Position: " + position +
" Reference: " + reference +
" Change: " + change +
" Change Type: " + changeType +
" Zygosity: " + zygosity +
" Quality: " + quality +
" Coverage: " + coverage +
" Warnings: " + warnings +
" Gene ID: " + geneID +
" Gene Name: " + geneName +
" Bio Type: " + bioType +
" Transcript ID: " + transcriptID +
" Exon ID: " + exonID +
" Exon Rank: " + exonRank +
" Non-Coding Gene: " + isNonCodingGene +
" Effect: " + effect +
" Effect Extra Information: " + effectExtraInformation +
" Old/New AA: " + oldAndNewAA +
" Old/New Codon: " + oldAndNewCodon +
" Codon Num: " + codonNum +
" CDS Size: " + cdsSize +
" Codons Around: " + codonsAround +
" AAs Around: " + aasAround +
" Custom Interval ID: " + customIntervalID +
"]";
}
}

View File

@ -125,4 +125,16 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("Testing lookup vcf tabix vs. vcf tribble", spec);
}
}
@Test
public void testSnpEffAnnotations() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantAnnotator -R " + b37KGReference + " -NO_HEADER -o %s -A SnpEff -B:variant,VCF " +
validationDataLocation + "1000G.exomes.vcf -B:SnpEff,SnpEff " + validationDataLocation +
"snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out -L 1:26,000,000-26,500,000",
1,
Arrays.asList("c08648a078368c80530bff004b3157f1")
);
executeTest("Testing SnpEff annotations", spec);
}
}

View File

@ -0,0 +1,259 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.snpEff;
import org.apache.commons.io.input.ReaderInputStream;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.LineReader;
import org.testng.Assert;
import org.testng.annotations.Test;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType;
import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity;
import java.io.StringReader;
public class SnpEffCodecUnitTest {
@Test
public void testParseWellFormedSnpEffHeaderLine() {
String wellFormedSnpEffHeaderLine = "# Chromo\tPosition\tReference\tChange\tChange type\t" +
"Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" +
"Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" +
"AAs around\tCustom_interval_ID";
SnpEffCodec codec = new SnpEffCodec();
LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(wellFormedSnpEffHeaderLine)));
String headerReturned = (String)codec.readHeader(reader);
Assert.assertEquals(headerReturned, wellFormedSnpEffHeaderLine);
}
@Test(expectedExceptions = TribbleException.InvalidHeader.class)
public void testParseWrongNumberOfFieldsSnpEffHeaderLine() {
String wrongNumberOfFieldsSnpEffHeaderLine = "# Chromo\tPosition\tReference\tChange\tChange type\t" +
"Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" +
"Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" +
"AAs around";
SnpEffCodec codec = new SnpEffCodec();
LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(wrongNumberOfFieldsSnpEffHeaderLine)));
codec.readHeader(reader);
}
@Test(expectedExceptions = TribbleException.InvalidHeader.class)
public void testParseMisnamedColumnSnpEffHeaderLine() {
String misnamedColumnSnpEffHeaderLine = "# Chromo\tPosition\tRef\tChange\tChange type\t" +
"Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" +
"Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" +
"AAs around\tCustom_interval_ID";
SnpEffCodec codec = new SnpEffCodec();
LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(misnamedColumnSnpEffHeaderLine)));
codec.readHeader(reader);
}
@Test
public void testParseSimpleEffectSnpEffLine() {
String simpleEffectSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" +
"OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\t918\t\t\t";
SnpEffFeature expectedFeature = new SnpEffFeature("1",
69428l,
"T",
"G",
ChangeType.SNP,
Zygosity.Hom,
6049.69,
61573l,
null,
"ENSG00000177693",
"OR4F5",
"mRNA",
"ENST00000326183",
"exon_1_69055_70108",
1,
false,
EffectType.NON_SYNONYMOUS_CODING,
null,
"F/C",
"TTT/TGT",
113,
918,
null,
null,
null
);
SnpEffCodec codec = new SnpEffCodec();
SnpEffFeature feature = (SnpEffFeature)codec.decode(simpleEffectSnpEffLine);
Assert.assertEquals(feature, expectedFeature);
}
@Test
public void testParseNonCodingRegionSnpEffLine() {
String nonCodingRegionSnpEffLine = "1\t1337592\tG\tC\tSNP\tHom\t1935.52\t21885\t\tENSG00000250188\t" +
"RP4-758J18.5\tmRNA\tENST00000514958\texon_1_1337454_1338076\t2\tWITHIN_NON_CODING_GENE, NON_SYNONYMOUS_CODING\t" +
"L/V\tCTA/GTA\t272\t952\t\t\t";
SnpEffFeature expectedFeature = new SnpEffFeature("1",
1337592l,
"G",
"C",
ChangeType.SNP,
Zygosity.Hom,
1935.52,
21885l,
null,
"ENSG00000250188",
"RP4-758J18.5",
"mRNA",
"ENST00000514958",
"exon_1_1337454_1338076",
2,
true,
EffectType.NON_SYNONYMOUS_CODING,
null,
"L/V",
"CTA/GTA",
272,
952,
null,
null,
null
);
SnpEffCodec codec = new SnpEffCodec();
SnpEffFeature feature = (SnpEffFeature)codec.decode(nonCodingRegionSnpEffLine);
Assert.assertEquals(feature, expectedFeature);
}
@Test
public void testParseExtraEffectInformationSnpEffLine() {
String extraEffectInformationSnpEffLine = "1\t879537\tT\tC\tSNP\tHom\t341.58\t13733\t\tENSG00000187634\tSAMD11\t" +
"mRNA\tENST00000341065\t\t\tUTR_3_PRIME: 4 bases from transcript end\t\t\t\t\t\t\t";
SnpEffFeature expectedFeature = new SnpEffFeature("1",
879537l,
"T",
"C",
ChangeType.SNP,
Zygosity.Hom,
341.58,
13733l,
null,
"ENSG00000187634",
"SAMD11",
"mRNA",
"ENST00000341065",
null,
null,
false,
EffectType.UTR_3_PRIME,
"4 bases from transcript end",
null,
null,
null,
null,
null,
null,
null
);
SnpEffCodec codec = new SnpEffCodec();
SnpEffFeature feature = (SnpEffFeature)codec.decode(extraEffectInformationSnpEffLine);
Assert.assertEquals(feature, expectedFeature);
}
@Test
public void testParseMultiEffectSnpEffLine() {
String multiEffectSnpEffLine = "1\t901901\tC\tT\tSNP\tHom\t162.91\t4646\t\tENSG00000187583\tPLEKHN1\tmRNA\t" +
"ENST00000379410\texon_1_901877_901994\t1\tSTART_GAINED: ATG, UTR_5_PRIME: 11 bases from TSS\t\t\t\t\t\t\t";
SnpEffFeature expectedFeature = new SnpEffFeature("1",
901901l,
"C",
"T",
ChangeType.SNP,
Zygosity.Hom,
162.91,
4646l,
null,
"ENSG00000187583",
"PLEKHN1",
"mRNA",
"ENST00000379410",
"exon_1_901877_901994",
1,
false,
EffectType.START_GAINED,
"ATG, UTR_5_PRIME: 11 bases from TSS",
null,
null,
null,
null,
null,
null,
null
);
SnpEffCodec codec = new SnpEffCodec();
SnpEffFeature feature = (SnpEffFeature)codec.decode(multiEffectSnpEffLine);
Assert.assertEquals(feature, expectedFeature);
}
@Test(expectedExceptions = TribbleException.InvalidDecodeLine.class)
public void testParseWrongNumberOfFieldsSnpEffLine() {
String wrongNumberOfFieldsSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" +
"OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\t918\t\t";
SnpEffCodec codec = new SnpEffCodec();
SnpEffFeature feature = (SnpEffFeature)codec.decode(wrongNumberOfFieldsSnpEffLine);
}
@Test(expectedExceptions = TribbleException.InvalidDecodeLine.class)
public void testParseBlankEffectFieldSnpEffLine() {
String blankEffectFieldSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" +
"OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\t\tF/C\tTTT/TGT\t113\t918\t\t\t";
SnpEffCodec codec = new SnpEffCodec();
SnpEffFeature feature = (SnpEffFeature)codec.decode(blankEffectFieldSnpEffLine);
}
@Test(expectedExceptions = TribbleException.InvalidDecodeLine.class)
public void testParseInvalidNumericFieldSnpEffLine() {
String invalidNumericFieldSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" +
"OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\tfoo\t\t\t";;
SnpEffCodec codec = new SnpEffCodec();
SnpEffFeature feature = (SnpEffFeature)codec.decode(invalidNumericFieldSnpEffLine);
}
}