diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java index 30b2f828d..8e241bb2c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java @@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; * @version 0.1 */ public class PlatformFilter extends ReadFilter { - @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false) + @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false) protected String[] PLFilterNames; public boolean filterOut(SAMRecord rec) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index fdfac6bf7..4f072e88c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -68,6 +68,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; * -I input1.bam \ * -I input2.bam \ * --read_filter MappingQualityZero + * + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T PrintReads \ + * -o output.bam \ + * -I input.bam \ + * -n 2000 * * */ diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 350c683c2..bb3685fb5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -24,7 +24,9 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -32,10 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants; -import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -46,134 +45,421 @@ import java.util.*; * (http://snpeff.sourceforge.net/). * * For each variant, chooses one of the effects of highest biological impact from the SnpEff - * output file (which must be provided on the command line via --snpEffFile:SnpEff ), + * output file (which must be provided on the command line via --snpEffFile filename.vcf), * and adds annotations on that effect. * - * The possible biological effects and their associated impacts are defined in the class: - * org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants - * * @author David Roazen */ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation { - // SnpEff annotation key names: - public static final String GENE_ID_KEY = "GENE_ID"; - public static final String GENE_NAME_KEY = "GENE_NAME"; - public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID"; - public static final String EXON_ID_KEY = "EXON_ID"; - public static final String EXON_RANK_KEY = "EXON_RANK"; - public static final String WITHIN_NON_CODING_GENE_KEY = "WITHIN_NON_CODING_GENE"; - public static final String EFFECT_KEY = "EFFECT"; - public static final String EFFECT_IMPACT_KEY = "EFFECT_IMPACT"; - public static final String EFFECT_EXTRA_INFORMATION_KEY = "EFFECT_EXTRA_INFORMATION"; - public static final String OLD_NEW_AA_KEY = "OLD_NEW_AA"; - public static final String OLD_NEW_CODON_KEY = "OLD_NEW_CODON"; - public static final String CODON_NUM_KEY = "CODON_NUM"; - public static final String CDS_SIZE_KEY = "CDS_SIZE"; + private static Logger logger = Logger.getLogger(SnpEff.class); + + // We refuse to parse SnpEff output files generated by unsupported versions, or + // lacking a SnpEff version number in the VCF header: + public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" }; + public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; + + // SnpEff aggregates all effects (and effect metadata) together into a single INFO + // field annotation with the key EFF: + public static final String SNPEFF_INFO_FIELD_KEY = "EFF"; + public static final String SNPEFF_EFFECT_METADATA_DELIMITER = "[()]"; + public static final String SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER = "\\|"; + + // Key names for the INFO field annotations we will add to each record, along + // with parsing-related information: + public enum InfoFieldKey { + EFF (-1), + EFF_IMPACT (0), + EFF_CODON_CHANGE (1), + EFF_AMINO_ACID_CHANGE (2), + EFF_GENE_NAME (3), + EFF_GENE_BIOTYPE (4), + EFF_TRANSCRIPT_ID (6), + EFF_EXON_ID (7); + + // Index within the effect metadata subfields from the SnpEff EFF annotation + // where each key's associated value can be found during parsing. + private final int fieldIndex; + + InfoFieldKey ( int fieldIndex ) { + this.fieldIndex = fieldIndex; + } + + public int getFieldIndex() { + return fieldIndex; + } + } + + // Possible SnpEff biological effects. All effect names found in the SnpEff input file + // are validated against this list. + public enum EffectType { + NONE, + CHROMOSOME, + INTERGENIC, + UPSTREAM, + UTR_5_PRIME, + UTR_5_DELETED, + START_GAINED, + SPLICE_SITE_ACCEPTOR, + SPLICE_SITE_DONOR, + START_LOST, + SYNONYMOUS_START, + NON_SYNONYMOUS_START, + CDS, + GENE, + TRANSCRIPT, + EXON, + EXON_DELETED, + NON_SYNONYMOUS_CODING, + SYNONYMOUS_CODING, + FRAME_SHIFT, + CODON_CHANGE, + CODON_INSERTION, + CODON_CHANGE_PLUS_CODON_INSERTION, + CODON_DELETION, + CODON_CHANGE_PLUS_CODON_DELETION, + STOP_GAINED, + SYNONYMOUS_STOP, + NON_SYNONYMOUS_STOP, + STOP_LOST, + INTRON, + UTR_3_PRIME, + UTR_3_DELETED, + DOWNSTREAM, + INTRON_CONSERVED, + INTERGENIC_CONSERVED, + REGULATION, + CUSTOM, + WITHIN_NON_CODING_GENE + } + + // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. + public enum EffectImpact { + LOW (1), + MODERATE (2), + HIGH (3); + + private final int severityRating; + + EffectImpact ( int severityRating ) { + this.severityRating = severityRating; + } + + public boolean isHigherImpactThan ( EffectImpact other ) { + return this.severityRating > other.severityRating; + } + } + + // SnpEff labels most effects as either CODING or NON_CODING, but sometimes omits this information. + public enum EffectCoding { + CODING, + NON_CODING, + UNKNOWN + } + + + public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { + validateRodBinding(walker.getSnpEffRodBinding()); + checkSnpEffVersion(walker, toolkit); + } public Map annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) { - RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); - validateRodBinding(snpEffRodBinding); + RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); - List features = tracker.getValues(snpEffRodBinding, ref.getLocus()); + // Get only SnpEff records that start at this locus, not merely span it: + List snpEffRecords = tracker.getValues(snpEffRodBinding, ref.getLocus()); - // Add only annotations for one of the most biologically-significant effects as defined in - // the SnpEffConstants class: - SnpEffFeature mostSignificantEffect = getMostSignificantEffect(features); - - if ( mostSignificantEffect == null ) { + // Within this set, look for a SnpEff record whose ref/alt alleles match the record to annotate. + // If there is more than one such record, we only need to pick the first one, since the biological + // effects will be the same across all such records: + VariantContext matchingRecord = getMatchingSnpEffRecord(snpEffRecords, vc); + if ( matchingRecord == null ) { return null; } - return generateAnnotations(mostSignificantEffect); + // Parse the SnpEff INFO field annotation from the matching record into individual effect objects: + List effects = parseSnpEffRecord(matchingRecord); + if ( effects.size() == 0 ) { + return null; + } + + // Add only annotations for one of the most biologically-significant effects from this set: + SnpEffEffect mostSignificantEffect = getMostSignificantEffect(effects); + return mostSignificantEffect.getAnnotations(); } - private void validateRodBinding ( RodBinding snpEffRodBinding ) { + private void validateRodBinding ( RodBinding snpEffRodBinding ) { if ( snpEffRodBinding == null || ! snpEffRodBinding.isBound() ) { - throw new UserException("The SnpEff annotator requires that a SnpEff output file be provided " + - "as a rodbinding on the command line, but no SnpEff rodbinding was found."); + throw new UserException("The SnpEff annotator requires that a SnpEff VCF output file be provided " + + "as a rodbinding on the command line via the --snpEffFile option, but " + + "no SnpEff rodbinding was found."); } } - private SnpEffFeature getMostSignificantEffect ( List snpEffFeatures ) { - SnpEffFeature mostSignificantEffect = null; + private void checkSnpEffVersion ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { + RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); - for ( SnpEffFeature snpEffFeature : snpEffFeatures ) { + VCFHeader snpEffVCFHeader = VCFUtils.getVCFHeadersFromRods(toolkit, Arrays.asList(snpEffRodBinding.getName())).get(snpEffRodBinding.getName()); + VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY); + + if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) { + throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " + + "input file, and so could not verify that the file was generated by a supported version of SnpEff (" + + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")"); + } + + String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0]; + + if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) { + throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " + + "is not currently supported by the GATK. Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS)); + } + } + + private boolean isSupportedSnpEffVersion ( String versionString ) { + for ( String supportedVersion : SUPPORTED_SNPEFF_VERSIONS ) { + if ( supportedVersion.equals(versionString) ) { + return true; + } + } + + return false; + } + + private VariantContext getMatchingSnpEffRecord ( List snpEffRecords, VariantContext vc ) { + for ( VariantContext snpEffRecord : snpEffRecords ) { + if ( snpEffRecord.hasSameAlternateAllelesAs(vc) && snpEffRecord.getReference().equals(vc.getReference()) ) { + return snpEffRecord; + } + } + + return null; + } + + private List parseSnpEffRecord ( VariantContext snpEffRecord ) { + List parsedEffects = new ArrayList(); + + Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY); + List individualEffects; + + // The VCF codec stores multi-valued fields as a List, and single-valued fields as a String. + // We can have either in the case of SnpEff, since there may be one or more than one effect in this record. + if ( effectFieldValue instanceof List ) { + individualEffects = (List)effectFieldValue; + } + else { + individualEffects = Arrays.asList((String)effectFieldValue); + } + + for ( String effectString : individualEffects ) { + String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER); + + if ( effectNameAndMetadata.length != 2 ) { + logger.warn(String.format("Malformed SnpEff effect field at %s:%d, skipping: %s", + snpEffRecord.getChr(), snpEffRecord.getStart(), effectString)); + continue; + } + + String effectName = effectNameAndMetadata[0]; + String[] effectMetadata = effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1); + + SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata); + + if ( parsedEffect.isWellFormed() ) { + parsedEffects.add(parsedEffect); + } + else { + logger.warn(String.format("Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"", + snpEffRecord.getChr(), snpEffRecord.getStart(), parsedEffect.getParseError(), effectString)); + } + } + + return parsedEffects; + } + + private SnpEffEffect getMostSignificantEffect ( List effects ) { + SnpEffEffect mostSignificantEffect = null; + + for ( SnpEffEffect effect : effects ) { if ( mostSignificantEffect == null || - snpEffFeature.isHigherImpactThan(mostSignificantEffect) ) { + effect.isHigherImpactThan(mostSignificantEffect) ) { - mostSignificantEffect = snpEffFeature; + mostSignificantEffect = effect; } } return mostSignificantEffect; } - private Map generateAnnotations ( SnpEffFeature mostSignificantEffect ) { - Map annotations = new LinkedHashMap(Utils.optimumHashSize(getKeyNames().size())); - - if ( mostSignificantEffect.hasGeneID() ) - annotations.put(GENE_ID_KEY, mostSignificantEffect.getGeneID()); - if ( mostSignificantEffect.hasGeneName() ) - annotations.put(GENE_NAME_KEY, mostSignificantEffect.getGeneName()); - if ( mostSignificantEffect.hasTranscriptID() ) - annotations.put(TRANSCRIPT_ID_KEY, mostSignificantEffect.getTranscriptID()); - if ( mostSignificantEffect.hasExonID() ) - annotations.put(EXON_ID_KEY, mostSignificantEffect.getExonID()); - if ( mostSignificantEffect.hasExonRank() ) - annotations.put(EXON_RANK_KEY, Integer.toString(mostSignificantEffect.getExonRank())); - if ( mostSignificantEffect.isNonCodingGene() ) - annotations.put(WITHIN_NON_CODING_GENE_KEY, null); - - annotations.put(EFFECT_KEY, mostSignificantEffect.getEffect().toString()); - annotations.put(EFFECT_IMPACT_KEY, mostSignificantEffect.getEffectImpact().toString()); - if ( mostSignificantEffect.hasEffectExtraInformation() ) - annotations.put(EFFECT_EXTRA_INFORMATION_KEY, mostSignificantEffect.getEffectExtraInformation()); - - if ( mostSignificantEffect.hasOldAndNewAA() ) - annotations.put(OLD_NEW_AA_KEY, mostSignificantEffect.getOldAndNewAA()); - if ( mostSignificantEffect.hasOldAndNewCodon() ) - annotations.put(OLD_NEW_CODON_KEY, mostSignificantEffect.getOldAndNewCodon()); - if ( mostSignificantEffect.hasCodonNum() ) - annotations.put(CODON_NUM_KEY, Integer.toString(mostSignificantEffect.getCodonNum())); - if ( mostSignificantEffect.hasCdsSize() ) - annotations.put(CDS_SIZE_KEY, Integer.toString(mostSignificantEffect.getCdsSize())); - - return annotations; - } - public List getKeyNames() { - return Arrays.asList( GENE_ID_KEY, - GENE_NAME_KEY, - TRANSCRIPT_ID_KEY, - EXON_ID_KEY, - EXON_RANK_KEY, - WITHIN_NON_CODING_GENE_KEY, - EFFECT_KEY, - EFFECT_IMPACT_KEY, - EFFECT_EXTRA_INFORMATION_KEY, - OLD_NEW_AA_KEY, - OLD_NEW_CODON_KEY, - CODON_NUM_KEY, - CDS_SIZE_KEY + return Arrays.asList( InfoFieldKey.EFF.toString(), + InfoFieldKey.EFF_IMPACT.toString(), + InfoFieldKey.EFF_CODON_CHANGE.toString(), + InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), + InfoFieldKey.EFF_GENE_NAME.toString(), + InfoFieldKey.EFF_GENE_BIOTYPE.toString(), + InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), + InfoFieldKey.EFF_EXON_ID.toString() ); } public List getDescriptions() { return Arrays.asList( - new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"), - new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), - new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())), - new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant") + new VCFInfoHeaderLine(InfoFieldKey.EFF.toString(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), + new VCFInfoHeaderLine(InfoFieldKey.EFF_IMPACT.toString(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), + new VCFInfoHeaderLine(InfoFieldKey.EFF_CODON_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_NAME.toString(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_BIOTYPE.toString(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.EFF_EXON_ID.toString(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") ); } + + /** + * Helper class to parse, validate, and store a single SnpEff effect and its metadata. + */ + protected static class SnpEffEffect { + private EffectType effect; + private EffectImpact impact; + private String codonChange; + private String aminoAcidChange; + private String geneName; + private String geneBiotype; + private EffectCoding coding; + private String transcriptID; + private String exonID; + + private String parseError = null; + private boolean isWellFormed = true; + + private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10; + + // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header, + // errors come after warnings, not vice versa: + private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1; + private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1; + + private static final int SNPEFF_CODING_FIELD_INDEX = 5; + + public SnpEffEffect ( String effectName, String[] effectMetadata ) { + parseEffectName(effectName); + parseEffectMetadata(effectMetadata); + } + + private void parseEffectName ( String effectName ) { + try { + effect = EffectType.valueOf(effectName); + } + catch ( IllegalArgumentException e ) { + parseError(String.format("%s is not a recognized effect type", effectName)); + } + } + + private void parseEffectMetadata ( String[] effectMetadata ) { + if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) { + if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) { + parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX])); + } + else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) { + parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX])); + } + else { + parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d", + EXPECTED_NUMBER_OF_METADATA_FIELDS, effectMetadata.length)); + } + + return; + } + + try { + impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]); + } + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()])); + } + + codonChange = effectMetadata[InfoFieldKey.EFF_CODON_CHANGE.getFieldIndex()]; + aminoAcidChange = effectMetadata[InfoFieldKey.EFF_AMINO_ACID_CHANGE.getFieldIndex()]; + geneName = effectMetadata[InfoFieldKey.EFF_GENE_NAME.getFieldIndex()]; + geneBiotype = effectMetadata[InfoFieldKey.EFF_GENE_BIOTYPE.getFieldIndex()]; + + if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) { + try { + coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]); + } + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect coding: %s", effectMetadata[SNPEFF_CODING_FIELD_INDEX])); + } + } + else { + coding = EffectCoding.UNKNOWN; + } + + transcriptID = effectMetadata[InfoFieldKey.EFF_TRANSCRIPT_ID.getFieldIndex()]; + exonID = effectMetadata[InfoFieldKey.EFF_EXON_ID.getFieldIndex()]; + } + + private void parseError ( String message ) { + isWellFormed = false; + + // Cache only the first error encountered: + if ( parseError == null ) { + parseError = message; + } + } + + public boolean isWellFormed() { + return isWellFormed; + } + + public String getParseError() { + return parseError == null ? "" : parseError; + } + + public boolean isCoding() { + return coding == EffectCoding.CODING; + } + + public boolean isHigherImpactThan ( SnpEffEffect other ) { + // If one effect is within a coding gene and the other is not, the effect that is + // within the coding gene has higher impact: + + if ( isCoding() && ! other.isCoding() ) { + return true; + } + else if ( ! isCoding() && other.isCoding() ) { + return false; + } + + // Otherwise, both effects are either in or not in a coding gene, so we compare the impacts + // of the effects themselves: + + return impact.isHigherImpactThan(other.impact); + } + + public Map getAnnotations() { + Map annotations = new LinkedHashMap(Utils.optimumHashSize(InfoFieldKey.values().length)); + + addAnnotation(annotations, InfoFieldKey.EFF.toString(), effect.toString()); + addAnnotation(annotations, InfoFieldKey.EFF_IMPACT.toString(), impact.toString()); + addAnnotation(annotations, InfoFieldKey.EFF_CODON_CHANGE.toString(), codonChange); + addAnnotation(annotations, InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), aminoAcidChange); + addAnnotation(annotations, InfoFieldKey.EFF_GENE_NAME.toString(), geneName); + addAnnotation(annotations, InfoFieldKey.EFF_GENE_BIOTYPE.toString(), geneBiotype); + addAnnotation(annotations, InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), transcriptID); + addAnnotation(annotations, InfoFieldKey.EFF_EXON_ID.toString(), exonID); + + return annotations; + } + + private void addAnnotation ( Map annotations, String keyName, String keyValue ) { + // Only add annotations for keys associated with non-empty values: + if ( keyValue != null && keyValue.trim().length() > 0 ) { + annotations.put(keyName, keyValue); + } + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 971727727..fb3dbc3cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -40,7 +40,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -93,8 +92,8 @@ public class VariantAnnotator extends RodWalker implements Ann * listed in the SnpEff output file for each variant. */ @Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false) - public RodBinding snpEffFile; - public RodBinding getSnpEffRodBinding() { return snpEffFile; } + public RodBinding snpEffFile; + public RodBinding getSnpEffRodBinding() { return snpEffFile; } /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. @@ -204,9 +203,9 @@ public class VariantAnnotator extends RodWalker implements Ann } if ( USE_ALL_ANNOTATIONS ) - engine = new VariantAnnotatorEngine(this); + engine = new VariantAnnotatorEngine(this, getToolkit()); else - engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this); + engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this, getToolkit()); engine.initializeExpressions(expressionsToUse); engine.invokeAnnotationInitializationMethods(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 17830f129..68cd07803 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -46,6 +47,7 @@ public class VariantAnnotatorEngine { private HashMap, String> dbAnnotations = new HashMap, String>(); private AnnotatorCompatibleWalker walker; + private GenomeAnalysisEngine toolkit; private static class VAExpression { @@ -71,16 +73,18 @@ public class VariantAnnotatorEngine { } // use this constructor if you want all possible annotations - public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker) { + public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) { this.walker = walker; + this.toolkit = toolkit; requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations(); requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations(); initializeDBs(); } // use this constructor if you want to select specific annotations (and/or interfaces) - public VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, AnnotatorCompatibleWalker walker) { + public VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) { this.walker = walker; + this.toolkit = toolkit; initializeAnnotations(annotationGroupsToUse, annotationsToUse); initializeDBs(); } @@ -112,11 +116,11 @@ public class VariantAnnotatorEngine { public void invokeAnnotationInitializationMethods() { for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) { - annotation.initialize(walker); + annotation.initialize(walker, toolkit); } for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) { - annotation.initialize(walker); + annotation.initialize(walker, toolkit); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java index 9dda57ae3..7200f841b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.List; @@ -10,8 +9,8 @@ public interface AnnotatorCompatibleWalker { // getter methods for various used bindings public abstract RodBinding getVariantRodBinding(); - public abstract RodBinding getSnpEffRodBinding(); + public abstract RodBinding getSnpEffRodBinding(); public abstract RodBinding getDbsnpRodBinding(); public abstract List> getCompRodBindings(); public abstract List> getResourceRodBindings(); -} \ No newline at end of file +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java index 9e48de9c3..160a3d258 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import java.util.List; @@ -34,5 +35,5 @@ public abstract class VariantAnnotatorAnnotation { public abstract List getKeyNames(); // initialization method (optional for subclasses, and therefore non-abstract) - public void initialize ( AnnotatorCompatibleWalker walker ) { } + public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 4ee2d5f44..428f97e2a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -128,7 +127,7 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; } public RodBinding getVariantRodBinding() { return null; } - public RodBinding getSnpEffRodBinding() { return null; } + public RodBinding getSnpEffRodBinding() { return null; } public List> getCompRodBindings() { return Collections.emptyList(); } public List> getResourceRodBindings() { return Collections.emptyList(); } @@ -211,7 +210,7 @@ public class UnifiedGenotyper extends LocusWalker - * This format has 23 tab-delimited fields: - * - *
- * Chromosome
- * Position
- * Reference
- * Change
- * Change Type: {SNP, MNP, INS, DEL}
- * Zygosity: {Hom, Het}
- * Quality
- * Coverage
- * Warnings
- * Gene ID
- * Gene Name
- * Bio Type
- * Transcript ID
- * Exon ID
- * Exon Rank
- * Effect
- * Old/New Amino Acid
- * Old/New Codon
- * Codon Num
- * CDS Size
- * Codons Around
- * Amino Acids Around
- * Custom Interval ID
- * 
- * Note that we treat all except the Chromosome, Position, and Effect fields as optional. - *

- * - *

- * See also: @see SNPEff project page - *

- * - * @author David Roazen - * @since 2011 - */ -public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec { - - public static final int EXPECTED_NUMBER_OF_FIELDS = 23; - public static final String FIELD_DELIMITER_PATTERN = "\\t"; - public static final String EFFECT_FIELD_DELIMITER_PATTERN = "[,:]"; - public static final String HEADER_LINE_START = "# "; - public static final String[] HEADER_FIELD_NAMES = { "Chromo", - "Position", - "Reference", - "Change", - "Change type", - "Homozygous", - "Quality", - "Coverage", - "Warnings", - "Gene_ID", - "Gene_name", - "Bio_type", - "Trancript_ID", // yes, this is how it's spelled in the SnpEff output - "Exon_ID", - "Exon_Rank", - "Effect", - "old_AA/new_AA", - "Old_codon/New_codon", - "Codon_Num(CDS)", - "CDS_size", - "Codons around", - "AAs around", - "Custom_interval_ID" - }; - - // The "Chromo", "Position", and "Effect" fields are required to be non-empty in every SnpEff output line: - public static final int[] REQUIRED_FIELDS = { 0, 1, 15 }; - - public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE"; - - - public Feature decodeLoc ( String line ) { - return decode(line); - } - - public Feature decode ( String line ) { - String[] tokens = line.split(FIELD_DELIMITER_PATTERN, -1); - - if ( tokens.length != EXPECTED_NUMBER_OF_FIELDS ) { - throw new TribbleException.InvalidDecodeLine("Line does not have the expected (" + EXPECTED_NUMBER_OF_FIELDS + - ") number of fields: found " + tokens.length + " fields.", line); - } - - try { - trimAllFields(tokens); - checkForRequiredFields(tokens, line); - - String contig = tokens[0]; - long position = Long.parseLong(tokens[1]); - - String reference = tokens[2].isEmpty() ? null : tokens[2]; - String change = tokens[3].isEmpty() ? null : tokens[3]; - ChangeType changeType = tokens[4].isEmpty() ? null : ChangeType.valueOf(tokens[4]); - Zygosity zygosity = tokens[5].isEmpty() ? null : Zygosity.valueOf(tokens[5]); - Double quality = tokens[6].isEmpty() ? null : Double.parseDouble(tokens[6]); - Long coverage = tokens[7].isEmpty() ? null : Long.parseLong(tokens[7]); - String warnings = tokens[8].isEmpty() ? null : tokens[8]; - String geneID = tokens[9].isEmpty() ? null : tokens[9]; - String geneName = tokens[10].isEmpty() ? null : tokens[10]; - String bioType = tokens[11].isEmpty() ? null : tokens[11]; - String transcriptID = tokens[12].isEmpty() ? null : tokens[12]; - String exonID = tokens[13].isEmpty() ? null : tokens[13]; - Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]); - - boolean isNonCodingGene = isNonCodingGene(tokens[15]); - - // Split the effect field into three subfields if the WITHIN_NON_CODING_GENE flag is present, - // otherwise split it into two subfields. We need this limit to prevent the extra effect-related information - // in the final field (when present) from being inappropriately tokenized: - - int effectFieldTokenLimit = isNonCodingGene ? 3 : 2; - String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit); - EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene); - String effectExtraInformation = parseEffectExtraInformation(effectFieldTokens, isNonCodingGene); - - String oldAndNewAA = tokens[16].isEmpty() ? null : tokens[16]; - String oldAndNewCodon = tokens[17].isEmpty() ? null : tokens[17]; - Integer codonNum = tokens[18].isEmpty() ? null : Integer.parseInt(tokens[18]); - Integer cdsSize = tokens[19].isEmpty() ? null : Integer.parseInt(tokens[19]); - String codonsAround = tokens[20].isEmpty() ? null : tokens[20]; - String aasAround = tokens[21].isEmpty() ? null : tokens[21]; - String customIntervalID = tokens[22].isEmpty() ? null : tokens[22]; - - return new SnpEffFeature(contig, position, reference, change, changeType, zygosity, quality, coverage, - warnings, geneID, geneName, bioType, transcriptID, exonID, exonRank, isNonCodingGene, - effect, effectExtraInformation, oldAndNewAA, oldAndNewCodon, codonNum, cdsSize, - codonsAround, aasAround, customIntervalID); - } - catch ( NumberFormatException e ) { - throw new TribbleException.InvalidDecodeLine("Error parsing a numeric field : " + e.getMessage(), line); - } - catch ( IllegalArgumentException e ) { - throw new TribbleException.InvalidDecodeLine("Illegal value in field: " + e.getMessage(), line); - } - } - - private void trimAllFields ( String[] tokens ) { - for ( int i = 0; i < tokens.length; i++ ) { - tokens[i] = tokens[i].trim(); - } - } - - private void checkForRequiredFields ( String[] tokens, String line ) { - for ( int requiredFieldIndex : REQUIRED_FIELDS ) { - if ( tokens[requiredFieldIndex].isEmpty() ) { - throw new TribbleException.InvalidDecodeLine("Line is missing required field \"" + - HEADER_FIELD_NAMES[requiredFieldIndex] + "\"", - line); - } - } - } - - private boolean isNonCodingGene ( String effectField ) { - return effectField.startsWith(NON_CODING_GENE_FLAG); - } - - private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) { - String effectName = ""; - - // If there's a WITHIN_NON_CODING_GENE flag, the effect name will be in the second subfield, - // otherwise it will be in the first subfield: - - if ( effectFieldTokens.length > 1 && isNonCodingGene ) { - effectName = effectFieldTokens[1].trim(); - } - else { - effectName = effectFieldTokens[0].trim(); - } - - return EffectType.valueOf(effectName); - } - - private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) { - - // The extra effect-related information, if present, will always be the last subfield: - - if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) { - return effectFieldTokens[effectFieldTokens.length - 1].trim(); - } - - return null; - } - - public Class getFeatureType() { - return SnpEffFeature.class; - } - - public Object readHeader ( LineReader reader ) { - String headerLine = ""; - - try { - headerLine = reader.readLine(); - } - catch ( IOException e ) { - throw new TribbleException("Unable to read header line from input file."); - } - - validateHeaderLine(headerLine); - return headerLine; - } - - private void validateHeaderLine ( String headerLine ) { - if ( headerLine == null || ! headerLine.startsWith(HEADER_LINE_START) ) { - throw new TribbleException.InvalidHeader("Header line does not start with " + HEADER_LINE_START); - } - - String[] headerTokens = headerLine.substring(HEADER_LINE_START.length()).split(FIELD_DELIMITER_PATTERN); - - if ( headerTokens.length != EXPECTED_NUMBER_OF_FIELDS ) { - throw new TribbleException.InvalidHeader("Header line does not contain headings for the expected number (" + - EXPECTED_NUMBER_OF_FIELDS + ") of columns."); - } - - for ( int columnIndex = 0; columnIndex < headerTokens.length; columnIndex++ ) { - if ( ! HEADER_FIELD_NAMES[columnIndex].equals(headerTokens[columnIndex]) ) { - throw new TribbleException.InvalidHeader("Header field #" + columnIndex + ": Expected \"" + - HEADER_FIELD_NAMES[columnIndex] + "\" but found \"" + - headerTokens[columnIndex] + "\""); - } - } - } - - public boolean canDecode ( final File potentialInput ) { - try { - LineReader reader = new AsciiLineReader(new FileInputStream(potentialInput)); - readHeader(reader); - } - catch ( Exception e ) { - return false; - } - - return true; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java deleted file mode 100644 index 270db470f..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.snpEff; - -/** - * A set of constants associated with the SnpEff codec. - * - * @author David Roazen - */ -public class SnpEffConstants { - - // Possible SnpEff biological effects and their associated impacts: - public enum EffectType { - START_GAINED (EffectImpact.HIGH), - START_LOST (EffectImpact.HIGH), - EXON_DELETED (EffectImpact.HIGH), - FRAME_SHIFT (EffectImpact.HIGH), - STOP_GAINED (EffectImpact.HIGH), - STOP_LOST (EffectImpact.HIGH), - SPLICE_SITE_ACCEPTOR (EffectImpact.HIGH), - SPLICE_SITE_DONOR (EffectImpact.HIGH), - - NON_SYNONYMOUS_CODING (EffectImpact.MODERATE), - UTR_5_DELETED (EffectImpact.MODERATE), - UTR_3_DELETED (EffectImpact.MODERATE), - CODON_INSERTION (EffectImpact.MODERATE), - CODON_CHANGE_PLUS_CODON_INSERTION (EffectImpact.MODERATE), - CODON_DELETION (EffectImpact.MODERATE), - CODON_CHANGE_PLUS_CODON_DELETION (EffectImpact.MODERATE), - - NONE (EffectImpact.LOW), - CHROMOSOME (EffectImpact.LOW), - INTERGENIC (EffectImpact.LOW), - UPSTREAM (EffectImpact.LOW), - UTR_5_PRIME (EffectImpact.LOW), - SYNONYMOUS_START (EffectImpact.LOW), - NON_SYNONYMOUS_START (EffectImpact.LOW), - CDS (EffectImpact.LOW), - GENE (EffectImpact.LOW), - TRANSCRIPT (EffectImpact.LOW), - EXON (EffectImpact.LOW), - SYNONYMOUS_CODING (EffectImpact.LOW), - CODON_CHANGE (EffectImpact.LOW), - SYNONYMOUS_STOP (EffectImpact.LOW), - NON_SYNONYMOUS_STOP (EffectImpact.LOW), - INTRON (EffectImpact.LOW), - UTR_3_PRIME (EffectImpact.LOW), - DOWNSTREAM (EffectImpact.LOW), - INTRON_CONSERVED (EffectImpact.LOW), - INTERGENIC_CONSERVED (EffectImpact.LOW), - CUSTOM (EffectImpact.LOW); - - private final EffectImpact impact; - - EffectType ( EffectImpact impact ) { - this.impact = impact; - } - - public EffectImpact getImpact() { - return impact; - } - } - - public enum EffectImpact { - LOW (1), - MODERATE (2), - HIGH (3); - - private final int severityRating; - - EffectImpact ( int severityRating ) { - this.severityRating = severityRating; - } - - public boolean isHigherImpactThan ( EffectImpact other ) { - return this.severityRating > other.severityRating; - } - } - - // The kinds of variants supported by the SnpEff output format: - public enum ChangeType { - SNP, - MNP, - INS, - DEL - } - - // Possible zygosities of SnpEff variants: - public enum Zygosity { - Hom, - Het - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java deleted file mode 100644 index 2f120b7d2..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java +++ /dev/null @@ -1,423 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.snpEff; - -import org.broad.tribble.Feature; - -import java.util.NoSuchElementException; - -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity; - -/** - * Feature returned by the SnpEff codec -- stores the parsed field values from a line of SnpEff output. - * - * Many fields are optional, and missing values are represented by nulls. You should always call the - * hasX() method before calling the corresponding getX() method. Required fields can never be null - * and do not have a hasX() method. - * - * @author David Roazen - */ -public class SnpEffFeature implements Feature { - - private String contig; // REQUIRED FIELD - private long position; // REQUIRED FIELD - private String reference; - private String change; - private ChangeType changeType; - private Zygosity zygosity; - private Double quality; - private Long coverage; - private String warnings; - private String geneID; - private String geneName; - private String bioType; - private String transcriptID; - private String exonID; - private Integer exonRank; - private boolean isNonCodingGene; // REQUIRED FIELD - private EffectType effect; // REQUIRED FIELD - private String effectExtraInformation; - private String oldAndNewAA; - private String oldAndNewCodon; - private Integer codonNum; - private Integer cdsSize; - private String codonsAround; - private String aasAround; - private String customIntervalID; - - public SnpEffFeature ( String contig, - long position, - String reference, - String change, - ChangeType changeType, - Zygosity zygosity, - Double quality, - Long coverage, - String warnings, - String geneID, - String geneName, - String bioType, - String transcriptID, - String exonID, - Integer exonRank, - boolean isNonCodingGene, - EffectType effect, - String effectExtraInformation, - String oldAndNewAA, - String oldAndNewCodon, - Integer codonNum, - Integer cdsSize, - String codonsAround, - String aasAround, - String customIntervalID ) { - - if ( contig == null || effect == null ) { - throw new IllegalArgumentException("contig and effect cannot be null, as they are required fields"); - } - - this.contig = contig; - this.position = position; - this.reference = reference; - this.change = change; - this.changeType = changeType; - this.zygosity = zygosity; - this.quality = quality; - this.coverage = coverage; - this.warnings = warnings; - this.geneID = geneID; - this.geneName = geneName; - this.bioType = bioType; - this.transcriptID = transcriptID; - this.exonID = exonID; - this.exonRank = exonRank; - this.isNonCodingGene = isNonCodingGene; - this.effect = effect; - this.effectExtraInformation = effectExtraInformation; - this.oldAndNewAA = oldAndNewAA; - this.oldAndNewCodon = oldAndNewCodon; - this.codonNum = codonNum; - this.cdsSize = cdsSize; - this.codonsAround = codonsAround; - this.aasAround = aasAround; - this.customIntervalID = customIntervalID; - } - - public boolean isHigherImpactThan ( SnpEffFeature other ) { - - // If one effect is in a non-coding gene and the other is not, the effect NOT in the - // non-coding gene has higher impact: - - if ( ! isNonCodingGene() && other.isNonCodingGene() ) { - return true; - } - else if ( isNonCodingGene() && ! other.isNonCodingGene() ) { - return false; - } - - // Otherwise, both effects are either in or not in a non-coding gene, so we compare the impacts - // of the effects themselves as defined in the SnpEffConstants class: - - return getEffectImpact().isHigherImpactThan(other.getEffectImpact()); - } - - public String getChr() { - return contig; - } - - public int getStart() { - return (int)position; - } - - public int getEnd() { - return (int)position; - } - - public boolean hasReference() { - return reference != null; - } - - public String getReference() { - if ( reference == null ) throw new NoSuchElementException("This feature has no reference field"); - return reference; - } - - public boolean hasChange() { - return change != null; - } - - public String getChange() { - if ( change == null ) throw new NoSuchElementException("This feature has no change field"); - return change; - } - - public boolean hasChangeType() { - return changeType != null; - } - - public ChangeType getChangeType() { - if ( changeType == null ) throw new NoSuchElementException("This feature has no changeType field"); - return changeType; - } - - public boolean hasZygosity() { - return zygosity != null; - } - - public Zygosity getZygosity() { - if ( zygosity == null ) throw new NoSuchElementException("This feature has no zygosity field"); - return zygosity; - } - - public boolean hasQuality() { - return quality != null; - } - - public Double getQuality() { - if ( quality == null ) throw new NoSuchElementException("This feature has no quality field"); - return quality; - } - - public boolean hasCoverage() { - return coverage != null; - } - - public Long getCoverage() { - if ( coverage == null ) throw new NoSuchElementException("This feature has no coverage field"); - return coverage; - } - - public boolean hasWarnings() { - return warnings != null; - } - - public String getWarnings() { - if ( warnings == null ) throw new NoSuchElementException("This feature has no warnings field"); - return warnings; - } - - public boolean hasGeneID() { - return geneID != null; - } - - public String getGeneID() { - if ( geneID == null ) throw new NoSuchElementException("This feature has no geneID field"); - return geneID; - } - - public boolean hasGeneName() { - return geneName != null; - } - - public String getGeneName() { - if ( geneName == null ) throw new NoSuchElementException("This feature has no geneName field"); - return geneName; - } - - public boolean hasBioType() { - return bioType != null; - } - - public String getBioType() { - if ( bioType == null ) throw new NoSuchElementException("This feature has no bioType field"); - return bioType; - } - - public boolean hasTranscriptID() { - return transcriptID != null; - } - - public String getTranscriptID() { - if ( transcriptID == null ) throw new NoSuchElementException("This feature has no transcriptID field"); - return transcriptID; - } - - public boolean hasExonID() { - return exonID != null; - } - - public String getExonID() { - if ( exonID == null ) throw new NoSuchElementException("This feature has no exonID field"); - return exonID; - } - - public boolean hasExonRank() { - return exonRank != null; - } - - public Integer getExonRank() { - if ( exonRank == null ) throw new NoSuchElementException("This feature has no exonRank field"); - return exonRank; - } - - public boolean isNonCodingGene() { - return isNonCodingGene; - } - - public EffectType getEffect() { - return effect; - } - - public EffectImpact getEffectImpact() { - return effect.getImpact(); - } - - public boolean hasEffectExtraInformation() { - return effectExtraInformation != null; - } - - public String getEffectExtraInformation() { - if ( effectExtraInformation == null ) throw new NoSuchElementException("This feature has no effectExtraInformation field"); - return effectExtraInformation; - } - - public boolean hasOldAndNewAA() { - return oldAndNewAA != null; - } - - public String getOldAndNewAA() { - if ( oldAndNewAA == null ) throw new NoSuchElementException("This feature has no oldAndNewAA field"); - return oldAndNewAA; - } - - public boolean hasOldAndNewCodon() { - return oldAndNewCodon != null; - } - - public String getOldAndNewCodon() { - if ( oldAndNewCodon == null ) throw new NoSuchElementException("This feature has no oldAndNewCodon field"); - return oldAndNewCodon; - } - - public boolean hasCodonNum() { - return codonNum != null; - } - - public Integer getCodonNum() { - if ( codonNum == null ) throw new NoSuchElementException("This feature has no codonNum field"); - return codonNum; - } - - public boolean hasCdsSize() { - return cdsSize != null; - } - - public Integer getCdsSize() { - if ( cdsSize == null ) throw new NoSuchElementException("This feature has no cdsSize field"); - return cdsSize; - } - - public boolean hasCodonsAround() { - return codonsAround != null; - } - - public String getCodonsAround() { - if ( codonsAround == null ) throw new NoSuchElementException("This feature has no codonsAround field"); - return codonsAround; - } - - public boolean hadAasAround() { - return aasAround != null; - } - - public String getAasAround() { - if ( aasAround == null ) throw new NoSuchElementException("This feature has no aasAround field"); - return aasAround; - } - - public boolean hasCustomIntervalID() { - return customIntervalID != null; - } - - public String getCustomIntervalID() { - if ( customIntervalID == null ) throw new NoSuchElementException("This feature has no customIntervalID field"); - return customIntervalID; - } - - public boolean equals ( Object o ) { - if ( o == null || ! (o instanceof SnpEffFeature) ) { - return false; - } - - SnpEffFeature other = (SnpEffFeature)o; - - return contig.equals(other.contig) && - position == other.position && - (reference == null ? other.reference == null : reference.equals(other.reference)) && - (change == null ? other.change == null : change.equals(other.change)) && - changeType == other.changeType && - zygosity == other.zygosity && - (quality == null ? other.quality == null : quality.equals(other.quality)) && - (coverage == null ? other.coverage == null : coverage.equals(other.coverage)) && - (warnings == null ? other.warnings == null : warnings.equals(other.warnings)) && - (geneID == null ? other.geneID == null : geneID.equals(other.geneID)) && - (geneName == null ? other.geneName == null : geneName.equals(other.geneName)) && - (bioType == null ? other.bioType == null : bioType.equals(other.bioType)) && - (transcriptID == null ? other.transcriptID == null : transcriptID.equals(other.transcriptID)) && - (exonID == null ? other.exonID == null : exonID.equals(other.exonID)) && - (exonRank == null ? other.exonRank == null : exonRank.equals(other.exonRank)) && - isNonCodingGene == other.isNonCodingGene && - effect == other.effect && - (effectExtraInformation == null ? other.effectExtraInformation == null : effectExtraInformation.equals(other.effectExtraInformation)) && - (oldAndNewAA == null ? other.oldAndNewAA == null : oldAndNewAA.equals(other.oldAndNewAA)) && - (oldAndNewCodon == null ? other.oldAndNewCodon == null : oldAndNewCodon.equals(other.oldAndNewCodon)) && - (codonNum == null ? other.codonNum == null : codonNum.equals(other.codonNum)) && - (cdsSize == null ? other.cdsSize == null : cdsSize.equals(other.cdsSize)) && - (codonsAround == null ? other.codonsAround == null : codonsAround.equals(other.codonsAround)) && - (aasAround == null ? other.aasAround == null : aasAround.equals(other.aasAround)) && - (customIntervalID == null ? other.customIntervalID == null : customIntervalID.equals(other.customIntervalID)); - } - - public String toString() { - return "[Contig: " + contig + - " Position: " + position + - " Reference: " + reference + - " Change: " + change + - " Change Type: " + changeType + - " Zygosity: " + zygosity + - " Quality: " + quality + - " Coverage: " + coverage + - " Warnings: " + warnings + - " Gene ID: " + geneID + - " Gene Name: " + geneName + - " Bio Type: " + bioType + - " Transcript ID: " + transcriptID + - " Exon ID: " + exonID + - " Exon Rank: " + exonRank + - " Non-Coding Gene: " + isNonCodingGene + - " Effect: " + effect + - " Effect Extra Information: " + effectExtraInformation + - " Old/New AA: " + oldAndNewAA + - " Old/New Codon: " + oldAndNewCodon + - " Codon Num: " + codonNum + - " CDS Size: " + cdsSize + - " Codons Around: " + codonsAround + - " AAs Around: " + aasAround + - " Custom Interval ID: " + customIntervalID + - "]"; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index eb01e5dca..fd1c74993 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -24,6 +24,7 @@ public class VCFHeader { private final Set mMetaData; private final Map mInfoMetaData = new HashMap(); private final Map mFormatMetaData = new HashMap(); + private final Map mOtherMetaData = new HashMap(); // the list of auxillary tags private final Set mGenotypeSampleNames = new LinkedHashSet(); @@ -110,6 +111,9 @@ public class VCFHeader { VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; mFormatMetaData.put(formatLine.getName(), formatLine); } + else { + mOtherMetaData.put(line.getKey(), line); + } } } @@ -185,6 +189,14 @@ public class VCFHeader { public VCFFormatHeaderLine getFormatHeaderLine(String key) { return mFormatMetaData.get(key); } + + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFHeaderLine getOtherHeaderLine(String key) { + return mOtherMetaData.get(key); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 1c65102ae..cfd59b504 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -817,6 +817,28 @@ public class VariantContext implements Feature { // to enable tribble intergrati throw new IllegalArgumentException("Requested " + i + " alternative allele but there are only " + n + " alternative alleles " + this); } + /** + * @param other VariantContext whose alternate alleles to compare against + * @return true if this VariantContext has the same alternate alleles as other, + * regardless of ordering. Otherwise returns false. + */ + public boolean hasSameAlternateAllelesAs ( VariantContext other ) { + Set thisAlternateAlleles = getAlternateAlleles(); + Set otherAlternateAlleles = other.getAlternateAlleles(); + + if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) { + return false; + } + + for ( Allele allele : thisAlternateAlleles ) { + if ( ! otherAlternateAlleles.contains(allele) ) { + return false; + } + } + + return true; + } + // --------------------------------------------------------------------------------------------------------- // // Working with genotypes diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java new file mode 100644 index 000000000..462abeba1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.testng.Assert; +import org.testng.annotations.Test; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff.SnpEffEffect; + +public class SnpEffUnitTest { + + @Test + public void testParseWellFormedEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( effect.isWellFormed() && effect.isCoding() ); + } + + @Test + public void testParseInvalidEffectNameEffect() { + String effectName = "MADE_UP_EFFECT"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseInvalidEffectImpactEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseWrongNumberOfMetadataFieldsEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseSnpEffWarningEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") ); + } + + @Test + public void testParseSnpEffErrorEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") ); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 832079807..f902ce276 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.util.Arrays; @@ -129,12 +130,24 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testSnpEffAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " -NO_HEADER -o %s -A SnpEff --variant " + - validationDataLocation + "1000G.exomes.vcf --snpEffFile " + validationDataLocation + - "snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out -L 1:26,000,000-26,500,000", + "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + + "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000", 1, - Arrays.asList("03eae1dab19a9358250890594bf53607") + Arrays.asList("a1c3ba9efc28ee0606339604095076ea") ); executeTest("Testing SnpEff annotations", spec); } + + @Test + public void testSnpEffAnnotationsUnsupportedVersion() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + + "snpEff.AFR.unfiltered.unsupported.version.vcf -L 1:1-1,500,000", + 1, + UserException.class + ); + executeTest("Testing SnpEff annotations (unsupported version)", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index e992684bc..d8f7ad3b6 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -6,7 +6,7 @@ import org.testng.annotations.Test; import java.util.Arrays; public class VariantEvalIntegrationTest extends WalkerTest { - private static String variantEvalTestDataRoot = validationDataLocation + "/VariantEval"; + private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf"; private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.HG00625.vcf"; @@ -14,6 +14,27 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String cmdRoot = "-T VariantEval" + " -R " + b36KGReference; + @Test + public void testFunctionClassWithSnpeff() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-noEV", + "-EV TiTvVariantEvaluator", + "-noST", + "-ST FunctionalClass", + "-BTI eval", + "-o %s" + ), + 1, + Arrays.asList("f5f811ceb973d7fd6c1b2b734f1b2b12") + ); + executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec); + } + @Test public void testStratifySamplesAndExcludeMonomorphicSites() { WalkerTestSpec spec = new WalkerTestSpec( @@ -21,7 +42,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-T VariantEval", "-R " + b37KGReference, "--dbsnp " + b37dbSNP132, - "--eval " + variantEvalTestDataRoot + "/CEU.trio.callsForVE.vcf", + "--eval " + variantEvalTestDataRoot + "CEU.trio.callsForVE.vcf", "-noEV", "-EV TiTvVariantEvaluator", "-ST Sample", diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java deleted file mode 100644 index 6d492565b..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.snpEff; - -import org.apache.commons.io.input.ReaderInputStream; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.LineReader; -import org.testng.Assert; -import org.testng.annotations.Test; - -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity; - -import java.io.StringReader; - -public class SnpEffCodecUnitTest { - - @Test - public void testParseWellFormedSnpEffHeaderLine() { - String wellFormedSnpEffHeaderLine = "# Chromo\tPosition\tReference\tChange\tChange type\t" + - "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" + - "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" + - "AAs around\tCustom_interval_ID"; - - SnpEffCodec codec = new SnpEffCodec(); - LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(wellFormedSnpEffHeaderLine))); - String headerReturned = (String)codec.readHeader(reader); - - Assert.assertEquals(headerReturned, wellFormedSnpEffHeaderLine); - } - - @Test(expectedExceptions = TribbleException.InvalidHeader.class) - public void testParseWrongNumberOfFieldsSnpEffHeaderLine() { - String wrongNumberOfFieldsSnpEffHeaderLine = "# Chromo\tPosition\tReference\tChange\tChange type\t" + - "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" + - "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" + - "AAs around"; - - SnpEffCodec codec = new SnpEffCodec(); - LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(wrongNumberOfFieldsSnpEffHeaderLine))); - codec.readHeader(reader); - } - - @Test(expectedExceptions = TribbleException.InvalidHeader.class) - public void testParseMisnamedColumnSnpEffHeaderLine() { - String misnamedColumnSnpEffHeaderLine = "# Chromo\tPosition\tRef\tChange\tChange type\t" + - "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" + - "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" + - "AAs around\tCustom_interval_ID"; - - SnpEffCodec codec = new SnpEffCodec(); - LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(misnamedColumnSnpEffHeaderLine))); - codec.readHeader(reader); - } - - @Test - public void testParseSimpleEffectSnpEffLine() { - String simpleEffectSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\t918\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 69428l, - "T", - "G", - ChangeType.SNP, - Zygosity.Hom, - 6049.69, - 61573l, - null, - "ENSG00000177693", - "OR4F5", - "mRNA", - "ENST00000326183", - "exon_1_69055_70108", - 1, - false, - EffectType.NON_SYNONYMOUS_CODING, - null, - "F/C", - "TTT/TGT", - 113, - 918, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(simpleEffectSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test - public void testParseNonCodingRegionSnpEffLine() { - String nonCodingRegionSnpEffLine = "1\t1337592\tG\tC\tSNP\tHom\t1935.52\t21885\t\tENSG00000250188\t" + - "RP4-758J18.5\tmRNA\tENST00000514958\texon_1_1337454_1338076\t2\tWITHIN_NON_CODING_GENE, NON_SYNONYMOUS_CODING\t" + - "L/V\tCTA/GTA\t272\t952\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 1337592l, - "G", - "C", - ChangeType.SNP, - Zygosity.Hom, - 1935.52, - 21885l, - null, - "ENSG00000250188", - "RP4-758J18.5", - "mRNA", - "ENST00000514958", - "exon_1_1337454_1338076", - 2, - true, - EffectType.NON_SYNONYMOUS_CODING, - null, - "L/V", - "CTA/GTA", - 272, - 952, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(nonCodingRegionSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test - public void testParseExtraEffectInformationSnpEffLine() { - String extraEffectInformationSnpEffLine = "1\t879537\tT\tC\tSNP\tHom\t341.58\t13733\t\tENSG00000187634\tSAMD11\t" + - "mRNA\tENST00000341065\t\t\tUTR_3_PRIME: 4 bases from transcript end\t\t\t\t\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 879537l, - "T", - "C", - ChangeType.SNP, - Zygosity.Hom, - 341.58, - 13733l, - null, - "ENSG00000187634", - "SAMD11", - "mRNA", - "ENST00000341065", - null, - null, - false, - EffectType.UTR_3_PRIME, - "4 bases from transcript end", - null, - null, - null, - null, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(extraEffectInformationSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test - public void testParseMultiEffectSnpEffLine() { - String multiEffectSnpEffLine = "1\t901901\tC\tT\tSNP\tHom\t162.91\t4646\t\tENSG00000187583\tPLEKHN1\tmRNA\t" + - "ENST00000379410\texon_1_901877_901994\t1\tSTART_GAINED: ATG, UTR_5_PRIME: 11 bases from TSS\t\t\t\t\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 901901l, - "C", - "T", - ChangeType.SNP, - Zygosity.Hom, - 162.91, - 4646l, - null, - "ENSG00000187583", - "PLEKHN1", - "mRNA", - "ENST00000379410", - "exon_1_901877_901994", - 1, - false, - EffectType.START_GAINED, - "ATG, UTR_5_PRIME: 11 bases from TSS", - null, - null, - null, - null, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(multiEffectSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class) - public void testParseWrongNumberOfFieldsSnpEffLine() { - String wrongNumberOfFieldsSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\t918\t\t"; - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(wrongNumberOfFieldsSnpEffLine); - } - - @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class) - public void testParseBlankEffectFieldSnpEffLine() { - String blankEffectFieldSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\t\tF/C\tTTT/TGT\t113\t918\t\t\t"; - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(blankEffectFieldSnpEffLine); - } - - @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class) - public void testParseInvalidNumericFieldSnpEffLine() { - String invalidNumericFieldSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\tfoo\t\t\t";; - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(invalidNumericFieldSnpEffLine); - } -}