diff --git a/ivy.xml b/ivy.xml index 96c1de844..ee24bc367 100644 --- a/ivy.xml +++ b/ivy.xml @@ -76,7 +76,7 @@ - + diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f8e87aa58..2ceb4ab46 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -480,7 +480,7 @@ public class GenomeAnalysisEngine { } } else if (walker instanceof ReadPairWalker) { if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file."); + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); if(intervals != null && !intervals.isEmpty()) throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index b39fdd79d..a14d999ea 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.samples; import org.broadinstitute.sting.utils.exceptions.UserException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Map; @@ -110,6 +111,17 @@ public class Sample implements Comparable { // implements java.io.Serial return infoDB.getSample(paternalID); } + public ArrayList getParents(){ + ArrayList parents = new ArrayList(2); + Sample parent = getMother(); + if(parent != null) + parents.add(parent); + parent = getFather(); + if(parent != null) + parents.add(parent); + return parents; + } + /** * Get gender of the sample * @return property of key "gender" - must be of type Gender diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 8098de5b1..ab38b69cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -49,5 +49,5 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno public List getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Filtered Depth")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 85977bf8e..1956dac6c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // We refuse to parse SnpEff output files generated by unsupported versions, or // lacking a SnpEff version number in the VCF header: - public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" }; + public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" }; public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd"; @@ -77,13 +77,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public enum InfoFieldKey { EFFECT_KEY ("SNPEFF_EFFECT", -1), IMPACT_KEY ("SNPEFF_IMPACT", 0), - CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1), - AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2), - GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3), - GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4), - TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6), - EXON_ID_KEY ("SNPEFF_EXON_ID", 7), - FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", -1); + FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", 1), + CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 2), + AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 3), + GENE_NAME_KEY ("SNPEFF_GENE_NAME", 4), + GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 5), + TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 7), + EXON_ID_KEY ("SNPEFF_EXON_ID", 8); // Actual text of the key private final String keyName; @@ -110,70 +110,53 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // are validated against this list. public enum EffectType { // High-impact effects: - FRAME_SHIFT (EffectFunctionalClass.NONE, false), - STOP_GAINED (EffectFunctionalClass.NONSENSE, false), - START_LOST (EffectFunctionalClass.NONE, false), - SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false), - SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false), - EXON_DELETED (EffectFunctionalClass.NONE, false), - STOP_LOST (EffectFunctionalClass.NONE, false), + SPLICE_SITE_ACCEPTOR, + SPLICE_SITE_DONOR, + START_LOST, + EXON_DELETED, + FRAME_SHIFT, + STOP_GAINED, + STOP_LOST, // Moderate-impact effects: - NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false), - CODON_CHANGE (EffectFunctionalClass.NONE, false), - CODON_INSERTION (EffectFunctionalClass.NONE, false), - CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false), - CODON_DELETION (EffectFunctionalClass.NONE, false), - CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false), - UTR_5_DELETED (EffectFunctionalClass.NONE, false), - UTR_3_DELETED (EffectFunctionalClass.NONE, false), + NON_SYNONYMOUS_CODING, + CODON_CHANGE, + CODON_INSERTION, + CODON_CHANGE_PLUS_CODON_INSERTION, + CODON_DELETION, + CODON_CHANGE_PLUS_CODON_DELETION, + UTR_5_DELETED, + UTR_3_DELETED, // Low-impact effects: - SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false), - SYNONYMOUS_START (EffectFunctionalClass.SILENT, false), - NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false), - SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false), - NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false), - START_GAINED (EffectFunctionalClass.NONE, false), + SYNONYMOUS_START, + NON_SYNONYMOUS_START, + START_GAINED, + SYNONYMOUS_CODING, + SYNONYMOUS_STOP, + NON_SYNONYMOUS_STOP, // Modifiers: - NONE (EffectFunctionalClass.NONE, true), - CHROMOSOME (EffectFunctionalClass.NONE, true), - INTERGENIC (EffectFunctionalClass.NONE, true), - UPSTREAM (EffectFunctionalClass.NONE, true), - UTR_5_PRIME (EffectFunctionalClass.NONE, true), - CDS (EffectFunctionalClass.NONE, true), - GENE (EffectFunctionalClass.NONE, true), - TRANSCRIPT (EffectFunctionalClass.NONE, true), - EXON (EffectFunctionalClass.NONE, true), - INTRON (EffectFunctionalClass.NONE, true), - UTR_3_PRIME (EffectFunctionalClass.NONE, true), - DOWNSTREAM (EffectFunctionalClass.NONE, true), - INTRON_CONSERVED (EffectFunctionalClass.NONE, true), - INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true), - REGULATION (EffectFunctionalClass.NONE, true), - CUSTOM (EffectFunctionalClass.NONE, true), - WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true); - - private final EffectFunctionalClass functionalClass; - private final boolean isModifier; - - EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) { - this.functionalClass = functionalClass; - this.isModifier = isModifier; - } - - public EffectFunctionalClass getFunctionalClass() { - return functionalClass; - } - - public boolean isModifier() { - return isModifier; - } + NONE, + CHROMOSOME, + CUSTOM, + CDS, + GENE, + TRANSCRIPT, + EXON, + INTRON_CONSERVED, + UTR_5_PRIME, + UTR_3_PRIME, + DOWNSTREAM, + INTRAGENIC, + INTERGENIC, + INTERGENIC_CONSERVED, + UPSTREAM, + REGULATION, + INTRON } - // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of - // classifying some of the LOW impact effects as MODIFIERs. + // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact, or as a MODIFIER. public enum EffectImpact { MODIFIER (0), LOW (1), @@ -202,7 +185,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio UNKNOWN } - // We assign a functional class to each SnpEff effect. + // SnpEff assigns a functional class to each effect. public enum EffectFunctionalClass { NONE (0), SILENT (1), @@ -379,13 +362,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public List getKeyNames() { return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(), InfoFieldKey.IMPACT_KEY.getKeyName(), + InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), InfoFieldKey.GENE_NAME_KEY.getKeyName(), InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), - InfoFieldKey.EXON_ID_KEY.getKeyName(), - InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName() + InfoFieldKey.EXON_ID_KEY.getKeyName() ); } @@ -393,13 +376,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return Arrays.asList( new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), + new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())), new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant (in HGVS style)"), new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())) + new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") ); } @@ -409,6 +392,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio protected static class SnpEffEffect { private EffectType effect; private EffectImpact impact; + private EffectFunctionalClass functionalClass; private String codonChange; private String aminoAcidChange; private String geneName; @@ -420,16 +404,21 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio private String parseError = null; private boolean isWellFormed = true; - private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8; - private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9; - private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10; + private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 9; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR = 10; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR = 11; - // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header, - // errors come after warnings, not vice versa: - private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1; - private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1; + // If there is either a warning OR an error, it will be in the last field. If there is both + // a warning AND an error, the warning will be in the second-to-last field, and the error will + // be in the last field. + private static final int SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR - 1; + private static final int SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 2; + private static final int SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 1; - private static final int SNPEFF_CODING_FIELD_INDEX = 5; + // Position of the field indicating whether the effect is coding or non-coding. This field is used + // in selecting the most significant effect, but is not included in the annotations we return + // since it can be deduced from the SNPEFF_GENE_BIOTYPE field. + private static final int SNPEFF_CODING_FIELD_INDEX = 6; public SnpEffEffect ( String effectName, String[] effectMetadata ) { parseEffectName(effectName); @@ -447,11 +436,14 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio private void parseEffectMetadata ( String[] effectMetadata ) { if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) { - if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) { - parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX])); + if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR ) { + parseError(String.format("SnpEff issued the following warning or error: \"%s\"", + effectMetadata[SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR])); } - else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) { - parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX])); + else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR ) { + parseError(String.format("SnpEff issued the following warning: \"%s\", and the following error: \"%s\"", + effectMetadata[SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR], + effectMetadata[SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR])); } else { parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d", @@ -461,23 +453,33 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return; } - if ( effect != null && effect.isModifier() ) { - impact = EffectImpact.MODIFIER; + // The impact field will never be empty, and should always contain one of the enumerated values: + try { + impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); } - else { + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + } + + // The functional class field will be empty when the effect has no functional class associated with it: + if ( effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()].trim().length() > 0 ) { try { - impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); + functionalClass = EffectFunctionalClass.valueOf(effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]); } catch ( IllegalArgumentException e ) { - parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + parseError(String.format("Unrecognized value for effect functional class: %s", effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()])); } } + else { + functionalClass = EffectFunctionalClass.NONE; + } codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()]; aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()]; geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()]; geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()]; + // The coding field will be empty when SnpEff has no coding info for the effect: if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) { try { coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]); @@ -534,7 +536,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return true; } else if ( impact.isSameImpactAs(other.impact) ) { - return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass()); + return functionalClass.isHigherPriorityThan(other.functionalClass); } return false; @@ -545,13 +547,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString()); addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString()); + addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), functionalClass.toString()); addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange); addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange); addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName); addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype); addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID); addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID); - addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString()); return annotations; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 489e963e8..74c55dbfe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Map; @@ -83,8 +81,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { * @param priors priors to use for GLs * @param GLs hash of sample->GL to fill in * @param alternateAlleleToUse the alternate allele to use, null if not set - * - * @param useBAQedPileup + * @param useBAQedPileup should we use the BAQed pileup or the raw one? * @return genotype likelihoods per sample for AA, AB, BB */ public abstract Allele getLikelihoods(RefMetaDataTracker tracker, @@ -93,13 +90,14 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { AlignmentContextUtils.ReadOrientation contextType, GenotypePriors priors, Map GLs, - Allele alternateAlleleToUse, boolean useBAQedPileup); + Allele alternateAlleleToUse, + boolean useBAQedPileup); protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; for ( PileupElement p : pileup ) { if ( BaseUtils.isRegularBase( p.getBase() ) ) - count++; + count += p.getRepresentativeCount(); } return count; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index bdd4e2c65..369c2d0c6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -258,7 +258,7 @@ public class UnifiedGenotyper extends LocusWalker result = new HashSet(); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); - result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); + result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); return result; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 3eedc2a28..847165e3e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -7,38 +7,83 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import java.io.File; -import java.io.FileNotFoundException; +import java.io.PrintStream; import java.util.*; /** - * Phases a trio VCF (child phased by transmission, implied phase carried over to parents). Given genotypes for a trio, - * this walker modifies the genotypes (if necessary) to reflect the most likely configuration given the genotype - * likelihoods and inheritance constraints, phases child by transmission and carries over implied phase to the parents - * (their alleles in their genotypes are ordered as transmitted|untransmitted). Computes probability that the - * determined phase is correct given that the genotype configuration is correct (useful if you want to use this to - * compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally - * filters out sites where the phasing is indeterminate (site has no-calls), ambiguous (everyone is heterozygous), or - * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to - * begin. + * Computes the most likely genotype combination and phases trios and parent/child pairs + * + *

+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases + * all sites were parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability. + * Ambiguous sites are: + *

    + *
  • Sites where all individuals are heterozygous
  • + *
  • Sites where there is a Mendelian violation
  • + *
+ * Missing genotypes are handled as follows: + *
    + *
  • In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
  • + *
  • In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
  • + *
  • In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probabilitt is emitted.
  • + *
  • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
  • + *
+ * + *

Input

+ *

+ *

    + *
  • A VCF variant set containing trio(s) and/or parent/child pair(s).
  • + *
  • A PED pedigree file containing the description of the individuals relationships.
  • + *
+ *

+ * + *

Options

+ *

+ *

    + *
  • MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype + * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability, + * and each individual genotype, depth, allelic depth and likelihoods.
  • + *
  • DeNovoPrior: Mutation prio; default is 1e-8
  • + *
+ *

+ * + *

Output

+ *

+ * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous.. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PhaseByTransmission \
+ *   -V input.vcf \
+ *   -ped input.ped \
+ *   -o output.vcf
+ * 
+ * */ -public class PhaseByTransmission extends RodWalker { +public class PhaseByTransmission extends RodWalker, HashMap> { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). Specify several trios by supplying this argument many times and/or a file containing many patterns.") - public ArrayList familySpecs = null; + @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") + private PrintStream mvFile = null; + + @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8") + private double deNovoPrior=1e-8; @Output protected VCFWriter vcfWriter = null; @@ -46,241 +91,633 @@ public class PhaseByTransmission extends RodWalker { private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; private final String SOURCE_NAME = "PhaseByTransmission"; - private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8; + public final double NO_TRANSMISSION_PROB = -1.0; - private class Trio { - private String mother; - private String father; - private String child; + private ArrayList trios = new ArrayList(); - public Trio(String mother, String father, String child) { - this.mother = mother; - this.father = father; - this.child = child; - } + //Matrix of priors for all genotype combinations + private EnumMap>> mvCountMatrix; - public Trio(String familySpec) { - String[] pieces = familySpec.split("[\\+\\=]"); + //Matrix of allele transmission + private EnumMap>> transmissionMatrix; - this.mother = pieces[0]; - this.father = pieces[1]; - this.child = pieces[2]; - } + //Metrics counters hash keys + private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; + private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; + private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; + private final Byte NUM_TRIO_HET_HET_HET = 3; + private final Byte NUM_TRIO_VIOLATIONS = 4; + private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10; + private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; + private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; + private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; + private final Byte NUM_PAIR_HET_HET = 8; + private final Byte NUM_PAIR_VIOLATIONS = 9; + private final Byte NUM_GENOTYPES_MODIFIED = 11; - public String getMother() { return mother; } - public String getFather() { return father; } - public String getChild() { return child; } + //Random number generator + private Random rand = new Random(); + + private enum FamilyMember { + MOTHER, + FATHER, + CHILD } - private ArrayList trios = new ArrayList(); + //Stores a conceptual trio or parent/child pair genotype combination along with its phasing. + //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method. + private class TrioPhase { - public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) { - if (familySpecs != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // spec list set, and treat the entries as if they had been specified on the command line. - ArrayList specs = new ArrayList(); - for (String familySpec : familySpecs) { - File specFile = new File(familySpec); + //Create 2 fake alleles + //The actual bases will never be used but the Genotypes created using the alleles will be. + private final Allele REF = Allele.create("A",true); + private final Allele VAR = Allele.create("A",false); + private final Allele NO_CALL = Allele.create(".",false); + private final String DUMMY_NAME = "DummySample"; - try { - XReadLines reader = new XReadLines(specFile); + private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); - List lines = reader.readLines(); - for (String line : lines) { - specs.add(new Trio(line)); - } - } catch (FileNotFoundException e) { - specs.add(new Trio(familySpec)); // not a file, so must be a family spec + private ArrayList getAlleles(Genotype.Type genotype){ + ArrayList alleles = new ArrayList(2); + if(genotype == Genotype.Type.HOM_REF){ + alleles.add(REF); + alleles.add(REF); + } + else if(genotype == Genotype.Type.HET){ + alleles.add(REF); + alleles.add(VAR); + } + else if(genotype == Genotype.Type.HOM_VAR){ + alleles.add(VAR); + alleles.add(VAR); + } + else{ + return null; + } + return alleles; + } + + private boolean isPhasable(Genotype.Type genotype){ + return genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HET || genotype == Genotype.Type.HOM_VAR; + } + + //Create a new Genotype based on information from a single individual + //Homozygous genotypes will be set as phased, heterozygous won't be + private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ + if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + } + else + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + } + + //Find the phase for a parent/child pair + private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ + + //Special case for Het/Het as it is ambiguous + if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + return; + } + + ArrayList parentAlleles = getAlleles(parentGenotype); + ArrayList childAlleles = getAlleles(childGenotype); + ArrayList parentPhasedAlleles = new ArrayList(2); + ArrayList childPhasedAlleles = new ArrayList(2); + + //If there is a possible phasing between the mother and child => phase + int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); + if(childTransmittedAlleleIndex > -1){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + } + else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ + parentPhasedAlleles.add(parentAlleles.get(1)); + parentPhasedAlleles.add(parentAlleles.get(0)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + } + //This is a Mendelian Violation => Do not phase + else{ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + } + } + + //Phases a family by transmission + private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); + + //Build all possible child genotypes for the given parent's genotypes + for (Allele momAllele : motherAlleles) { + for (Allele fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); + possiblePhasedChildAlleles.add(momAllele); + possiblePhasedChildAlleles.add(fatherAllele); + possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); } } - return specs; + for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { + int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); + int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); + //If a possible combination has been found, create the genotypes + if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { + //Create mother's genotype + ArrayList motherPhasedAlleles = new ArrayList(2); + motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); + if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) + motherPhasedAlleles.add(motherAlleles.get(0)); + else + motherPhasedAlleles.add(motherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Create father's genotype + ArrayList fatherPhasedAlleles = new ArrayList(2); + fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); + if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) + fatherPhasedAlleles.add(fatherAlleles.get(0)); + else + fatherPhasedAlleles.add(fatherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Create child's genotype + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Once a phased combination is found; exit + return; + } + } + + //If this is reached then no phasing could be found + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - return new ArrayList(); + /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. + If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair + or single individual. + */ + public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + //Take care of cases where one or more family members are no call + if(!isPhasable(child)){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else if(!isPhasable(mother)){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + if(!isPhasable(father)){ + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else + phasePairAlleles(father, child, FamilyMember.FATHER); + } + else if(!isPhasable(father)){ + phasePairAlleles(mother, child, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + } + //Special case for Het/Het/Het as it is ambiguous + else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + //All family members have genotypes and at least one of them is not Het + else{ + phaseFamilyAlleles(mother, father, child); + } + } + + /** + * Applies the trio genotype combination to the given trio. + * @param ref: Reference allele + * @param alt: Alternate allele + * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination + * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination + * @param childGenotype: Genotype of the child to phase using this trio genotype combination + * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) + * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child + */ + public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); + } + + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + + int phredScoreTransmission = -1; + if(transmissionProb != NO_TRANSMISSION_PROB) + phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb)); + + //Handle null, missing and unavailable genotypes + //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable + //genotype so it is safe to return the original genotype in this case. + //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. + if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) + return genotype; + + //Add the transmission probability + Map genotypeAttributes = new HashMap(); + genotypeAttributes.putAll(genotype.getAttributes()); + if(transmissionProb>NO_TRANSMISSION_PROB) + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); + + ArrayList phasedAlleles = new ArrayList(2); + for(Allele allele : phasedGenotype.getAlleles()){ + if(allele.isReference()) + phasedAlleles.add(refAllele); + else if(allele.isNonReference()) + phasedAlleles.add(altAllele); + //At this point there should not be any other alleles left + else + throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString())); + + } + + //Compute the new Log10Error if the genotype is different from the original genotype + double negLog10Error; + if(genotype.getType() == phasedGenotype.getType()) + negLog10Error = genotype.getNegLog10PError(); + else + negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType()); + + return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased()); + } + + } /** - * Parse the familial relationship specification, and initialize VCF writer + * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer */ public void initialize() { - trios = getFamilySpecsFromCommandLineInput(familySpecs); - ArrayList rodNames = new ArrayList(); rodNames.add(variantCollection.variants.getName()); - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + //Get the trios from the families passed as ped + setTrios(); + if(trios.size()<1) + throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted."); + + Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct")); headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); + + buildMatrices(); + + if(mvFile != null) + mvFile.println("#CHROM\tPOS\tFILTER\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_RAD\tMOTHER_AAD\tMOTHER_HRPL\tMOTHER_HETPL\tMOTHER_HAPL\tFATHER_GT\tFATHER_DP\tFATHER_RAD\tFATHER_AAD\tFATHER_HRPL\tFATHER_HETPL\tFATHER_HAPL\tCHILD_GT\tCHILD_DP\tCHILD_RAD\tCHILD_AAD\tCHILD_HRPL\tCHILD_HETPL\tCHILD_HAPL"); + } - private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) { - double[] momLikelihoods = MathUtils.normalizeFromLog10(mom.getLikelihoods().getAsVector()); - double[] dadLikelihoods = MathUtils.normalizeFromLog10(dad.getLikelihoods().getAsVector()); - double[] childLikelihoods = MathUtils.normalizeFromLog10(child.getLikelihoods().getAsVector()); + /** + * Select trios and parent/child pairs only + */ + private void setTrios(){ + + Map> families = this.getSampleDB().getFamilies(); + Set family; + ArrayList parents; + for(String familyID : families.keySet()){ + family = families.get(familyID); + if(family.size()<2 || family.size()>3){ + logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID,family.size())); + } + else{ + for(Sample familyMember : family){ + parents = familyMember.getParents(); + if(parents.size()>0){ + if(family.containsAll(parents)) + this.trios.add(familyMember); + else + logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID)); + break; + } + } + } + + } + - int momIndex = mom.getType().ordinal() - 1; - int dadIndex = dad.getType().ordinal() - 1; - int childIndex = child.getType().ordinal() - 1; - return momLikelihoods[momIndex]*dadLikelihoods[dadIndex]*childLikelihoods[childIndex]; } - private ArrayList createAllThreeGenotypes(Allele refAllele, Allele altAllele, Genotype g) { - List homRefAlleles = new ArrayList(); - homRefAlleles.add(refAllele); - homRefAlleles.add(refAllele); - Genotype homRef = new Genotype(g.getSampleName(), homRefAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - List hetAlleles = new ArrayList(); - hetAlleles.add(refAllele); - hetAlleles.add(altAllele); - Genotype het = new Genotype(g.getSampleName(), hetAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - List homVarAlleles = new ArrayList(); - homVarAlleles.add(altAllele); - homVarAlleles.add(altAllele); - Genotype homVar = new Genotype(g.getSampleName(), homVarAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - ArrayList genotypes = new ArrayList(); - genotypes.add(homRef); - genotypes.add(het); - genotypes.add(homVar); - - return genotypes; + //Create the transmission matrices + private void buildMatrices(){ + mvCountMatrix = new EnumMap>>(Genotype.Type.class); + transmissionMatrix = new EnumMap>>(Genotype.Type.class); + for(Genotype.Type mother : Genotype.Type.values()){ + mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + for(Genotype.Type father : Genotype.Type.values()){ + mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + for(Genotype.Type child : Genotype.Type.values()){ + mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); + transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); + } + } + } } - private int getNumberOfMatchingAlleles(Allele alleleToMatch, Genotype g) { - List alleles = g.getAlleles(); - int matchingAlleles = 0; + //Returns the number of Mendelian Violations for a given genotype combination. + //If one of the parents genotype is missing, it will consider it as a parent/child pair + //If the child genotype or both parents genotypes are missing, 0 is returned. + private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - for (Allele a : alleles) { - if (!alleleToMatch.equals(a)) { - matchingAlleles++; + //Child is no call => No MV + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE) + return 0; + //Add parents with genotypes for the evaluation + ArrayList parents = new ArrayList(); + if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE)) + parents.add(mother); + if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE)) + parents.add(father); + + //Both parents no calls => No MV + if (parents.isEmpty()) + return 0; + + //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed + int parentsNumRefAlleles = 0; + int parentsNumAltAlleles = 0; + + for(Genotype.Type parent : parents){ + if(parent == Genotype.Type.HOM_REF){ + parentsNumRefAlleles++; + } + else if(parent == Genotype.Type.HET){ + parentsNumRefAlleles++; + parentsNumAltAlleles++; + } + else if(parent == Genotype.Type.HOM_VAR){ + parentsNumAltAlleles++; } } - return matchingAlleles; - } - - private boolean isMendelianViolation(Allele refAllele, Allele altAllele, Genotype mom, Genotype dad, Genotype child) { - int numMomRefAlleles = getNumberOfMatchingAlleles(refAllele, mom) > 0 ? 1 : 0; - int numMomAltAlleles = getNumberOfMatchingAlleles(altAllele, mom) > 0 ? 1 : 0; - - int numDadRefAlleles = getNumberOfMatchingAlleles(refAllele, dad) > 0 ? 1 : 0; - int numDadAltAlleles = getNumberOfMatchingAlleles(altAllele, dad) > 0 ? 1 : 0; - - int numChildRefAlleles = getNumberOfMatchingAlleles(refAllele, child); - int numChildAltAlleles = getNumberOfMatchingAlleles(altAllele, child); - - return (numMomRefAlleles + numDadRefAlleles < numChildRefAlleles || numMomAltAlleles + numDadAltAlleles < numChildAltAlleles); - } - - private ArrayList getPhasedGenotypes(Genotype mom, Genotype dad, Genotype child) { - Set possiblePhasedChildGenotypes = new HashSet(); - - for (Allele momAllele : mom.getAlleles()) { - for (Allele dadAllele : dad.getAlleles()) { - ArrayList possiblePhasedChildAlleles = new ArrayList(); - possiblePhasedChildAlleles.add(momAllele); - possiblePhasedChildAlleles.add(dadAllele); - - Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getNegLog10PError(), child.getFilters(), child.getAttributes(), true); - - possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype); - } + //Case Child is HomRef + if(child == Genotype.Type.HOM_REF){ + if(parentsNumRefAlleles == parents.size()) + return 0; + else return (parents.size()-parentsNumRefAlleles); } - ArrayList finalGenotypes = new ArrayList(); - - for (Genotype phasedChildGenotype : possiblePhasedChildGenotypes) { - if (child.sameGenotype(phasedChildGenotype, true)) { - Allele momTransmittedAllele = phasedChildGenotype.getAllele(0); - Allele momUntransmittedAllele = mom.getAllele(0) != momTransmittedAllele ? mom.getAllele(0) : mom.getAllele(1); - - ArrayList phasedMomAlleles = new ArrayList(); - phasedMomAlleles.add(momTransmittedAllele); - phasedMomAlleles.add(momUntransmittedAllele); - - Genotype phasedMomGenotype = new Genotype(mom.getSampleName(), phasedMomAlleles, mom.getNegLog10PError(), mom.getFilters(), mom.getAttributes(), true); - - Allele dadTransmittedAllele = phasedChildGenotype.getAllele(1); - Allele dadUntransmittedAllele = dad.getAllele(0) != dadTransmittedAllele ? dad.getAllele(0) : dad.getAllele(1); - - ArrayList phasedDadAlleles = new ArrayList(); - phasedDadAlleles.add(dadTransmittedAllele); - phasedDadAlleles.add(dadUntransmittedAllele); - - Genotype phasedDadGenotype = new Genotype(dad.getSampleName(), phasedDadAlleles, dad.getNegLog10PError(), dad.getFilters(), dad.getAttributes(), true); - - finalGenotypes.add(phasedMomGenotype); - finalGenotypes.add(phasedDadGenotype); - finalGenotypes.add(phasedChildGenotype); - - return finalGenotypes; - } + //Case child is HomVar + if(child == Genotype.Type.HOM_VAR){ + if(parentsNumAltAlleles == parents.size()) + return 0; + else return parents.size()-parentsNumAltAlleles; } - finalGenotypes.add(mom); - finalGenotypes.add(dad); - finalGenotypes.add(child); + //Case child is Het + if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) + return 0; - return finalGenotypes; + //MV + return 1; } - private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) { - ArrayList finalGenotypes = new ArrayList(); - finalGenotypes.add(mother); - finalGenotypes.add(father); - finalGenotypes.add(child); + //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations. + private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){ + int count = 0; + if(motherOriginal!=motherNew) + count++; + if(fatherOriginal!=fatherNew) + count++; + if(childOriginal!=childNew) + count++; + return count; + } - if (mother.isCalled() && father.isCalled() && child.isCalled()) { - ArrayList possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother); - ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father); - ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child); + //Get a Map of genotype likelihoods. + //In case of null, unavailable or no call, all likelihoods are 1/3. + private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ + if(genotype == null || !genotype.isCalled()){ + EnumMap likelihoods = new EnumMap(Genotype.Type.class); + likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0); + likelihoods.put(Genotype.Type.HET,1.0/3.0); + likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0); + return likelihoods; + } + return genotype.getLikelihoods().getAsMap(true); + } - double bestConfigurationLikelihood = 0.0; - double bestPrior = 0.0; - Genotype bestMotherGenotype = mother; - Genotype bestFatherGenotype = father; - Genotype bestChildGenotype = child; + //Returns the Genotype.Type; returns UNVAILABLE if given null + private Genotype.Type getTypeSafeNull(Genotype genotype){ + if(genotype == null) + return Genotype.Type.UNAVAILABLE; + return genotype.getType(); + } - double norm = 0.0; - for (Genotype motherGenotype : possibleMotherGenotypes) { - for (Genotype fatherGenotype : possibleFatherGenotypes) { - for (Genotype childGenotype : possibleChildGenotypes) { - double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; - double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype); - norm += prior*configurationLikelihood; + /** + * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair. + * @param ref: Reference allele + * @param alt: Alternative allele + * @param mother: Mother's genotype + * @param father: Father's genotype + * @param child: Child's genotype + * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child + * @return + */ + private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { - if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) { - bestConfigurationLikelihood = configurationLikelihood; - bestPrior = prior; - bestMotherGenotype = motherGenotype; - bestFatherGenotype = fatherGenotype; - bestChildGenotype = childGenotype; + //Check whether it is a pair or trio + //Always assign the first parent as the parent having genotype information in pairs + //Always assign the mother as the first parent in trios + int parentsCalled = 0; + Map firstParentLikelihoods; + Map secondParentLikelihoods; + ArrayList bestFirstParentGenotype = new ArrayList(); + ArrayList bestSecondParentGenotype = new ArrayList(); + ArrayList bestChildGenotype = new ArrayList(); + Genotype.Type pairSecondParentGenotype = null; + if(mother == null || !mother.isCalled()){ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + bestFirstParentGenotype.add(getTypeSafeNull(father)); + bestSecondParentGenotype.add(getTypeSafeNull(mother)); + pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType(); + if(father != null && father.isCalled()) + parentsCalled = 1; + } + else{ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + bestFirstParentGenotype.add(getTypeSafeNull(mother)); + bestSecondParentGenotype.add(getTypeSafeNull(father)); + if(father == null || !father.isCalled()){ + parentsCalled = 1; + pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType(); + }else{ + parentsCalled = 2; + } + } + Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); + bestChildGenotype.add(getTypeSafeNull(child)); + + //Prior vars + double bestConfigurationLikelihood = 0.0; + double norm = 0.0; + int configuration_index =0; + ArrayList bestMVCount = new ArrayList(); + bestMVCount.add(0); + + //Get the most likely combination + //Only check for most likely combination if at least a parent and the child have genotypes + if(child.isCalled() && parentsCalled > 0){ + int mvCount; + int cumulativeMVCount = 0; + double configurationLikelihood = 0; + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){ + for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey()); + //For parent/child pairs, sum over the possible genotype configurations of the missing parent + if(parentsCalled<2){ + cumulativeMVCount += mvCount; + configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); } + //Evaluate configurations of trios + else{ + configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(mvCount); + bestFirstParentGenotype.clear(); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.clear(); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(mvCount); + } + } + } + //Evaluate configurations of parent/child pairs + if(parentsCalled<2){ + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(cumulativeMVCount/3); + bestChildGenotype.clear(); + bestFirstParentGenotype.clear(); + bestSecondParentGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(cumulativeMVCount/3); + } + configurationLikelihood = 0; } } } - if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) { - Map attributes = new HashMap(); - attributes.putAll(bestChildGenotype.getAttributes()); - attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm); - bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes); + //normalize the best configuration probability + bestConfigurationLikelihood = bestConfigurationLikelihood / norm; - finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype); + //In case of multiple equally likely combinations, take a random one + if(bestFirstParentGenotype.size()>1){ + configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1); } + + } + else{ + bestConfigurationLikelihood = NO_TRANSMISSION_PROB; } - return finalGenotypes; + TrioPhase phasedTrioGenotypes; + if(parentsCalled < 2 && mother == null || !mother.isCalled()) + phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + else + phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + + //Return the phased genotypes + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); + return bestMVCount.get(configuration_index); + + } + + + private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){ + + //Increment metrics counters + if(parent.isCalled() && child.isCalled()){ + counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); + if(parent.isPhased()) + counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); + else{ + counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount); + if(parent.isHet() && child.isHet()) + counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); + } + }else{ + counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); + } + + } + + private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){ + + //Increment metrics counters + if(mother.isCalled() && father.isCalled() && child.isCalled()){ + counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); + if(mother.isPhased()) + counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); + + else{ + if(mvCount > 0){ + if(mvCount >1) + counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1); + else + counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + } + else if(mother.isHet() && father.isHet() && child.isHet()) + counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); + + } + }else{ + counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1); + } } /** @@ -292,55 +729,156 @@ public class PhaseByTransmission extends RodWalker { * @return null */ @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + + String mvfLine; + if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); Map genotypeMap = vc.getGenotypes(); - for (Trio trio : trios) { - Genotype mother = vc.getGenotype(trio.getMother()); - Genotype father = vc.getGenotype(trio.getFather()); - Genotype child = vc.getGenotype(trio.getChild()); + int mvCount; - ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child); + for (Sample sample : trios) { + Genotype mother = vc.getGenotype(sample.getMaternalID()); + Genotype father = vc.getGenotype(sample.getPaternalID()); + Genotype child = vc.getGenotype(sample.getID()); + + //Keep only trios and parent/child pairs + if(mother == null && father == null || child == null) + continue; + + ArrayList trioGenotypes = new ArrayList(3); + mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); Genotype phasedMother = trioGenotypes.get(0); Genotype phasedFather = trioGenotypes.get(1); Genotype phasedChild = trioGenotypes.get(2); - genotypeMap.put(phasedMother.getSampleName(), phasedMother); - genotypeMap.put(phasedFather.getSampleName(), phasedFather); - genotypeMap.put(phasedChild.getSampleName(), phasedChild); + //Fill the genotype map with the new genotypes and increment metrics counters + genotypeMap.put(phasedChild.getSampleName(),phasedChild); + if(mother != null){ + genotypeMap.put(phasedMother.getSampleName(), phasedMother); + if(father != null){ + genotypeMap.put(phasedFather.getSampleName(), phasedFather); + updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + } + else{ + updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); + if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + } + } + else{ + genotypeMap.put(phasedFather.getSampleName(),phasedFather); + updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); + if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + } + + //Report violation if set so + //TODO: ADAPT FOR PAIRS TOO!! + if(mvCount>0 && mvFile != null) + mvFile.println(mvfLine); + } + VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap); vcfWriter.add(newvc); } - - return null; + return metricsCounters; } /** - * Provide an initial value for reduce computations. + * Initializes the reporting counters. * - * @return Initial value of reduce. + * @return All counters initialized to 0 */ @Override - public Integer reduceInit() { - return null; + public HashMap reduceInit() { + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + + return metricsCounters; } /** - * Reduces a single map with the accumulator provided as the ReduceType. + * Adds the value of the site phased to the reporting counters. * - * @param value result of the map. - * @param sum accumulator for the reduce. + * @param value Site values + * @param sum accumulator for the reporting counters * @return accumulator with result of the map taken into account. */ @Override - public Integer reduce(Integer value, Integer sum) { - return null; + public HashMap reduce(HashMap value, HashMap sum) { + sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); + sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); + sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); + sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET)); + sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS)); + sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED)); + sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL)); + sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); + sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); + sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); + sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED)); + + return sum; + } + + + /** + * Reports statistics on the phasing by transmission process. + * @param result Accumulator with all counters. + */ + @Override + public void onTraversalDone(HashMap result) { + logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); + logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); + logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); + logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); + logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); + logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); + logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); + logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED)); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java deleted file mode 100644 index e770418c1..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.walkers.ReadPairWalker; -import org.broadinstitute.sting.utils.collections.ExpandingArrayList; - -import java.io.PrintStream; -import java.util.Collection; -import java.util.List; - -/** - * Counts the number of read pairs encountered in a file sorted in - * query name order. Breaks counts down by total pairs and number - * of paired reads. - * - * - *

Input

- *

- * One or more bam files. - *

- * - *

Output

- *

- * Number of pairs seen. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T CountPairs \
- *   -o output.txt \
- *   -I input.bam
- * 
- * - * @author mhanna - */ -public class CountPairsWalker extends ReadPairWalker { - @Output - private PrintStream out; - - /** - * How many reads are the first in a pair, based on flag 0x0040 from the SAM spec. - */ - private long firstOfPair = 0; - - /** - * How many reads are the second in a pair, based on flag 0x0080 from the SAM spec. - */ - private long secondOfPair = 0; - - /** - * A breakdown of the total number of reads seen with exactly the same read name. - */ - private List pairCountsByType = new ExpandingArrayList(); - - /** - * Maps a read pair to a given reduce of type MapType. Semantics determined by subclasser. - * @param reads Collection of reads having the same name. - * @return Semantics defined by implementer. - */ - @Override - public Integer map(Collection reads) { - if(pairCountsByType.get(reads.size()) != null) - pairCountsByType.set(reads.size(),pairCountsByType.get(reads.size())+1); - else - pairCountsByType.set(reads.size(),1L); - - for(SAMRecord read: reads) { - if(read.getFirstOfPairFlag()) firstOfPair++; - if(read.getSecondOfPairFlag()) secondOfPair++; - } - - return 1; - } - - /** - * No pairs at the beginning of a traversal. - * @return 0 always. - */ - @Override - public Long reduceInit() { - return 0L; - } - - /** - * Combine number of pairs seen in this iteration (always 1) with total number of pairs - * seen in previous iterations. - * @param value Pairs in this iteration (1), from the map function. - * @param sum Count of all pairs in prior iterations. - * @return All pairs encountered in previous iterations + all pairs encountered in this iteration (sum + 1). - */ - @Override - public Long reduce(Integer value, Long sum) { - return value + sum; - } - - /** - * Print summary statistics over the entire traversal. - * @param sum A count of all read pairs viewed. - */ - @Override - public void onTraversalDone(Long sum) { - out.printf("Total number of pairs : %d%n",sum); - out.printf("Total number of first reads in pair : %d%n",firstOfPair); - out.printf("Total number of second reads in pair: %d%n",secondOfPair); - for(int i = 1; i < pairCountsByType.size(); i++) { - if(pairCountsByType.get(i) == null) - continue; - out.printf("Pairs of size %d: %d%n",i,pairCountsByType.get(i)); - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 609593acc..b92b1cee0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,10 +24,8 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.apache.poi.hpsf.Variant; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; @@ -275,8 +273,8 @@ public class SelectVariants extends RodWalker { private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; /** - * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so use it only for a reasonable - * number of variants. Use --select_random_fraction for larger numbers of variants. + * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory + * given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants. */ @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) private int numRandom = 0; @@ -532,7 +530,7 @@ public class SelectVariants extends RodWalker { } } if (SELECT_RANDOM_NUMBER) { - randomlyAddVariant(++variantNumber, sub, ref.getBase()); + randomlyAddVariant(++variantNumber, sub); } else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { vcfWriter.add(sub); @@ -705,7 +703,7 @@ public class SelectVariants extends RodWalker { return sub; } - private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) { + private void randomlyAddVariant(int rank, VariantContext vc) { if (nVariantsAdded < numRandom) variantArray[nVariantsAdded++] = new RandomVariantStructure(vc); diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index e10bcbaa0..8cba183da 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -554,4 +554,54 @@ public class GenomeLocParser { return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true); } + /** + * Creates a loc to the left (starting at the loc start + 1) of maxBasePairs size. + * @param loc The original loc + * @param maxBasePairs The maximum number of basePairs + * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig. + */ + @Requires({"loc != null", "maxBasePairs > 0"}) + public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) { + if (GenomeLoc.isUnmapped(loc)) + return null; + String contigName = loc.getContig(); + SAMSequenceRecord contig = contigInfo.getSequence(contigName); + int contigIndex = contig.getSequenceIndex(); + + int start = loc.getStart() - maxBasePairs; + int stop = loc.getStart() - 1; + + if (start < 1) + start = 1; + if (stop < 1) + return null; + + return createGenomeLoc(contigName, contigIndex, start, stop, true); + } + + /** + * Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size. + * @param loc The original loc + * @param maxBasePairs The maximum number of basePairs + * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig. + */ + @Requires({"loc != null", "maxBasePairs > 0"}) + public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) { + if (GenomeLoc.isUnmapped(loc)) + return null; + String contigName = loc.getContig(); + SAMSequenceRecord contig = contigInfo.getSequence(contigName); + int contigIndex = contig.getSequenceIndex(); + int contigLength = contig.getSequenceLength(); + + int start = loc.getStop() + 1; + int stop = loc.getStop() + maxBasePairs; + + if (start > contigLength) + return null; + if (stop > contigLength) + stop = contigLength; + + return createGenomeLoc(contigName, contigIndex, start, stop, true); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f0e164c87..159b145a0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -233,8 +233,12 @@ public class IntervalUtils { * * Returns a null string if there are no differences, otherwise returns a string describing the difference * (useful for UnitTests). Assumes both lists are sorted + * + * @param masterArg sorted master genome locs + * @param testArg sorted test genome locs + * @return null string if there are no difference, otherwise a string describing the difference */ - public static final String equateIntervals(List masterArg, List testArg) { + public static String equateIntervals(List masterArg, List testArg) { LinkedList master = new LinkedList(masterArg); LinkedList test = new LinkedList(testArg); @@ -317,23 +321,6 @@ public class IntervalUtils { return lengths; } - /** - * Counts the number of interval files an interval list can be split into using scatterIntervalArguments. - * @param locs The genome locs. - * @return The maximum number of parts the intervals can be split into. - */ - public static int countContigIntervals(List locs) { - int maxFiles = 0; - String contig = null; - for (GenomeLoc loc: locs) { - if (contig == null || !contig.equals(loc.getContig())) { - maxFiles++; - contig = loc.getContig(); - } - } - return maxFiles; - } - /** * Splits an interval list into multiple files. * @param fileHeader The sam file header. @@ -373,7 +360,6 @@ public class IntervalUtils { * @return A list of lists of genome locs, split according to splits */ public static List> splitIntervalsToSubLists(List locs, List splits) { - int locIndex = 1; int start = 0; List> sublists = new ArrayList>(splits.size()); for (Integer stop: splits) { @@ -465,7 +451,7 @@ public class IntervalUtils { @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) @Ensures({"result != null"}) - final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { + static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { final List split = new ArrayList(); long size = 0; @@ -579,10 +565,101 @@ public class IntervalUtils { } } - public static final long intervalSize(final List locs) { + public static long intervalSize(final List locs) { long size = 0; for ( final GenomeLoc loc : locs ) size += loc.size(); return size; } + + public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) { + ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference); + GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference()); + List originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath()); + + if (originalList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "File contains no intervals"); + + List flankingList = getFlankingIntervals(parser, originalList, basePairs); + + if (flankingList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals"); + + SAMFileHeader samFileHeader = new SAMFileHeader(); + samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); + IntervalList intervalList = new IntervalList(samFileHeader); + int i = 0; + for (GenomeLoc loc: flankingList) + intervalList.add(toInterval(loc, ++i)); + intervalList.write(flankingIntervals); + } + + /** + * Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs. + * @param parser A genome loc parser for creating the new intervals + * @param locs Original genome locs + * @param basePairs Number of base pairs on each side of loc + * @return The list of intervals between the locs + */ + public static List getFlankingIntervals(final GenomeLocParser parser, final List locs, final int basePairs) { + List sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList(); + + if (sorted.size() == 0) + return Collections.emptyList(); + + LinkedHashMap> locsByContig = splitByContig(sorted); + List expanded = new ArrayList(); + for (String contig: locsByContig.keySet()) { + List contigLocs = locsByContig.get(contig); + int contigLocsSize = contigLocs.size(); + + GenomeLoc startLoc, stopLoc; + + // Create loc at start of the list + startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs); + if (startLoc != null) + expanded.add(startLoc); + + // Create locs between each loc[i] and loc[i+1] + for (int i = 0; i < contigLocsSize - 1; i++) { + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs); + startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs); + if (stopLoc.getStop() + 1 >= startLoc.getStart()) { + // NOTE: This is different than GenomeLoc.merge() + // merge() returns a loc which covers the entire range of stop and start, + // possibly returning positions inside loc(i) or loc(i+1) + // We want to make sure that the start of the stopLoc is used, and the stop of the startLoc + GenomeLoc merged = parser.createGenomeLoc( + stopLoc.getContig(), stopLoc.getStart(), startLoc.getStop()); + expanded.add(merged); + } else { + expanded.add(stopLoc); + expanded.add(startLoc); + } + } + + // Create loc at the end of the list + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs); + if (stopLoc != null) + expanded.add(stopLoc); + } + return expanded; + } + + private static LinkedHashMap> splitByContig(List sorted) { + LinkedHashMap> splits = new LinkedHashMap>(); + GenomeLoc last = null; + List contigLocs = null; + for (GenomeLoc loc: sorted) { + if (GenomeLoc.isUnmapped(loc)) + continue; + if (last == null || !last.onSameContig(loc)) { + contigLocs = new ArrayList(); + splits.put(loc.getContig(), contigLocs); + } + contigLocs.add(loc); + last = loc; + } + return splits; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 3fe1060dd..6d7c8dad9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -188,7 +188,9 @@ public class GATKSAMRecord extends BAMRecord { } public final byte getReducedCount(final int i) { - return getReducedReadCounts()[i]; + byte firstCount = getReducedReadCounts()[0]; + byte offsetCount = getReducedReadCounts()[i]; + return (i==0) ? firstCount : (byte) Math.min(firstCount + offsetCount, Byte.MAX_VALUE); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index dba16cf86..8c8e4f257 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -25,7 +25,13 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.gatk.io.DirectOutputTracker; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.jgrapht.util.MathUtil; + +import java.util.EnumMap; +import java.util.Map; public class GenotypeLikelihoods { public static final boolean CAP_PLS = false; @@ -94,6 +100,48 @@ public class GenotypeLikelihoods { return likelihoodsAsString_PLs; } + //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values + //Returns null in case of missing likelihoods + public EnumMap getAsMap(boolean normalizeFromLog10){ + //Make sure that the log10likelihoods are set + double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); + if(likelihoods == null) + return null; + EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class); + likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]); + return likelihoodsMap; + } + + //Return the neg log10 Genotype Quality (GQ) for the given genotype + //Returns Double.NEGATIVE_INFINITY in case of missing genotype + public double getNegLog10GQ(Genotype.Type genotype){ + + double qual = Double.NEGATIVE_INFINITY; + EnumMap likelihoods = getAsMap(false); + if(likelihoods == null) + return qual; + for(Map.Entry likelihood : likelihoods.entrySet()){ + if(likelihood.getKey() == genotype) + continue; + if(likelihood.getValue() > qual) + qual = likelihood.getValue(); + + } + + //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best) + qual = likelihoods.get(genotype) - qual; + + //Quality of other genotypes 1-P(G) + if (qual < 0) { + double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); + double chosenGenotype = normalized[genotype.ordinal()-1]; + qual = -1.0 * Math.log10(1.0 - chosenGenotype); + } + return qual; + } + private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { String[] strings = likelihoodsAsString_PLs.split(","); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index f52a7087b..204b4b841 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -223,7 +223,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param referenceBaseForIndel padded reference base */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false); + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false, true); } /** @@ -240,7 +240,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false); + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false, true); } /** @@ -261,7 +261,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param referenceBaseForIndel padded reference base */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true); + this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true, true); } /** @@ -278,7 +278,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false); + this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false, true); } /** @@ -291,7 +291,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param alleles alleles */ public VariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false); + this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false, true); } /** @@ -314,7 +314,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param other the VariantContext to copy */ public VariantContext(VariantContext other) { - this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false); + this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false, true); } /** @@ -331,11 +331,13 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes * @param referenceBaseForIndel padded reference base * @param genotypesAreUnparsed true if the genotypes have not yet been parsed + * @param performValidation if true, call validate() as the final step in construction */ private VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, - Byte referenceBaseForIndel, boolean genotypesAreUnparsed) { + Byte referenceBaseForIndel, boolean genotypesAreUnparsed, + boolean performValidation ) { if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } this.contig = contig; this.start = start; @@ -371,39 +373,57 @@ public class VariantContext implements Feature { // to enable tribble intergrati } } - validate(); + if ( performValidation ) { + validate(); + } } // --------------------------------------------------------------------------------------------------------- // // Partial-cloning routines (because Variant Context is immutable). + // + // IMPORTANT: These routines assume that the VariantContext on which they're called is already valid. + // Due to this assumption, they explicitly tell the constructor NOT to perform validation by + // calling validate(), and instead perform validation only on the data that's changed. + // // Note that we don't call vc.getGenotypes() because that triggers the lazy loading. // Also note that we need to create a new attributes map because it's unmodifiable and the constructor may try to modify it. // // --------------------------------------------------------------------------------------------------------- public static VariantContext modifyGenotypes(VariantContext vc, Map genotypes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false); + VariantContext modifiedVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false, false); + modifiedVC.validateGenotypes(); + return modifiedVC; } public static VariantContext modifyLocation(VariantContext vc, String chr, int start, int end) { - return new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); + VariantContext modifiedVC = new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true, false); + + // Since start and end have changed, we need to call both validateAlleles() and validateReferencePadding(), + // since those validation routines rely on the values of start and end: + modifiedVC.validateAlleles(); + modifiedVC.validateReferencePadding(); + + return modifiedVC; } public static VariantContext modifyFilters(VariantContext vc, Set filters) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true, false); } public static VariantContext modifyAttributes(VariantContext vc, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true, false); } public static VariantContext modifyReferencePadding(VariantContext vc, Byte b) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes(), b, true); + VariantContext modifiedVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes(), b, true, false); + modifiedVC.validateReferencePadding(); + return modifiedVC; } public static VariantContext modifyPErrorFiltersAndAttributes(VariantContext vc, double negLog10PError, Set filters, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true, false); } // --------------------------------------------------------------------------------------------------------- diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java index 462abeba1..5c8fa32a8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java @@ -33,7 +33,7 @@ public class SnpEffUnitTest { @Test public void testParseWellFormedEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertTrue( effect.isWellFormed() && effect.isCoding() ); @@ -42,7 +42,7 @@ public class SnpEffUnitTest { @Test public void testParseInvalidEffectNameEffect() { String effectName = "MADE_UP_EFFECT"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); @@ -51,7 +51,7 @@ public class SnpEffUnitTest { @Test public void testParseInvalidEffectImpactEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MEDIUM", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); @@ -60,27 +60,27 @@ public class SnpEffUnitTest { @Test public void testParseWrongNumberOfMetadataFieldsEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); } @Test - public void testParseSnpEffWarningEffect() { + public void testParseSnpEffOneWarningOrErrorEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_OR_ERROR_TEXT" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); - Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") ); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning or error: \"SNPEFF_WARNING_OR_ERROR_TEXT\"") ); } @Test - public void testParseSnpEffErrorEffect() { + public void testParseSnpEffBothWarningAndErrorEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_TEXT", "SNPEFF_ERROR_TEXT" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); - Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") ); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: \"SNPEFF_WARNING_TEXT\", and the following error: \"SNPEFF_ERROR_TEXT\"") ); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index bde4c4a8f..3bfb81dd0 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("8e7de435105499cd71ffc099e268a83e")); + Arrays.asList("9beb795536e95954f810835c6058f2ad")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("64b6804cb1e27826e3a47089349be581")); + Arrays.asList("2977bb30c8b84a5f4094fe6090658561")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -64,7 +64,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("fd1ffb669800c2e07df1e2719aa38e49")); + Arrays.asList("49d989f467b8d6d8f98f7c1b67cd4a05")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -72,7 +72,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("09f8e840770a9411ff77508e0ed0837f")); + Arrays.asList("0948cd1dba7d61f283cc4cf2a7757d92")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -80,7 +80,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("b49fe03aa4b675db80a9db38a3552c95")); + Arrays.asList("33062eccd6eb73bc49440365430454c4")); executeTest("test exclude annotations", spec); } @@ -88,7 +88,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("78d2c19f8107d865970dbaf3e12edd92")); + Arrays.asList("062155edec46a8c52243475fbf3a2943")); executeTest("test overwriting header", spec); } @@ -96,7 +96,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("16e3a1403fc376320d7c69492cad9345")); + Arrays.asList("06635f2dd91b539bfbce9bf7914d8e43")); executeTest("not passing it any reads", spec); } @@ -104,7 +104,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("3da8ca2b6bdaf6e92d94a8c77a71313d")); + Arrays.asList("820eeba1f6e3a0758a69d937c524a38e")); executeTest("getting DB tag with dbSNP", spec); } @@ -112,7 +112,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --comp:H3 " + validationDataLocation + "fakeHM3.vcf -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("1bc01c5b3bd0b7aef75230310c3ce688")); + Arrays.asList("31cc2ce157dd20771418c08d6b3be1fa")); executeTest("getting DB tag with HM3", spec); } @@ -120,7 +120,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("ae30a1ac7bfbc3d22a327f8b689cad31")); + Arrays.asList("074865f8f8c0ca7bfd58681f396c49e9")); executeTest("using expression", spec); } @@ -128,7 +128,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("1b4921085b26cbfe07d53b7c947de1e5")); + Arrays.asList("97b26db8135d083566fb585a677fbe8a")); executeTest("using expression with ID", spec); } @@ -148,9 +148,9 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + - "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", + "snpEff2.0.4.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", 1, - Arrays.asList("122321a85e448f21679f6ca15c5e22ad") + Arrays.asList("51258f5c880bd1ca3eb45a1711335c66") ); executeTest("Testing SnpEff annotations", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index b80f214b1..0197f94e5 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -5,10 +5,8 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; -import java.io.File; import java.util.Arrays; import java.util.HashMap; -import java.util.List; import java.util.Map; // ********************************************************************************** // @@ -30,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("b27939251539439a382538e507e03507")); + Arrays.asList("364a6112e6034571a896d4c68b6ff02a")); executeTest("test MultiSample Pilot1", spec); } @@ -38,12 +36,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("8de2602679ffc92388da0b6cb4325ef6")); + Arrays.asList("6cce55734ecd1a56c33f0ecba67d1258")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("6458f3b8fe4954e2ffc2af972aaab19e")); + Arrays.asList("b2a00130a4b6f9a3c31288a236f5d159")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -51,7 +49,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("6762b72ae60155ad71738d7c76b80e4b")); + Arrays.asList("3ccce5d909f8f128e496f6841836e5f7")); executeTest("test SingleSample Pilot2", spec); } @@ -61,7 +59,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "bc71dba7bbdb23e7d5cc60461fdd897b"; + private final static String COMPRESSED_OUTPUT_MD5 = "890143b366050e78d6c6ba6b2c6b6864"; @Test public void testCompressedOutput() { @@ -82,7 +80,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "b9504e446b9313559c3ed97add7e8dc1"; + String md5 = "95614280c565ad90f8c000376fef822c"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -113,8 +111,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testCallingParameters() { HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "bb3f294eab3e2cf52c70e63b23aac5ee" ); - e.put( "--computeSLOD", "eb34979efaadba1e34bd82bcacf5c722" ); + e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" ); + e.put( "--computeSLOD", "e9d23a08472e4e27b4f25e844f5bad57" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -127,9 +125,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameter() { HashMap e = new HashMap(); - e.put( "-sites_only", "d40114aa201aa33ff5f174f15b6b73af" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "3c681b053fd2280f3c42041d24243752" ); - e.put( "--output_mode EMIT_ALL_SITES", "eafa6d71c5ecd64dfee5d7a3f60e392e" ); + e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "94e53320f14c5ff29d62f68d36b46fcd" ); + e.put( "--output_mode EMIT_ALL_SITES", "73ad1cc41786b12c5f0e6f3e9ec2b728" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -143,12 +141,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("c71ca370947739cb7d87b59452be7a07")); + Arrays.asList("902327e8a45fe585c8dfd1a7c4fcf60f")); executeTest("test confidence 1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("1c0a599d475cc7d5e745df6e9b6c0d29")); + Arrays.asList("9ef66764a0ecfe89c3e4e1a93a69e349")); executeTest("test confidence 2", spec2); } @@ -160,8 +158,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "f84da90c310367bd51f2ab6e346fa3d8" ); - e.put( 1.0 / 1850, "5791e7fef40d4412b6d8f84e0a809c6c" ); + e.put( 0.01, "46243ecc2b9dc716f48ea280c9bb7e72" ); + e.put( 1.0 / 1850, "6b2a59dbc76984db6d4d6d6b5ee5d62c" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -185,7 +183,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("9cc9538ac83770e12bd0830d285bfbd0")); + Arrays.asList("f0fbe472f155baf594b1eeb58166edef")); executeTest(String.format("test multiple technologies"), spec); } @@ -204,7 +202,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("eaf8043edb46dfbe9f97ae03baa797ed")); + Arrays.asList("8c87c749a7bb5a76ed8504d4ec254272")); executeTest(String.format("test calling with BAQ"), spec); } @@ -223,7 +221,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("eeba568272f9b42d5450da75c7cc6d2d")); + Arrays.asList("a64d2e65b5927260e4ce0d948760cc5c")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -238,7 +236,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("5fe98ee853586dc9db58f0bc97daea63")); + Arrays.asList("2ad52c2e75b3ffbfd8f03237c444e8e6")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -251,7 +249,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("19ff9bd3139480bdf79dcbf117cf2b24")); + Arrays.asList("69107157632714150fc068d412e31939")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -261,7 +259,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("118918f2e9e56a3cfc5ccb2856d529c8")); + Arrays.asList("84f065e115004e4c9d9d62f1bf6b4f44")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -271,7 +269,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("a20799237accd52c1b8c2ac096309c8f")); + Arrays.asList("7ea8372e4074f90ec942f964e1863351")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -281,7 +279,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("18ef8181157b4ac3eb8492f538467f92")); + Arrays.asList("94afa8c011b47af7323c821bf19106b4")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -290,7 +288,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("ad884e511a751b05e64db5314314365a")); + Arrays.asList("1e02f57fafaa41db71c531eb25e148e1")); executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java index c663c1dd7..2cd76e7a5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java @@ -6,23 +6,131 @@ import org.testng.annotations.Test; import java.util.Arrays; public class PhaseByTransmissionIntegrationTest extends WalkerTest { - private static String phaseByTransmissionTestDataRoot = validationDataLocation + "/PhaseByTransmission"; - private static String fundamentalTestVCF = phaseByTransmissionTestDataRoot + "/" + "FundamentalsTest.unfiltered.vcf"; + private static String phaseByTransmissionTestDataRoot = validationDataLocation + "PhaseByTransmission/"; + private static String goodFamilyFile = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.goodFamilies.ped"; + private static String TNTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TN.vcf"; + private static String TPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TP.vcf"; + private static String FPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.FP.vcf"; + private static String SpecialTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.Special.vcf"; + //Tests using PbT on all genotypes with default parameters + //And all reporting options @Test - public void testBasicFunctionality() { + public void testTrueNegativeMV() { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T PhaseByTransmission", "-NO_HEADER", "-R " + b37KGReference, - "--variant " + fundamentalTestVCF, - "-f NA12892+NA12891=NA12878", + "--variant " + TNTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("16fefda693156eadf1481fd9de23facb","9418a7a6405b78179ca13a67b8bfcc14") + ); + executeTest("testTrueNegativeMV", spec); + } + + @Test + public void testTruePositiveMV() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + TPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("14cf1d21a54d8b9fb506df178b634c56","efc66ae3d036715b721f9bd35b65d556") + ); + executeTest("testTruePositiveMV", spec); + } + + @Test + public void testFalsePositiveMV() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("f9b0fae9fe1e0f09b883a292b0e70a12","398724bc1e65314cc5ee92706e05a3ee") + ); + executeTest("testFalsePositiveMV", spec); + } + + @Test + public void testSpecialCases() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + SpecialTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("b8d1aa3789ce77b45430c62d13ee3006","a1a333e08fafb288cda0e7711909e1c3") + ); + executeTest("testSpecialCases", spec); + } + + //Test using a different prior + //Here the FP file is used but as the prior is lowered, 3 turn to TP + @Test + public void testPriorOption() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-prior 1e-4", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("7201ce7cc47db5840ac6b647709f7c33","c11b5e7cd7459d90d0160f917eff3b1e") + ); + executeTest("testPriorOption", spec); + } + + //Test when running without MV reporting option + //This is the exact same test file as FP but should not generate a .mvf file + @Test + public void testMVFileOption() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", "-o %s" ), 1, - Arrays.asList("") + Arrays.asList("398724bc1e65314cc5ee92706e05a3ee") ); - executeTest("testBasicFunctionality", spec); + executeTest("testMVFileOption", spec); } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 3dceb9bd2..102d4715e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -21,16 +21,16 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-T VariantEval", "-R " + b37KGReference, "--dbsnp " + b37dbSNP132, - "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "--eval " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", "-noEV", "-EV TiTvVariantEvaluator", "-noST", "-ST FunctionalClass", - "-L " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-L " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", "-o %s" ), 1, - Arrays.asList("d9dcb352c53106f54fcc981f15d35a90") + Arrays.asList("a36414421621b377d6146d58d2fcecd0") ); executeTest("testFunctionClassWithSnpeff", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index f1f849bf5..e9f138a0e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -11,6 +10,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; /** @@ -36,7 +36,6 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test public void testGetContigIndexValid() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference } @@ -67,7 +66,6 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test public void testGetContigInfoKnownContig() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference } @@ -191,4 +189,104 @@ public class GenomeLocParserUnitTest extends BaseTest { assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end } + + private static class FlankingGenomeLocTestData extends TestDataProvider { + final GenomeLocParser parser; + final int basePairs; + final GenomeLoc original, flankStart, flankStop; + + private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) { + super(FlankingGenomeLocTestData.class, name); + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.flankStart = flankStart == null ? null : parse(parser, flankStart); + this.flankStop = flankStop == null ? null : parse(parser, flankStop); + } + + private static GenomeLoc parse(GenomeLocParser parser, String str) { + return "unmapped".equals(str) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str); + } + } + + @DataProvider(name = "flankingGenomeLocs") + public Object[][] getFlankingGenomeLocs() { + int contigLength = 10000; + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength); + GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary()); + + new FlankingGenomeLocTestData("atStartBase1", parser, 1, + "chr1:1", null, "chr1:2"); + + new FlankingGenomeLocTestData("atStartBase50", parser, 50, + "chr1:1", null, "chr1:2-51"); + + new FlankingGenomeLocTestData("atStartRange50", parser, 50, + "chr1:1-10", null, "chr1:11-60"); + + new FlankingGenomeLocTestData("atEndBase1", parser, 1, + "chr1:" + contigLength, "chr1:" + (contigLength - 1), null); + + new FlankingGenomeLocTestData("atEndBase50", parser, 50, + "chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null); + + new FlankingGenomeLocTestData("atEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 10, contigLength), + String.format("chr1:%d-%d", contigLength - 60, contigLength - 11), + null); + + new FlankingGenomeLocTestData("nearStartBase1", parser, 1, + "chr1:2", "chr1:1", "chr1:3"); + + new FlankingGenomeLocTestData("nearStartRange50", parser, 50, + "chr1:21-30", "chr1:1-20", "chr1:31-80"); + + new FlankingGenomeLocTestData("nearEndBase1", parser, 1, + "chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength); + + new FlankingGenomeLocTestData("nearEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 30, contigLength - 21), + String.format("chr1:%d-%d", contigLength - 80, contigLength - 31), + String.format("chr1:%d-%d", contigLength - 20, contigLength)); + + new FlankingGenomeLocTestData("beyondStartBase1", parser, 1, + "chr1:3", "chr1:2", "chr1:4"); + + new FlankingGenomeLocTestData("beyondStartRange50", parser, 50, + "chr1:101-200", "chr1:51-100", "chr1:201-250"); + + new FlankingGenomeLocTestData("beyondEndBase1", parser, 1, + "chr1:" + (contigLength - 3), + "chr1:" + (contigLength - 4), + "chr1:" + (contigLength - 2)); + + new FlankingGenomeLocTestData("beyondEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 200, contigLength - 101), + String.format("chr1:%d-%d", contigLength - 250, contigLength - 201), + String.format("chr1:%d-%d", contigLength - 100, contigLength - 51)); + + new FlankingGenomeLocTestData("unmapped", parser, 50, + "unmapped", null, null); + + new FlankingGenomeLocTestData("fullContig", parser, 50, + "chr1", null, null); + + return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class); + } + + @Test(dataProvider = "flankingGenomeLocs") + public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) { + GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.flankStart); + assertEquals(actual, data.flankStart, description); + } + + @Test(dataProvider = "flankingGenomeLocs") + public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) { + GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.flankStop); + assertEquals(actual, data.flankStop, description); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java index 46134cd24..53368c339 100755 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java @@ -16,7 +16,8 @@ public class ReadUtilsUnitTest extends BaseTest { GATKSAMRecord read, reducedRead; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; - final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40}; + final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40}; + final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30}; // just the offsets @BeforeTest public void init() { @@ -29,7 +30,7 @@ public class ReadUtilsUnitTest extends BaseTest { reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); reducedRead.setReadBases(BASES.getBytes()); reducedRead.setBaseQualityString(QUALS); - reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS); + reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); } private void testReadBasesAndQuals(GATKSAMRecord read, int expectedStart, int expectedStop) { diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 3f5d05e66..7a2696b7b 100755 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -41,11 +41,6 @@ public class SimpleTimerUnitTest extends BaseTest { double t6 = t.getElapsedTime(); Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); - - t.stop().start(); - Assert.assertTrue(t.isRunning(), "second started timer isn't running"); - Assert.assertTrue(t.getElapsedTime() >= 0.0, "elapsed time should have been reset"); - Assert.assertTrue(t.getElapsedTime() < t6, "elapsed time isn't less than time before start call"); // we should have effective no elapsed time } private final static void idleLoop() { diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index f625af23c..ecb5a6d33 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -30,8 +30,10 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; +import org.testng.annotations.*; + +import java.util.LinkedList; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -44,180 +46,214 @@ public class ReadClipperUnitTest extends BaseTest { // TODO: Add error messages on failed tests + //int debug = 0; + GATKSAMRecord read, expected; ReadClipper readClipper; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 - @BeforeClass + + public void testIfEqual( GATKSAMRecord read, byte[] readBases, String baseQuals, String cigar) { + Assert.assertEquals(read.getReadBases(), readBases); + Assert.assertEquals(read.getBaseQualityString(), baseQuals); + Assert.assertEquals(read.getCigarString(), cigar); + } + + public class testParameter { + int inputStart; + int inputStop; + int substringStart; + int substringStop; + String cigar; + + public testParameter(int InputStart, int InputStop, int SubstringStart, int SubstringStop, String Cigar) { + inputStart = InputStart; + inputStop = InputStop; + substringStart = SubstringStart; + substringStop = SubstringStop; + cigar = Cigar; + } + } + + // What the test read looks like + // Ref: 1 2 3 4 5 6 7 8 + // Read: 0 1 2 3 - - - - + // ----------------------------- + // Bases: A C T G - - - - + // Quals: ! + 5 ? - - - - + + @BeforeMethod public void init() { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); - read.setReadUnmappedFlag(true); read.setReadBases(new String(BASES).getBytes()); read.setBaseQualityString(new String(QUALS)); readClipper = new ReadClipper(read); + //logger.warn(read.getCigarString()); } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipBothEndsByReferenceCoordinates() { - logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); + logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); + //int debug = 1; //Clip whole read - Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(0,0), new GATKSAMRecord(read.getHeader())); + Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(1,1), new GATKSAMRecord(read.getHeader())); + //clip 1 base - expected = readClipper.hardClipBothEndsByReferenceCoordinates(0,3); + expected = readClipper.hardClipBothEndsByReferenceCoordinates(1,4); Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes()); Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3)); Assert.assertEquals(expected.getCigarString(), "1H2M1H"); } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReadCoordinates() { + logger.warn("Executing testHardClipByReadCoordinates"); //Clip whole read Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipByReadCoordinates(0,0); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); + List testList = new LinkedList(); + testList.add(new testParameter(0,0,1,4,"1H3M"));//clip 1 base at start + testList.add(new testParameter(3,3,0,3,"3M1H"));//clip 1 base at end + testList.add(new testParameter(0,1,2,4,"2H2M"));//clip 2 bases at start + testList.add(new testParameter(2,3,0,2,"2M2H"));//clip 2 bases at end - //clip 1 base at end - expected = readClipper.hardClipByReadCoordinates(3,3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); - - //clip 2 bases at start - expected = readClipper.hardClipByReadCoordinates(0,1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); - - //clip 2 bases at end - expected = readClipper.hardClipByReadCoordinates(2,3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReadCoordinates(p.inputStart, p.inputStop), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReferenceCoordinates() { logger.warn("Executing testHardClipByReferenceCoordinates"); - + //logger.warn(debug); //Clip whole read Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(1,4), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipByReferenceCoordinates(-1,1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); + List testList = new LinkedList(); + testList.add(new testParameter(-1,1,1,4,"1H3M"));//clip 1 base at start + testList.add(new testParameter(4,-1,0,3,"3M1H"));//clip 1 base at end + testList.add(new testParameter(-1,2,2,4,"2H2M"));//clip 2 bases at start + testList.add(new testParameter(3,-1,0,2,"2M2H"));//clip 2 bases at end - //clip 1 base at end - expected = readClipper.hardClipByReferenceCoordinates(3,-1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); - - //clip 2 bases at start - expected = readClipper.hardClipByReferenceCoordinates(-1,2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); - - //clip 2 bases at end - expected = readClipper.hardClipByReferenceCoordinates(2,-1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReferenceCoordinates(p.inputStart,p.inputStop), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReferenceCoordinatesLeftTail() { + init(); logger.warn("Executing testHardClipByReferenceCoordinatesLeftTail"); //Clip whole read Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(4), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipByReferenceCoordinatesLeftTail(1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); + List testList = new LinkedList(); + testList.add(new testParameter(1, -1, 1, 4, "1H3M"));//clip 1 base at start + testList.add(new testParameter(2, -1, 2, 4, "2H2M"));//clip 2 bases at start - //clip 2 bases at start - expected = readClipper.hardClipByReferenceCoordinatesLeftTail(2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReferenceCoordinatesLeftTail(p.inputStart), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) public void testHardClipByReferenceCoordinatesRightTail() { + init(); logger.warn("Executing testHardClipByReferenceCoordinatesRightTail"); //Clip whole read Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(1), new GATKSAMRecord(read.getHeader())); - //clip 1 base at end - expected = readClipper.hardClipByReferenceCoordinatesRightTail(3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); + List testList = new LinkedList(); + testList.add(new testParameter(-1, 4, 0, 3, "3M1H"));//clip 1 base at end + testList.add(new testParameter(-1, 3, 0, 2, "2M2H"));//clip 2 bases at end - //clip 2 bases at end - expected = readClipper.hardClipByReferenceCoordinatesRightTail(2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipByReferenceCoordinatesRightTail(p.inputStop), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } } - @Test ( enabled = false ) + @Test ( enabled = true ) // TODO This function is returning null reads public void testHardClipLowQualEnds() { - logger.warn("Executing testHardClipByReferenceCoordinates"); + logger.warn("Executing testHardClipByReferenceCoordinates"); //Clip whole read Assert.assertEquals(readClipper.hardClipLowQualEnds((byte)64), new GATKSAMRecord(read.getHeader())); - //clip 1 base at start - expected = readClipper.hardClipLowQualEnds((byte)34); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); - - //clip 2 bases at start - expected = readClipper.hardClipLowQualEnds((byte)44); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); + List testList = new LinkedList(); + testList.add(new testParameter(1,-1,1,4,"1H3M"));//clip 1 base at start + testList.add(new testParameter(11,-1,2,4,"2H2M"));//clip 2 bases at start + for ( testParameter p : testList ) { + init(); + //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipLowQualEnds( (byte)p.inputStart ), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } + /* todo find a better way to test lowqual tail clipping on both sides // Reverse Quals sequence readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 - //clip 1 base at end - expected = readClipper.hardClipLowQualEnds((byte)34); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); + testList = new LinkedList(); + testList.add(new testParameter(1,-1,0,3,"3M1H"));//clip 1 base at end + testList.add(new testParameter(11,-1,0,2,"2M2H"));//clip 2 bases at end - //clip 2 bases at end - expected = readClipper.hardClipLowQualEnds((byte)44); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); - - // revert Qual sequence - readClipper.getRead().setBaseQualityString(QUALS); + for ( testParameter p : testList ) { + init(); + readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 + //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); + testIfEqual( readClipper.hardClipLowQualEnds( (byte)p.inputStart ), + BASES.substring(p.substringStart,p.substringStop).getBytes(), + QUALS.substring(p.substringStart,p.substringStop), + p.cigar ); + } + */ } -} + + public class CigarReadMaker { + + } + + @Test ( enabled = false ) + public void testHardClipSoftClippedBases() { + + // Generate a list of cigars to test + // We will use testParameter in the following way + // Right tail, left tail, + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 9c3b905c2..03d33d2c5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,8 +1,8 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; +import org.apache.commons.io.FileUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -762,4 +762,225 @@ public class IntervalUtilsUnitTest extends BaseTest { List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); Assert.assertEquals(merged.size(), 1); } + + /* + Split into tests that can be written to files and tested by writeFlankingIntervals, + and lists that cannot but are still handled by getFlankingIntervals. + */ + private static abstract class FlankingIntervalsTestData extends TestDataProvider { + final public File referenceFile; + final public GenomeLocParser parser; + final int basePairs; + final List original; + final List expected; + + protected FlankingIntervalsTestData(Class clazz, String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(clazz, name); + this.referenceFile = referenceFile; + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.expected = parse(parser, expected); + } + + private static List parse(GenomeLocParser parser, List locs) { + List parsed = new ArrayList(); + for (String loc: locs) + parsed.add("unmapped".equals(loc) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc)); + return parsed; + } + } + + private static class FlankingIntervalsFile extends FlankingIntervalsTestData { + public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + private static class FlankingIntervalsList extends FlankingIntervalsTestData { + public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + /* Intervals where the original and the flanks can be written to files. */ + @DataProvider(name = "flankingIntervalsFiles") + public Object[][] getFlankingIntervalsFiles() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength(); + + new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:1"), + Arrays.asList("1:2")); + + new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1"), + Arrays.asList("1:2-51")); + + new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1-10"), + Arrays.asList("1:11-60")); + + new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + hg19Length1), + Arrays.asList("1:" + (hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:" + hg19Length1), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11))); + + new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:2"), + Arrays.asList("1:1", "1:3")); + + new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:21-30"), + Arrays.asList("1:1-20", "1:31-80")); + + new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 1)), + Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1)); + + new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31), + String.format("1:%d-%d", hg19Length1 - 20, hg19Length1))); + + new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:3"), + Arrays.asList("1:2", "1:4")); + + new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200"), + Arrays.asList("1:51-100", "1:201-250")); + + new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 3)), + Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2))); + + new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201), + String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51))); + + new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:401-500"), + Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550")); + + new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:271-400"), + Arrays.asList("1:51-100", "1:201-270", "1:401-450")); + + new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:221-400"), + Arrays.asList("1:51-100", "1:201-220", "1:401-450")); + + new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:121-400"), + Arrays.asList("1:51-100", "1:401-450")); + + new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "2:301-400"), + Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450")); + + // Explicit testing a problematic agilent target pair + new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("2:74756257-74756411", "2:74756487-74756628"), + // wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679") + Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678")); + + return TestDataProvider.getTests(FlankingIntervalsFile.class); + } + + /* Intervals where either the original and/or the flanks cannot be written to a file. */ + @DataProvider(name = "flankingIntervalsLists") + public Object[][] getFlankingIntervalsLists() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + List empty = Collections.emptyList(); + + new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50, + empty, + empty); + + new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("unmapped"), + empty); + + new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1"), + empty); + + new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1", "2", "3"), + empty); + + new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400", "unmapped"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + return TestDataProvider.getTests(FlankingIntervalsList.class); + } + + @Test(dataProvider = "flankingIntervalsFiles") + public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + + List actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath()); + + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class) + public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + // Should throw a user exception on bad input if either the original + // intervals are empty or if the flanking intervals are empty + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists") + public void testGetFlankingIntervals(FlankingIntervalsTestData data) { + List actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 9243588ab..f3d0dedcd 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -29,10 +29,13 @@ package org.broadinstitute.sting.utils.variantcontext; // the imports for unit testing. +import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; import org.testng.annotations.Test; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import java.util.EnumMap; + /** * Basic unit test for Genotype likelihoods objects @@ -69,6 +72,50 @@ public class GenotypeLikelihoodsUnitTest { gl.getAsVector(); } + @Test + public void testGetAsMap(){ + GenotypeLikelihoods gl = new GenotypeLikelihoods(v); + //Log scale + EnumMap glMap = gl.getAsMap(false); + Assert.assertEquals(v[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF)); + Assert.assertEquals(v[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET)); + Assert.assertEquals(v[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR)); + + //Linear scale + glMap = gl.getAsMap(true); + double [] vl = MathUtils.normalizeFromLog10(v); + Assert.assertEquals(vl[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF)); + Assert.assertEquals(vl[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET)); + Assert.assertEquals(vl[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR)); + + //Test missing likelihoods + gl = new GenotypeLikelihoods("."); + glMap = gl.getAsMap(false); + Assert.assertNull(glMap); + + } + + @Test + public void testGetNegLog10GQ(){ + GenotypeLikelihoods gl = new GenotypeLikelihoods(vPLString); + + //GQ for the best guess genotype + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HET),3.9); + + double[] test = MathUtils.normalizeFromLog10(gl.getAsVector()); + + //GQ for the other genotypes + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_REF), -1.0 * Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1])); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_VAR), -1.0 * Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1])); + + //Test missing likelihoods + gl = new GenotypeLikelihoods("."); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_REF),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HET),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_VAR),Double.NEGATIVE_INFINITY); + + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) { diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala new file mode 100644 index 000000000..d90db0de4 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.extensions.gatk + +import org.broadinstitute.sting.queue.function.InProcessFunction +import org.broadinstitute.sting.commandline.{Output, Argument, Input} +import java.io.File +import org.broadinstitute.sting.utils.interval.IntervalUtils + +class WriteFlankingIntervalsFunction extends InProcessFunction { + @Input(doc="The reference sequence") + var reference : File = _ + + @Input(doc="The interval list to flank") + var inputIntervals : File = _ + + @Output(doc="The output intervals file to write to") + var outputIntervals: File = _ + + @Argument(doc="Number of base pair to flank the input intervals") + var flankSize : Int = _ + + def run() { + IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize) + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala deleted file mode 100755 index 77eb3ccbc..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala +++ /dev/null @@ -1,135 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import org.broadinstitute.sting.commandline._ -import java.io.{PrintStream, File} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.text.XReadLines -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} -import collection.immutable.TreeSet - -// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created -class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction { - @Input(doc="The interval list to expand") val inList : File = in - @Input(doc="The reference sequence") val refDict : File = ref - @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start - @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size - @Output(doc="The output intervals file to write to") val outList : File = out - @Argument(doc="The output format for the intervals") val outTypeStr = opType - @Argument(doc="The input format for the intervals") val inTypeStr = ipType - - var output : PrintStream = _ - var parser : GenomeLocParser = _ - var xrl : XReadLines = _ - val outType = IntervalFormatType.convert(outTypeStr) - val inType = IntervalFormatType.convert(inTypeStr) - - var offsetIn : Int = 0 - var offsetOut : Int = 0 - - var first : Boolean = true - var lastTwo : (GenomeLoc,GenomeLoc) = _ - - var intervalCache : TreeSet[GenomeLoc] = _ - val LINES_TO_CACHE : Int = 1000 - - def run = { - output = new PrintStream(outList) - intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{ - def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) } - }) - parser = new GenomeLocParser(new FastaSequenceFile(ref,true)) - xrl = new XReadLines(inList) - offsetIn = if (isBed(inType)) 1 else 0 - offsetOut = if( isBed(outType)) 1 else 0 - var line : String = xrl.next - while ( line.startsWith("@") ) { - line = xrl.next - } - var prevLoc: GenomeLoc = null - var curLoc: GenomeLoc = null - var nextLoc : GenomeLoc = parseGenomeInterval(line) - var linesProcessed : Int = 1 - while ( prevLoc != null || curLoc != null || nextLoc != null ) { - prevLoc = curLoc - curLoc = nextLoc - nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null - if ( curLoc != null ) { - val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc) - val right: GenomeLoc = refine(expandRight(curLoc),nextLoc) - if ( left != null ) { - intervalCache += left - } - if ( right != null ) { - intervalCache += right - } - } - linesProcessed += 1 - if ( linesProcessed % LINES_TO_CACHE == 0 ) { - val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt)) - intervalCache = intervalCache -- toPrint - toPrint.foreach(u => output.print("%s%n".format(repr(u)))) - } - //System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc))) - } - - intervalCache.foreach(u => output.print("%s%n".format(repr(u)))) - - output.close() - } - - def expandLeft(g: GenomeLoc) : GenomeLoc = { - parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt) - } - - def expandRight(g: GenomeLoc) : GenomeLoc = { - parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt) - } - - def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = { - if ( borderG == null || ! newG.overlapsP(borderG) ) { - return newG - } else { - if ( newG.getStart < borderG.getStart ) { - if ( borderG.getStart - startInt > newG.getStart ) { - return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt) - } - } else { - if ( borderG.getStop + startInt < newG.getStop ){ - return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop) - } - } - } - - null - } - - def repr(loc : GenomeLoc) : String = { - if ( loc == null ) return "null" - if ( outType == IntervalFormatType.INTERVALS ) { - return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop) - } else { - return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut) - } - } - - def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = { - t == IntervalFormatType.BED - } - - def parseGenomeInterval( s : String ) : GenomeLoc = { - val sp = s.split("\\s+") - // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted - if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn) - } - - object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") { - type IntervalFormatType = Value - val INTERVALS,BED,TDF = Value - - def convert(s : String) : IntervalFormatType = { - if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF} - } - } -} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala deleted file mode 100755 index e929477a1..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala +++ /dev/null @@ -1,70 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import collection.JavaConversions._ -import org.broadinstitute.sting.commandline._ -import java.io.{PrintStream, File} -import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary} -import org.broadinstitute.sting.utils.text.XReadLines -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} - -class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction { - @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals - @Output(doc="Output interval file to which to write") val output : File = outFile - @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false - @Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed - - - var outStream : PrintStream = _ - var contigs : List[String] = Nil - var dict : SAMSequenceDictionary = _ - var parser : GenomeLocParser = _ - - def run = { - outStream = new PrintStream(output) - dict = new SAMSequenceDictionary - // note: memory hog - val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex - sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE))) - parser = new GenomeLocParser(dict) - sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h)) - outStream.close() - } - - def writeOut(g : GenomeLoc) : Unit = { - outStream.print("%s%n".format(g.toString)) - } - - def parse(s : String) : (String,Int,Int) = { - if ( s.contains(":") ) { - val split1 = s.split(":") - val split2 = split1(1).split("-") - return (split1(0),split2(0).toInt,split2(1).toInt) - } else { - val split = s.split("\\s+") - return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) ) - } - } - - def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = { - parser.createGenomeLoc(coords._1,coords._2,coords._3) - } - - def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = { - if ( a.last.forall(u => u._1.overlapsP(b._1)) ) { - a.init :+ (a.last :+ b) - } else { - a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b) - } - } - - def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = { - if ( u.map(h => h._2).distinct.sum != range(1,intervals.size).sum ) { // if all sources not accounted for - null - } - u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) ) - } - - def range(a: Int, b: Int) : Range = new Range(a,b+1,1) - -} \ No newline at end of file diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar old mode 100755 new mode 100644 similarity index 88% rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.jar rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar index bfd06f97f..ee5d02367 Binary files a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar and b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar differ diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml similarity index 77% rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.xml rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml index f0568def4..5417641d3 100644 --- a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml +++ b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml @@ -1,3 +1,3 @@ - +