diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index f8e87aa58..2ceb4ab46 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -480,7 +480,7 @@ public class GenomeAnalysisEngine {
}
} else if (walker instanceof ReadPairWalker) {
if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
- throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file.");
+ throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
if(intervals != null && !intervals.isEmpty())
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
index b39fdd79d..a14d999ea 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
@@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.samples;
import org.broadinstitute.sting.utils.exceptions.UserException;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
@@ -110,6 +111,17 @@ public class Sample implements Comparable { // implements java.io.Serial
return infoDB.getSample(paternalID);
}
+ public ArrayList getParents(){
+ ArrayList parents = new ArrayList(2);
+ Sample parent = getMother();
+ if(parent != null)
+ parents.add(parent);
+ parent = getFather();
+ if(parent != null)
+ parents.add(parent);
+ return parents;
+ }
+
/**
* Get gender of the sample
* @return property of key "gender" - must be of type Gender
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
index 85977bf8e..1956dac6c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
@@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
// We refuse to parse SnpEff output files generated by unsupported versions, or
// lacking a SnpEff version number in the VCF header:
- public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" };
+ public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" };
public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd";
@@ -77,13 +77,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
public enum InfoFieldKey {
EFFECT_KEY ("SNPEFF_EFFECT", -1),
IMPACT_KEY ("SNPEFF_IMPACT", 0),
- CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1),
- AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
- GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3),
- GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4),
- TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6),
- EXON_ID_KEY ("SNPEFF_EXON_ID", 7),
- FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", -1);
+ FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", 1),
+ CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 2),
+ AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 3),
+ GENE_NAME_KEY ("SNPEFF_GENE_NAME", 4),
+ GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 5),
+ TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 7),
+ EXON_ID_KEY ("SNPEFF_EXON_ID", 8);
// Actual text of the key
private final String keyName;
@@ -110,70 +110,53 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
// are validated against this list.
public enum EffectType {
// High-impact effects:
- FRAME_SHIFT (EffectFunctionalClass.NONE, false),
- STOP_GAINED (EffectFunctionalClass.NONSENSE, false),
- START_LOST (EffectFunctionalClass.NONE, false),
- SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false),
- SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false),
- EXON_DELETED (EffectFunctionalClass.NONE, false),
- STOP_LOST (EffectFunctionalClass.NONE, false),
+ SPLICE_SITE_ACCEPTOR,
+ SPLICE_SITE_DONOR,
+ START_LOST,
+ EXON_DELETED,
+ FRAME_SHIFT,
+ STOP_GAINED,
+ STOP_LOST,
// Moderate-impact effects:
- NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false),
- CODON_CHANGE (EffectFunctionalClass.NONE, false),
- CODON_INSERTION (EffectFunctionalClass.NONE, false),
- CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false),
- CODON_DELETION (EffectFunctionalClass.NONE, false),
- CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false),
- UTR_5_DELETED (EffectFunctionalClass.NONE, false),
- UTR_3_DELETED (EffectFunctionalClass.NONE, false),
+ NON_SYNONYMOUS_CODING,
+ CODON_CHANGE,
+ CODON_INSERTION,
+ CODON_CHANGE_PLUS_CODON_INSERTION,
+ CODON_DELETION,
+ CODON_CHANGE_PLUS_CODON_DELETION,
+ UTR_5_DELETED,
+ UTR_3_DELETED,
// Low-impact effects:
- SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false),
- SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
- NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
- SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
- NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
- START_GAINED (EffectFunctionalClass.NONE, false),
+ SYNONYMOUS_START,
+ NON_SYNONYMOUS_START,
+ START_GAINED,
+ SYNONYMOUS_CODING,
+ SYNONYMOUS_STOP,
+ NON_SYNONYMOUS_STOP,
// Modifiers:
- NONE (EffectFunctionalClass.NONE, true),
- CHROMOSOME (EffectFunctionalClass.NONE, true),
- INTERGENIC (EffectFunctionalClass.NONE, true),
- UPSTREAM (EffectFunctionalClass.NONE, true),
- UTR_5_PRIME (EffectFunctionalClass.NONE, true),
- CDS (EffectFunctionalClass.NONE, true),
- GENE (EffectFunctionalClass.NONE, true),
- TRANSCRIPT (EffectFunctionalClass.NONE, true),
- EXON (EffectFunctionalClass.NONE, true),
- INTRON (EffectFunctionalClass.NONE, true),
- UTR_3_PRIME (EffectFunctionalClass.NONE, true),
- DOWNSTREAM (EffectFunctionalClass.NONE, true),
- INTRON_CONSERVED (EffectFunctionalClass.NONE, true),
- INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true),
- REGULATION (EffectFunctionalClass.NONE, true),
- CUSTOM (EffectFunctionalClass.NONE, true),
- WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true);
-
- private final EffectFunctionalClass functionalClass;
- private final boolean isModifier;
-
- EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) {
- this.functionalClass = functionalClass;
- this.isModifier = isModifier;
- }
-
- public EffectFunctionalClass getFunctionalClass() {
- return functionalClass;
- }
-
- public boolean isModifier() {
- return isModifier;
- }
+ NONE,
+ CHROMOSOME,
+ CUSTOM,
+ CDS,
+ GENE,
+ TRANSCRIPT,
+ EXON,
+ INTRON_CONSERVED,
+ UTR_5_PRIME,
+ UTR_3_PRIME,
+ DOWNSTREAM,
+ INTRAGENIC,
+ INTERGENIC,
+ INTERGENIC_CONSERVED,
+ UPSTREAM,
+ REGULATION,
+ INTRON
}
- // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of
- // classifying some of the LOW impact effects as MODIFIERs.
+ // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact, or as a MODIFIER.
public enum EffectImpact {
MODIFIER (0),
LOW (1),
@@ -202,7 +185,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
UNKNOWN
}
- // We assign a functional class to each SnpEff effect.
+ // SnpEff assigns a functional class to each effect.
public enum EffectFunctionalClass {
NONE (0),
SILENT (1),
@@ -379,13 +362,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
public List getKeyNames() {
return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
InfoFieldKey.IMPACT_KEY.getKeyName(),
+ InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(),
InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
InfoFieldKey.GENE_NAME_KEY.getKeyName(),
InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
- InfoFieldKey.EXON_ID_KEY.getKeyName(),
- InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName()
+ InfoFieldKey.EXON_ID_KEY.getKeyName()
);
}
@@ -393,13 +376,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
return Arrays.asList(
new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
+ new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())),
new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant (in HGVS style)"),
new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values()))
+ new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
);
}
@@ -409,6 +392,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
protected static class SnpEffEffect {
private EffectType effect;
private EffectImpact impact;
+ private EffectFunctionalClass functionalClass;
private String codonChange;
private String aminoAcidChange;
private String geneName;
@@ -420,16 +404,21 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
private String parseError = null;
private boolean isWellFormed = true;
- private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8;
- private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9;
- private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10;
+ private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 9;
+ private static final int NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR = 10;
+ private static final int NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR = 11;
- // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header,
- // errors come after warnings, not vice versa:
- private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1;
- private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1;
+ // If there is either a warning OR an error, it will be in the last field. If there is both
+ // a warning AND an error, the warning will be in the second-to-last field, and the error will
+ // be in the last field.
+ private static final int SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR - 1;
+ private static final int SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 2;
+ private static final int SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 1;
- private static final int SNPEFF_CODING_FIELD_INDEX = 5;
+ // Position of the field indicating whether the effect is coding or non-coding. This field is used
+ // in selecting the most significant effect, but is not included in the annotations we return
+ // since it can be deduced from the SNPEFF_GENE_BIOTYPE field.
+ private static final int SNPEFF_CODING_FIELD_INDEX = 6;
public SnpEffEffect ( String effectName, String[] effectMetadata ) {
parseEffectName(effectName);
@@ -447,11 +436,14 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
private void parseEffectMetadata ( String[] effectMetadata ) {
if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) {
- if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) {
- parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX]));
+ if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR ) {
+ parseError(String.format("SnpEff issued the following warning or error: \"%s\"",
+ effectMetadata[SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR]));
}
- else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) {
- parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX]));
+ else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR ) {
+ parseError(String.format("SnpEff issued the following warning: \"%s\", and the following error: \"%s\"",
+ effectMetadata[SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR],
+ effectMetadata[SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR]));
}
else {
parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d",
@@ -461,23 +453,33 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
return;
}
- if ( effect != null && effect.isModifier() ) {
- impact = EffectImpact.MODIFIER;
+ // The impact field will never be empty, and should always contain one of the enumerated values:
+ try {
+ impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
}
- else {
+ catch ( IllegalArgumentException e ) {
+ parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
+ }
+
+ // The functional class field will be empty when the effect has no functional class associated with it:
+ if ( effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()].trim().length() > 0 ) {
try {
- impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
+ functionalClass = EffectFunctionalClass.valueOf(effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]);
}
catch ( IllegalArgumentException e ) {
- parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
+ parseError(String.format("Unrecognized value for effect functional class: %s", effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]));
}
}
+ else {
+ functionalClass = EffectFunctionalClass.NONE;
+ }
codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];
+ // The coding field will be empty when SnpEff has no coding info for the effect:
if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
try {
coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]);
@@ -534,7 +536,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
return true;
}
else if ( impact.isSameImpactAs(other.impact) ) {
- return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass());
+ return functionalClass.isHigherPriorityThan(other.functionalClass);
}
return false;
@@ -545,13 +547,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
+ addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), functionalClass.toString());
addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
- addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString());
return annotations;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
index 489e963e8..74c55dbfe 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
@@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
-import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Map;
@@ -83,8 +81,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
* @param priors priors to use for GLs
* @param GLs hash of sample->GL to fill in
* @param alternateAlleleToUse the alternate allele to use, null if not set
- *
- * @param useBAQedPileup
+ * @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @return genotype likelihoods per sample for AA, AB, BB
*/
public abstract Allele getLikelihoods(RefMetaDataTracker tracker,
@@ -93,13 +90,14 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Map GLs,
- Allele alternateAlleleToUse, boolean useBAQedPileup);
+ Allele alternateAlleleToUse,
+ boolean useBAQedPileup);
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for ( PileupElement p : pileup ) {
if ( BaseUtils.isRegularBase( p.getBase() ) )
- count++;
+ count += p.getRepresentativeCount();
}
return count;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 3eedc2a28..847165e3e 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -7,38 +7,83 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
-import org.broadinstitute.sting.utils.text.XReadLines;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
-import java.io.File;
-import java.io.FileNotFoundException;
+import java.io.PrintStream;
import java.util.*;
/**
- * Phases a trio VCF (child phased by transmission, implied phase carried over to parents). Given genotypes for a trio,
- * this walker modifies the genotypes (if necessary) to reflect the most likely configuration given the genotype
- * likelihoods and inheritance constraints, phases child by transmission and carries over implied phase to the parents
- * (their alleles in their genotypes are ordered as transmitted|untransmitted). Computes probability that the
- * determined phase is correct given that the genotype configuration is correct (useful if you want to use this to
- * compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally
- * filters out sites where the phasing is indeterminate (site has no-calls), ambiguous (everyone is heterozygous), or
- * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to
- * begin.
+ * Computes the most likely genotype combination and phases trios and parent/child pairs
+ *
+ *
+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases
+ * all sites where parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability.
+ * Ambiguous sites are:
+ *
+ * - Sites where all individuals are heterozygous
+ * - Sites where there is a Mendelian violation
+ *
+ * Missing genotypes are handled as follows:
+ *
+ * - In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
+ * - In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
+ * - In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous, and a phasing probability is emitted.
+ * - In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
+ *
+ *
+ * Input
+ *
+ *
+ * - A VCF variant set containing trio(s) and/or parent/child pair(s).
+ * - A PED pedigree file containing the description of the individuals' relationships.
+ *
+ *
+ *
+ * Options
+ *
+ *
+ * - MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in Mendelian violation after being assigned the most likely genotype
+ * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability,
+ * and each individual genotype, depth, allelic depth and likelihoods.
+ * - DeNovoPrior: Mutation prior; default is 1e-8
+ *
+ *
+ *
+ * Output
+ *
+ * A VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where unambiguous.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T PhaseByTransmission \
+ * -V input.vcf \
+ * -ped input.ped \
+ * -o output.vcf
+ *
+ *
*/
-public class PhaseByTransmission extends RodWalker {
+public class PhaseByTransmission extends RodWalker, HashMap> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
- @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). Specify several trios by supplying this argument many times and/or a file containing many patterns.")
- public ArrayList familySpecs = null;
+ @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.")
+ private PrintStream mvFile = null;
+
+ @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8")
+ private double deNovoPrior=1e-8;
@Output
protected VCFWriter vcfWriter = null;
@@ -46,241 +91,633 @@ public class PhaseByTransmission extends RodWalker {
private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP";
private final String SOURCE_NAME = "PhaseByTransmission";
- private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8;
+ public final double NO_TRANSMISSION_PROB = -1.0;
- private class Trio {
- private String mother;
- private String father;
- private String child;
+ private ArrayList trios = new ArrayList();
- public Trio(String mother, String father, String child) {
- this.mother = mother;
- this.father = father;
- this.child = child;
- }
+ //Matrix of priors for all genotype combinations
+ private EnumMap>> mvCountMatrix;
- public Trio(String familySpec) {
- String[] pieces = familySpec.split("[\\+\\=]");
+ //Matrix of allele transmission
+ private EnumMap>> transmissionMatrix;
- this.mother = pieces[0];
- this.father = pieces[1];
- this.child = pieces[2];
- }
+ //Metrics counters hash keys
+ private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
+ private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1;
+ private final Byte NUM_TRIO_GENOTYPES_PHASED = 2;
+ private final Byte NUM_TRIO_HET_HET_HET = 3;
+ private final Byte NUM_TRIO_VIOLATIONS = 4;
+ private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10;
+ private final Byte NUM_PAIR_GENOTYPES_CALLED = 5;
+ private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6;
+ private final Byte NUM_PAIR_GENOTYPES_PHASED = 7;
+ private final Byte NUM_PAIR_HET_HET = 8;
+ private final Byte NUM_PAIR_VIOLATIONS = 9;
+ private final Byte NUM_GENOTYPES_MODIFIED = 11;
- public String getMother() { return mother; }
- public String getFather() { return father; }
- public String getChild() { return child; }
+ //Random number generator
+ private Random rand = new Random();
+
+ private enum FamilyMember {
+ MOTHER,
+ FATHER,
+ CHILD
}
- private ArrayList trios = new ArrayList();
+ //Stores a conceptual trio or parent/child pair genotype combination along with its phasing.
+ //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method.
+ private class TrioPhase {
- public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) {
- if (familySpecs != null) {
- // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
- // spec list set, and treat the entries as if they had been specified on the command line.
- ArrayList specs = new ArrayList();
- for (String familySpec : familySpecs) {
- File specFile = new File(familySpec);
+ //Create 2 fake alleles
+ //The actual bases will never be used but the Genotypes created using the alleles will be.
+ private final Allele REF = Allele.create("A",true);
+ private final Allele VAR = Allele.create("A",false);
+ private final Allele NO_CALL = Allele.create(".",false);
+ private final String DUMMY_NAME = "DummySample";
- try {
- XReadLines reader = new XReadLines(specFile);
+ private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class);
- List lines = reader.readLines();
- for (String line : lines) {
- specs.add(new Trio(line));
- }
- } catch (FileNotFoundException e) {
- specs.add(new Trio(familySpec)); // not a file, so must be a family spec
+ private ArrayList getAlleles(Genotype.Type genotype){
+ ArrayList alleles = new ArrayList(2);
+ if(genotype == Genotype.Type.HOM_REF){
+ alleles.add(REF);
+ alleles.add(REF);
+ }
+ else if(genotype == Genotype.Type.HET){
+ alleles.add(REF);
+ alleles.add(VAR);
+ }
+ else if(genotype == Genotype.Type.HOM_VAR){
+ alleles.add(VAR);
+ alleles.add(VAR);
+ }
+ else{
+ return null;
+ }
+ return alleles;
+ }
+
+ private boolean isPhasable(Genotype.Type genotype){
+ return genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HET || genotype == Genotype.Type.HOM_VAR;
+ }
+
+ //Create a new Genotype based on information from a single individual
+ //Homozygous genotypes will be set as phased, heterozygous won't be
+ private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
+ if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ }
+ else
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ }
+
+ //Find the phase for a parent/child pair
+ private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
+
+ //Special case for Het/Het as it is ambiguous
+ if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ return;
+ }
+
+ ArrayList parentAlleles = getAlleles(parentGenotype);
+ ArrayList childAlleles = getAlleles(childGenotype);
+ ArrayList parentPhasedAlleles = new ArrayList(2);
+ ArrayList childPhasedAlleles = new ArrayList(2);
+
+ //If there is a possible phasing between the parent and child => phase
+ int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
+ if(childTransmittedAlleleIndex > -1){
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ }
+ else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
+ parentPhasedAlleles.add(parentAlleles.get(1));
+ parentPhasedAlleles.add(parentAlleles.get(0));
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ }
+ //This is a Mendelian Violation => Do not phase
+ else{
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ }
+ }
+
+ //Phases a complete trio by transmission and stores the three results in trioPhasedGenotypes.
+ //The child's phased genotype lists the maternal allele first and the paternal allele second;
+ //each parent's phased genotype lists the allele matching the child's first.
+ //If no mother/father allele pair reproduces the child's genotype, all three are stored unphased.
+ private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+ Set> possiblePhasedChildGenotypes = new HashSet>();
+ ArrayList motherAlleles = getAlleles(mother);
+ ArrayList fatherAlleles = getAlleles(father);
+ ArrayList childAlleles = getAlleles(child);
+
+ //Build all possible child genotypes for the given parent's genotypes
+ //Each candidate is ordered (maternal allele, paternal allele); the set removes duplicate candidates
+ for (Allele momAllele : motherAlleles) {
+ for (Allele fatherAllele : fatherAlleles) {
+ ArrayList possiblePhasedChildAlleles = new ArrayList(2);
+ possiblePhasedChildAlleles.add(momAllele);
+ possiblePhasedChildAlleles.add(fatherAllele);
+ possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles);
 }
 }
- return specs;
+ for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) {
+ //A candidate matches when both of the child's alleles are found in it at two different positions
+ int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0));
+ int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1));
+ //If a possible combination has been found, create the genotypes
+ if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) {
+ //Create mother's genotype
+ //NOTE(review): alleles are compared by reference (!=) here; presumably getAlleles() hands out shared Allele instances - confirm
+ ArrayList motherPhasedAlleles = new ArrayList(2);
+ motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0));
+ if(motherAlleles.get(0) != motherPhasedAlleles.get(0))
+ motherPhasedAlleles.add(motherAlleles.get(0));
+ else
+ motherPhasedAlleles.add(motherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Create father's genotype
+ ArrayList fatherPhasedAlleles = new ArrayList(2);
+ fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1));
+ if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0))
+ fatherPhasedAlleles.add(fatherAlleles.get(0));
+ else
+ fatherPhasedAlleles.add(fatherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Create child's genotype
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Once a phased combination is found; exit
+ return;
+ }
+ }
+
+ //If this is reached then no phasing could be found
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
 }
- return new ArrayList();
+ /* Constructor: builds the phased genotype templates for one mother/father/child genotype combination.
+    Family members whose genotype is not phasable (NO_CALL or UNAVAILABLE) are handled on their own,
+    and the remaining members are phased as a pair or as single individuals.
+  */
+ public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+     //The parents are only inspected when the child itself can be phased
+     final boolean childPhasable = isPhasable(child);
+     final boolean motherPhasable = childPhasable && isPhasable(mother);
+     final boolean fatherPhasable = childPhasable && isPhasable(father);
+     final boolean allHet = mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET;
+
+     //No usable parent/child relationship, or the ambiguous Het/Het/Het case: everyone is treated individually
+     if(!childPhasable || (!motherPhasable && !fatherPhasable) || (motherPhasable && fatherPhasable && allHet)){
+         phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+         phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+         phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
+         return;
+     }
+
+     //Only the father is informative: father/child pair
+     if(!motherPhasable){
+         phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+         phasePairAlleles(father, child, FamilyMember.FATHER);
+         return;
+     }
+
+     //Only the mother is informative: mother/child pair
+     if(!fatherPhasable){
+         phasePairAlleles(mother, child, FamilyMember.MOTHER);
+         phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+         return;
+     }
+
+     //All family members have genotypes and at least one of them is not Het
+     phaseFamilyAlleles(mother, father, child);
+ }
+
+ /**
+  * Applies this trio genotype combination to the genotypes of an actual trio.
+  * @param ref: Reference allele
+  * @param alt: Alternate allele
+  * @param motherGenotype: Mother's genotype, phased against this combination's mother template
+  * @param fatherGenotype: Father's genotype, phased against this combination's father template
+  * @param childGenotype: Child's genotype, phased against this combination's child template
+  * @param transmissionProb: Probability that this trio genotype combination is correct (pass NO_TRANSMISSION_PROB if unavailable)
+  * @param phasedGenotypes: Output list; the resulting genotypes are appended in the order Mother, Father, Child
+  */
+ public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
+     final Genotype motherTemplate = this.trioPhasedGenotypes.get(FamilyMember.MOTHER);
+     final Genotype fatherTemplate = this.trioPhasedGenotypes.get(FamilyMember.FATHER);
+     final Genotype childTemplate = this.trioPhasedGenotypes.get(FamilyMember.CHILD);
+
+     phasedGenotypes.add(getPhasedGenotype(ref, alt, motherGenotype, transmissionProb, motherTemplate));
+     phasedGenotypes.add(getPhasedGenotype(ref, alt, fatherGenotype, transmissionProb, fatherTemplate));
+     phasedGenotypes.add(getPhasedGenotype(ref, alt, childGenotype, transmissionProb, childTemplate));
+ }
+
+ //Builds the phased version of one sample's genotype from the matching template genotype of this combination.
+ //The original genotype is returned untouched when it is null or not phasable, or when the Phred-scaled
+ //transmission score is 0.
+ private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
+
+     //-1 marks "no transmission probability available"
+     final int phredScoreTransmission = transmissionProb != NO_TRANSMISSION_PROB
+             ? MathUtils.probabilityToPhredScale(1-(transmissionProb))
+             : -1;
+
+     //Null, missing and unavailable genotypes can only come from the input itself, so handing back the
+     //original genotype is safe; a phasing confidence of 0 also keeps the original, unphased genotype.
+     if(phredScoreTransmission == 0 || genotype == null || !isPhasable(genotype.getType()))
+         return genotype;
+
+     //Copy the attributes and add the transmission probability when there is one
+     Map genotypeAttributes = new HashMap(genotype.getAttributes());
+     if(transmissionProb > NO_TRANSMISSION_PROB)
+         genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
+
+     //Translate the template's ref/non-ref alleles into the site's actual alleles, keeping their order
+     ArrayList<Allele> phasedAlleles = new ArrayList<Allele>(2);
+     for(Allele templateAllele : phasedGenotype.getAlleles()){
+         if(templateAllele.isReference()){
+             phasedAlleles.add(refAllele);
+             continue;
+         }
+         if(templateAllele.isNonReference()){
+             phasedAlleles.add(altAllele);
+             continue;
+         }
+         //At this point there should not be any other alleles left
+         throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",templateAllele.toString()));
+     }
+
+     //Keep the original error if the genotype type is unchanged, otherwise take it from the likelihoods
+     final boolean sameType = genotype.getType() == phasedGenotype.getType();
+     final double negLog10Error = sameType
+             ? genotype.getNegLog10PError()
+             : genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType());
+
+     return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased());
+ }
+
+
}
 /**
- * Parse the familial relationship specification, and initialize VCF writer
+ * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer
+ * Aborts with a UserException.BadInput when no trio or parent/child pair could be selected.
 */
 public void initialize() {
- trios = getFamilySpecsFromCommandLineInput(familySpecs);
-
 ArrayList rodNames = new ArrayList();
 rodNames.add(variantCollection.variants.getName());
-
 Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
 Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
+ //Get the trios from the families passed as ped
+ setTrios();
+ if(trios.size()<1)
+ throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted.");
+
+
 Set headerLines = new HashSet();
 headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit()));
- headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct"));
+ //The transmission probability is declared as a single Integer FORMAT field (a Phred score)
+ headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct"));
 headerLines.add(new VCFHeaderLine("source", SOURCE_NAME));
 vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples));
+
+ //Precompute the Mendelian violation counts and the TrioPhase objects for every genotype combination
+ buildMatrices();
+
+ //Write the tab-separated column header to mvFile when that output was provided
+ if(mvFile != null)
+ mvFile.println("#CHROM\tPOS\tFILTER\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_RAD\tMOTHER_AAD\tMOTHER_HRPL\tMOTHER_HETPL\tMOTHER_HAPL\tFATHER_GT\tFATHER_DP\tFATHER_RAD\tFATHER_AAD\tFATHER_HRPL\tFATHER_HETPL\tFATHER_HAPL\tCHILD_GT\tCHILD_DP\tCHILD_RAD\tCHILD_AAD\tCHILD_HRPL\tCHILD_HETPL\tCHILD_HAPL");
+
 }
- private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) {
- double[] momLikelihoods = MathUtils.normalizeFromLog10(mom.getLikelihoods().getAsVector());
- double[] dadLikelihoods = MathUtils.normalizeFromLog10(dad.getLikelihoods().getAsVector());
- double[] childLikelihoods = MathUtils.normalizeFromLog10(child.getLikelihoods().getAsVector());
+ /**
+ * Select trios and parent/child pairs only
+ * Fills this.trios with one sample per accepted family: the first member found that has at least
+ * one parent, provided all of its parents belong to the same family. Families with fewer than 2
+ * or more than 3 members are skipped with a log message.
+ */
+ private void setTrios(){
+
+ Map> families = this.getSampleDB().getFamilies();
+ Set family;
+ ArrayList parents;
+ for(String familyID : families.keySet()){
+ family = families.get(familyID);
+ if(family.size()<2 || family.size()>3){
+ logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID,family.size()));
+ }
+ else{
+ for(Sample familyMember : family){
+ parents = familyMember.getParents();
+ if(parents.size()>0){
+ if(family.containsAll(parents))
+ this.trios.add(familyMember);
+ else
+ logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID));
+ //Only the first member with parents is considered for each family
+ break;
+ }
+ }
+ }
+
+ }
+
- int momIndex = mom.getType().ordinal() - 1;
- int dadIndex = dad.getType().ordinal() - 1;
- int childIndex = child.getType().ordinal() - 1;
- return momLikelihoods[momIndex]*dadLikelihoods[dadIndex]*childLikelihoods[childIndex];
 }
- private ArrayList createAllThreeGenotypes(Allele refAllele, Allele altAllele, Genotype g) {
- List homRefAlleles = new ArrayList();
- homRefAlleles.add(refAllele);
- homRefAlleles.add(refAllele);
- Genotype homRef = new Genotype(g.getSampleName(), homRefAlleles, g.getNegLog10PError(), null, g.getAttributes(), false);
-
- List hetAlleles = new ArrayList();
- hetAlleles.add(refAllele);
- hetAlleles.add(altAllele);
- Genotype het = new Genotype(g.getSampleName(), hetAlleles, g.getNegLog10PError(), null, g.getAttributes(), false);
-
- List homVarAlleles = new ArrayList();
- homVarAlleles.add(altAllele);
- homVarAlleles.add(altAllele);
- Genotype homVar = new Genotype(g.getSampleName(), homVarAlleles, g.getNegLog10PError(), null, g.getAttributes(), false);
-
- ArrayList genotypes = new ArrayList();
- genotypes.add(homRef);
- genotypes.add(het);
- genotypes.add(homVar);
-
- return genotypes;
+ //Create the transmission matrices
+ //Both matrices are nested EnumMaps indexed by mother, then father, then child genotype type, and are
+ //filled for every combination of Genotype.Type values:
+ // - mvCountMatrix holds the result of getCombinationMVCount for the combination
+ // - transmissionMatrix holds the TrioPhase object built for the combination
+ private void buildMatrices(){
+ mvCountMatrix = new EnumMap>>(Genotype.Type.class);
+ transmissionMatrix = new EnumMap>>(Genotype.Type.class);
+ for(Genotype.Type mother : Genotype.Type.values()){
+ mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class));
+ transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class));
+ for(Genotype.Type father : Genotype.Type.values()){
+ mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class));
+ transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class));
+ for(Genotype.Type child : Genotype.Type.values()){
+ mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child));
+ transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child));
+ }
+ }
+ }
 }
- private int getNumberOfMatchingAlleles(Allele alleleToMatch, Genotype g) {
- List alleles = g.getAlleles();
- int matchingAlleles = 0;
+ //Returns the number of Mendelian Violations for a given genotype combination.
+ //If one of the parents genotype is missing, it will consider it as a parent/child pair
+ //If the child genotype or both parents genotypes are missing, 0 is returned.
+ //The result is 0, 1 or 2: a homozygous child can violate once per parent, any other case at most once.
+ private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
- for (Allele a : alleles) {
- if (!alleleToMatch.equals(a)) {
- matchingAlleles++;
+ //Child is no call => No MV
+ if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE)
+ return 0;
+ //Add parents with genotypes for the evaluation
+ ArrayList parents = new ArrayList();
+ if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE))
+ parents.add(mother);
+ if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE))
+ parents.add(father);
+
+ //Both parents no calls => No MV
+ if (parents.isEmpty())
+ return 0;
+
+ //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed
+ //Each parent adds at most 1 to each counter, so these are really the numbers of parents able to
+ //transmit a ref (resp. alt) allele
+ int parentsNumRefAlleles = 0;
+ int parentsNumAltAlleles = 0;
+
+ for(Genotype.Type parent : parents){
+ if(parent == Genotype.Type.HOM_REF){
+ parentsNumRefAlleles++;
+ }
+ else if(parent == Genotype.Type.HET){
+ parentsNumRefAlleles++;
+ parentsNumAltAlleles++;
+ }
+ else if(parent == Genotype.Type.HOM_VAR){
+ parentsNumAltAlleles++;
 }
 }
- return matchingAlleles;
- }
-
- private boolean isMendelianViolation(Allele refAllele, Allele altAllele, Genotype mom, Genotype dad, Genotype child) {
- int numMomRefAlleles = getNumberOfMatchingAlleles(refAllele, mom) > 0 ? 1 : 0;
- int numMomAltAlleles = getNumberOfMatchingAlleles(altAllele, mom) > 0 ? 1 : 0;
-
- int numDadRefAlleles = getNumberOfMatchingAlleles(refAllele, dad) > 0 ? 1 : 0;
- int numDadAltAlleles = getNumberOfMatchingAlleles(altAllele, dad) > 0 ? 1 : 0;
-
- int numChildRefAlleles = getNumberOfMatchingAlleles(refAllele, child);
- int numChildAltAlleles = getNumberOfMatchingAlleles(altAllele, child);
-
- return (numMomRefAlleles + numDadRefAlleles < numChildRefAlleles || numMomAltAlleles + numDadAltAlleles < numChildAltAlleles);
- }
-
- private ArrayList getPhasedGenotypes(Genotype mom, Genotype dad, Genotype child) {
- Set possiblePhasedChildGenotypes = new HashSet();
-
- for (Allele momAllele : mom.getAlleles()) {
- for (Allele dadAllele : dad.getAlleles()) {
- ArrayList possiblePhasedChildAlleles = new ArrayList();
- possiblePhasedChildAlleles.add(momAllele);
- possiblePhasedChildAlleles.add(dadAllele);
-
- Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getNegLog10PError(), child.getFilters(), child.getAttributes(), true);
-
- possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype);
- }
+ //Case Child is HomRef
+ //One violation for every called parent that cannot transmit a ref allele
+ if(child == Genotype.Type.HOM_REF){
+ if(parentsNumRefAlleles == parents.size())
+ return 0;
+ else return (parents.size()-parentsNumRefAlleles);
 }
- ArrayList finalGenotypes = new ArrayList();
-
- for (Genotype phasedChildGenotype : possiblePhasedChildGenotypes) {
- if (child.sameGenotype(phasedChildGenotype, true)) {
- Allele momTransmittedAllele = phasedChildGenotype.getAllele(0);
- Allele momUntransmittedAllele = mom.getAllele(0) != momTransmittedAllele ? mom.getAllele(0) : mom.getAllele(1);
-
- ArrayList phasedMomAlleles = new ArrayList();
- phasedMomAlleles.add(momTransmittedAllele);
- phasedMomAlleles.add(momUntransmittedAllele);
-
- Genotype phasedMomGenotype = new Genotype(mom.getSampleName(), phasedMomAlleles, mom.getNegLog10PError(), mom.getFilters(), mom.getAttributes(), true);
-
- Allele dadTransmittedAllele = phasedChildGenotype.getAllele(1);
- Allele dadUntransmittedAllele = dad.getAllele(0) != dadTransmittedAllele ? dad.getAllele(0) : dad.getAllele(1);
-
- ArrayList phasedDadAlleles = new ArrayList();
- phasedDadAlleles.add(dadTransmittedAllele);
- phasedDadAlleles.add(dadUntransmittedAllele);
-
- Genotype phasedDadGenotype = new Genotype(dad.getSampleName(), phasedDadAlleles, dad.getNegLog10PError(), dad.getFilters(), dad.getAttributes(), true);
-
- finalGenotypes.add(phasedMomGenotype);
- finalGenotypes.add(phasedDadGenotype);
- finalGenotypes.add(phasedChildGenotype);
-
- return finalGenotypes;
- }
+ //Case child is HomVar
+ //One violation for every called parent that cannot transmit an alt allele
+ if(child == Genotype.Type.HOM_VAR){
+ if(parentsNumAltAlleles == parents.size())
+ return 0;
+ else return parents.size()-parentsNumAltAlleles;
 }
- finalGenotypes.add(mom);
- finalGenotypes.add(dad);
- finalGenotypes.add(child);
+ //Case child is Het
+ //With a single called parent a Het child is never a violation; with two, ref and alt must both be available
+ if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
+ return 0;
- return finalGenotypes;
+ //MV
+ //Reached for a Het child whose two parents cannot provide both alleles, and for any other child type
+ return 1;
 }
- private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) {
- ArrayList finalGenotypes = new ArrayList();
- finalGenotypes.add(mother);
- finalGenotypes.add(father);
- finalGenotypes.add(child);
+ //Counts how many of the three family members (mother, father, child) have a different genotype type
+ //in the new combination than in the original one.
+ private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){
+     final int motherChanged = motherOriginal == motherNew ? 0 : 1;
+     final int fatherChanged = fatherOriginal == fatherNew ? 0 : 1;
+     final int childChanged = childOriginal == childNew ? 0 : 1;
+     return motherChanged + fatherChanged + childChanged;
+ }
- if (mother.isCalled() && father.isCalled() && child.isCalled()) {
- ArrayList possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother);
- ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father);
- ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child);
+ //Returns the genotype likelihoods as a map keyed by genotype type.
+ //A null or not-called genotype yields a flat map: 1/3 each for HOM_REF, HET and HOM_VAR.
+ private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){
+     if(genotype != null && genotype.isCalled())
+         return genotype.getLikelihoods().getAsMap(true);
+
+     final Genotype.Type[] calledTypes = {Genotype.Type.HOM_REF, Genotype.Type.HET, Genotype.Type.HOM_VAR};
+     EnumMap<Genotype.Type,Double> flatLikelihoods = new EnumMap<Genotype.Type,Double>(Genotype.Type.class);
+     for(Genotype.Type type : calledTypes)
+         flatLikelihoods.put(type, 1.0/3.0);
+     return flatLikelihoods;
+ }
- double bestConfigurationLikelihood = 0.0;
- double bestPrior = 0.0;
- Genotype bestMotherGenotype = mother;
- Genotype bestFatherGenotype = father;
- Genotype bestChildGenotype = child;
+ //Null-safe accessor for the genotype type: a null genotype is reported as UNAVAILABLE
+ private Genotype.Type getTypeSafeNull(Genotype genotype){
+     return genotype == null ? Genotype.Type.UNAVAILABLE : genotype.getType();
+ }
- double norm = 0.0;
- for (Genotype motherGenotype : possibleMotherGenotypes) {
- for (Genotype fatherGenotype : possibleFatherGenotypes) {
- for (Genotype childGenotype : possibleChildGenotypes) {
- double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR;
- double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype);
- norm += prior*configurationLikelihood;
+ /**
+ * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair.
+ * When the child and at least one parent are called, every genotype combination is scored from the genotype
+ * likelihoods, weighted by deNovoPrior according to its Mendelian violation count, and the best-scoring
+ * combination is used for phasing; ties are broken at random. Otherwise the original genotypes are used and
+ * NO_TRANSMISSION_PROB is passed on as transmission probability.
+ * @param ref: Reference allele
+ * @param alt: Alternative allele
+ * @param mother: Mother's genotype
+ * @param father: Father's genotype
+ * @param child: Child's genotype (dereferenced without a null check, so it must not be null)
+ * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child
+ * @return The Mendelian violation count recorded for the selected genotype combination (0 when no combination was evaluated)
+ */
+ private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
- if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) {
- bestConfigurationLikelihood = configurationLikelihood;
- bestPrior = prior;
- bestMotherGenotype = motherGenotype;
- bestFatherGenotype = fatherGenotype;
- bestChildGenotype = childGenotype;
+ //Check whether it is a pair or trio
+ //Always assign the first parent as the parent having genotype information in pairs
+ //Always assign the mother as the first parent in trios
+ int parentsCalled = 0;
+ Map firstParentLikelihoods;
+ Map secondParentLikelihoods;
+ ArrayList bestFirstParentGenotype = new ArrayList();
+ ArrayList bestSecondParentGenotype = new ArrayList();
+ ArrayList bestChildGenotype = new ArrayList();
+ Genotype.Type pairSecondParentGenotype = null;
+ if(mother == null || !mother.isCalled()){
+ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ bestFirstParentGenotype.add(getTypeSafeNull(father));
+ bestSecondParentGenotype.add(getTypeSafeNull(mother));
+ pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType();
+ if(father != null && father.isCalled())
+ parentsCalled = 1;
+ }
+ else{
+ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ bestFirstParentGenotype.add(getTypeSafeNull(mother));
+ bestSecondParentGenotype.add(getTypeSafeNull(father));
+ if(father == null || !father.isCalled()){
+ parentsCalled = 1;
+ pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType();
+ }else{
+ parentsCalled = 2;
+ }
+ }
+ Map childLikelihoods = getLikelihoodsAsMapSafeNull(child);
+ bestChildGenotype.add(getTypeSafeNull(child));
+
+ //Prior vars
+ double bestConfigurationLikelihood = 0.0;
+ double norm = 0.0;
+ int configuration_index =0;
+ ArrayList bestMVCount = new ArrayList();
+ bestMVCount.add(0);
+
+ //Get the most likely combination
+ //Only check for most likely combination if at least a parent and the child have genotypes
+ if(child.isCalled() && parentsCalled > 0){
+ int mvCount;
+ int cumulativeMVCount = 0;
+ double configurationLikelihood = 0;
+ for(Map.Entry childGenotype : childLikelihoods.entrySet()){
+ for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){
+ for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){
+ mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey());
+ //For parent/child pairs, sum over the possible genotype configurations of the missing parent
+ if(parentsCalled<2){
+ cumulativeMVCount += mvCount;
+ configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue();
 }
+ //Evaluate configurations of trios
+ else{
+ configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue();
+ norm += configurationLikelihood;
+ //Keep this combination if it has a better likelihood;
+ //combinations with exactly the same likelihood are all kept as tied candidates
+ if (configurationLikelihood > bestConfigurationLikelihood){
+ bestConfigurationLikelihood = configurationLikelihood;
+ bestMVCount.clear();
+ bestMVCount.add(mvCount);
+ bestFirstParentGenotype.clear();
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.clear();
+ bestSecondParentGenotype.add(secondParentGenotype.getKey());
+ bestChildGenotype.clear();
+ bestChildGenotype.add(childGenotype.getKey());
+ }
+ else if(configurationLikelihood == bestConfigurationLikelihood) {
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(secondParentGenotype.getKey());
+ bestChildGenotype.add(childGenotype.getKey());
+ bestMVCount.add(mvCount);
+ }
+ }
+ }
+ //Evaluate configurations of parent/child pairs
+ if(parentsCalled<2){
+ norm += configurationLikelihood;
+ //Keep this combination if it has a better likelihood;
+ //combinations with exactly the same likelihood are all kept as tied candidates
+ //The MV count of a pair is averaged over the three possible genotypes of the missing parent
+ if (configurationLikelihood > bestConfigurationLikelihood){
+ bestConfigurationLikelihood = configurationLikelihood;
+ bestMVCount.clear();
+ bestMVCount.add(cumulativeMVCount/3);
+ bestChildGenotype.clear();
+ bestFirstParentGenotype.clear();
+ bestSecondParentGenotype.clear();
+ bestChildGenotype.add(childGenotype.getKey());
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(pairSecondParentGenotype);
+ }
+ else if(configurationLikelihood == bestConfigurationLikelihood) {
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(pairSecondParentGenotype);
+ bestChildGenotype.add(childGenotype.getKey());
+ bestMVCount.add(cumulativeMVCount/3);
+ }
+ //Reset both per-pair accumulators before the next parent/child combination;
+ //otherwise the MV count of earlier combinations leaks into later ones
+ configurationLikelihood = 0;
+ cumulativeMVCount = 0;
 }
 }
 }
- if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) {
- Map attributes = new HashMap();
- attributes.putAll(bestChildGenotype.getAttributes());
- attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm);
- bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes);
+ //normalize the best configuration probability
+ bestConfigurationLikelihood = bestConfigurationLikelihood / norm;
- finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype);
+ //In case of multiple equally likely combinations, take a random one
+ //nextInt(n) draws from 0 (inclusive) to n (exclusive), so the bound must be the full list size
+ //for the last tied combination to be selectable
+ if(bestFirstParentGenotype.size()>1){
+ configuration_index = rand.nextInt(bestFirstParentGenotype.size());
 }
+
+ }
+ else{
+ bestConfigurationLikelihood = NO_TRANSMISSION_PROB;
 }
- return finalGenotypes;
+ //The transmission matrix is indexed mother, father, child: when the mother is the missing or
+ //uncalled parent she was stored as the second parent above, so the two indices are swapped back
+ TrioPhase phasedTrioGenotypes;
+ if((parentsCalled < 2 && mother == null) || !mother.isCalled())
+ phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
+ else
+ phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
+
+ //Return the phased genotypes
+ phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
+ return bestMVCount.get(configuration_index);
+
+ }
+
+
+ //Updates the parent/child pair counters for one phased pair.
+ //Pairs with a missing call only count as no-call; called pairs count as called and then either as
+ //phased, or as unphased (adding mvCount to the violations and flagging Het/Het pairs).
+ private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){
+
+     //At least one of the two genotypes is not called
+     if(!parent.isCalled() || !child.isCalled()){
+         counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1);
+         return;
+     }
+
+     counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1);
+
+     //The parent's phased flag tells whether the pair could be phased
+     if(parent.isPhased()){
+         counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1);
+         return;
+     }
+
+     //Unphased pair: record the violations and the ambiguous Het/Het case
+     counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount);
+     if(parent.isHet() && child.isHet())
+         counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1);
+ }
+
+ //Updates the trio counters for one phased trio.
+ //Trios with a missing call only count as no-call; called trios count as called and then as exactly
+ //one of: phased, double violation (mvCount > 1), single violation, or Het/Het/Het (if applicable).
+ private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){
+
+     //At least one of the three genotypes is not called
+     if(!mother.isCalled() || !father.isCalled() || !child.isCalled()){
+         counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1);
+         return;
+     }
+
+     counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1);
+
+     //The mother's phased flag tells whether the trio could be phased
+     if(mother.isPhased()){
+         counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1);
+         return;
+     }
+
+     //Unphased trio: either a (double) Mendelian violation or possibly the ambiguous Het/Het/Het case
+     if(mvCount > 1){
+         counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1);
+     }
+     else if(mvCount > 0){
+         counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
+     }
+     else if(mother.isHet() && father.isHet() && child.isHet()){
+         counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
+     }
 }
/**
@@ -292,55 +729,156 @@ public class PhaseByTransmission extends RodWalker {
* @return null
*/
 @Override
- public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ //Phases every selected trio or parent/child pair at this site, writes the updated variant record
+ //and returns the metrics counters collected for the site (all zero when tracker is null).
+ public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+
+ //Per-site counters, all starting at 0
+ HashMap metricsCounters = new HashMap(10);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_TRIO_HET_HET_HET,0);
+ metricsCounters.put(NUM_TRIO_VIOLATIONS,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_PAIR_HET_HET,0);
+ metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
+ metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,0);
+
+ String mvfLine;
+
 if (tracker != null) {
+ //NOTE(review): vc is dereferenced below without a null check - confirm getFirstValue cannot return null here
 VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
 Map genotypeMap = vc.getGenotypes();
- for (Trio trio : trios) {
- Genotype mother = vc.getGenotype(trio.getMother());
- Genotype father = vc.getGenotype(trio.getFather());
- Genotype child = vc.getGenotype(trio.getChild());
+ int mvCount;
- ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child);
+ for (Sample sample : trios) {
+ Genotype mother = vc.getGenotype(sample.getMaternalID());
+ Genotype father = vc.getGenotype(sample.getPaternalID());
+ Genotype child = vc.getGenotype(sample.getID());
+
+ //Keep only trios and parent/child pairs
+ if(mother == null && father == null || child == null)
+ continue;
+
+ //phaseTrioGenotypes appends the phased genotypes in the order Mother, Father, Child
+ ArrayList trioGenotypes = new ArrayList(3);
+ mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
 Genotype phasedMother = trioGenotypes.get(0);
 Genotype phasedFather = trioGenotypes.get(1);
 Genotype phasedChild = trioGenotypes.get(2);
- genotypeMap.put(phasedMother.getSampleName(), phasedMother);
- genotypeMap.put(phasedFather.getSampleName(), phasedFather);
- genotypeMap.put(phasedChild.getSampleName(), phasedChild);
+ //Fill the genotype map with the new genotypes and increment metrics counters
+ //Each branch also prepares the report line for this family; a missing parent is written as .:.:.:.
+ genotypeMap.put(phasedChild.getSampleName(),phasedChild);
+ if(mother != null){
+ genotypeMap.put(phasedMother.getSampleName(), phasedMother);
+ if(father != null){
+ //Full trio
+ genotypeMap.put(phasedFather.getSampleName(), phasedFather);
+ updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
+ if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
+ }
+ else{
+ //Mother/child pair
+ updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters);
+ if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType()))
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
+ }
+ }
+ else{
+ //Father/child pair
+ genotypeMap.put(phasedFather.getSampleName(),phasedFather);
+ updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters);
+ if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
+ }
+
+ //Report violation if set so
+ //TODO: ADAPT FOR PAIRS TOO!!
+ if(mvCount>0 && mvFile != null)
+ mvFile.println(mvfLine);
+
 }
+
+ //Write the site with the updated genotypes
 VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap);
 vcfWriter.add(newvc);
 }
-
- return null;
+ return metricsCounters;
 }
 /**
- * Provide an initial value for reduce computations.
+ * Initializes the reporting counters.
 *
- * @return Initial value of reduce.
+ * @return All counters initialized to 0
 */
 @Override
- public Integer reduceInit() {
- return null;
+ public HashMap reduceInit() {
+ //Twelve counter keys are registered below, the same ones map() zeroes for each site.
+ //The constructor argument 10 is only the initial capacity of the map, not a limit.
+ HashMap metricsCounters = new HashMap(10);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_TRIO_HET_HET_HET,0);
+ metricsCounters.put(NUM_TRIO_VIOLATIONS,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_PAIR_HET_HET,0);
+ metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
+ metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,0);
+
+ return metricsCounters;
 }
/**
- * Reduces a single map with the accumulator provided as the ReduceType.
+ * Adds the counters of one phased site to the running totals: for each of the 12 counter keys, value+sum is stored back into {@code sum}.
*
- * @param value result of the map.
- * @param sum accumulator for the reduce.
+ * @param value counters for a single site; expected to hold an entry for every key summed below
+ * @param sum accumulator for the reporting counters; updated in place
* @return accumulator with result of the map taken into account.
*/
@Override
- public Integer reduce(Integer value, Integer sum) {
- return null;
+ public HashMap reduce(HashMap value, HashMap sum) {
+ sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED));
+ sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL));
+ sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED));
+ sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET));
+ sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS));
+ sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED));
+ sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL));
+ sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED));
+ sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET));
+ sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS));
+ sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS));
+ sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED));
+
+ return sum; // the same map instance that was passed in, now holding the updated totals
+ }
+
+
+ /**
+ * Writes the final phasing-by-transmission statistics to the log, one info-level line per counter.
+ * @param result Accumulator holding the final value of every counter.
+ */
+ @Override
+ public void onTraversalDone(HashMap result) {
+ //Each row pairs a message prefix with the key of the counter appended to it; rows are listed in output order
+ final Object[][] report = {
+ {"Number of complete trio-genotypes: ", NUM_TRIO_GENOTYPES_CALLED},
+ {"Number of trio-genotypes containing no call(s): ", NUM_TRIO_GENOTYPES_NOCALL},
+ {"Number of trio-genotypes phased: ", NUM_TRIO_GENOTYPES_PHASED},
+ {"Number of resulting Het/Het/Het trios: ", NUM_TRIO_HET_HET_HET},
+ {"Number of remaining single mendelian violations in trios: ", NUM_TRIO_VIOLATIONS},
+ {"Number of remaining double mendelian violations in trios: ", NUM_TRIO_DOUBLE_VIOLATIONS},
+ {"Number of complete pair-genotypes: ", NUM_PAIR_GENOTYPES_CALLED},
+ {"Number of pair-genotypes containing no call(s): ", NUM_PAIR_GENOTYPES_NOCALL},
+ {"Number of pair-genotypes phased: ", NUM_PAIR_GENOTYPES_PHASED},
+ {"Number of resulting Het/Het pairs: ", NUM_PAIR_HET_HET},
+ {"Number of remaining mendelian violations in pairs: ", NUM_PAIR_VIOLATIONS},
+ {"Number of genotypes updated: ", NUM_GENOTYPES_MODIFIED}
+ };
+ for(Object[] row : report)
+ logger.info(row[0].toString() + result.get(row[1]));
+
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java
deleted file mode 100644
index e770418c1..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2010.
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.walkers.qc;
-
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.commandline.Output;
-import org.broadinstitute.sting.gatk.walkers.ReadPairWalker;
-import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
-
-import java.io.PrintStream;
-import java.util.Collection;
-import java.util.List;
-
-/**
- * Counts the number of read pairs encountered in a file sorted in
- * query name order. Breaks counts down by total pairs and number
- * of paired reads.
- *
- *
- * Input
- *
- * One or more bam files.
- *
- *
- * Output
- *
- * Number of pairs seen.
- *
- *
- * Examples
- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- * -R ref.fasta \
- * -T CountPairs \
- * -o output.txt \
- * -I input.bam
- *
- *
- * @author mhanna
- */
-public class CountPairsWalker extends ReadPairWalker {
- @Output
- private PrintStream out;
-
- /**
- * How many reads are the first in a pair, based on flag 0x0040 from the SAM spec.
- */
- private long firstOfPair = 0;
-
- /**
- * How many reads are the second in a pair, based on flag 0x0080 from the SAM spec.
- */
- private long secondOfPair = 0;
-
- /**
- * A breakdown of the total number of reads seen with exactly the same read name.
- */
- private List pairCountsByType = new ExpandingArrayList();
-
- /**
- * Maps a read pair to a given reduce of type MapType. Semantics determined by subclasser.
- * @param reads Collection of reads having the same name.
- * @return Semantics defined by implementer.
- */
- @Override
- public Integer map(Collection reads) {
- if(pairCountsByType.get(reads.size()) != null)
- pairCountsByType.set(reads.size(),pairCountsByType.get(reads.size())+1);
- else
- pairCountsByType.set(reads.size(),1L);
-
- for(SAMRecord read: reads) {
- if(read.getFirstOfPairFlag()) firstOfPair++;
- if(read.getSecondOfPairFlag()) secondOfPair++;
- }
-
- return 1;
- }
-
- /**
- * No pairs at the beginning of a traversal.
- * @return 0 always.
- */
- @Override
- public Long reduceInit() {
- return 0L;
- }
-
- /**
- * Combine number of pairs seen in this iteration (always 1) with total number of pairs
- * seen in previous iterations.
- * @param value Pairs in this iteration (1), from the map function.
- * @param sum Count of all pairs in prior iterations.
- * @return All pairs encountered in previous iterations + all pairs encountered in this iteration (sum + 1).
- */
- @Override
- public Long reduce(Integer value, Long sum) {
- return value + sum;
- }
-
- /**
- * Print summary statistics over the entire traversal.
- * @param sum A count of all read pairs viewed.
- */
- @Override
- public void onTraversalDone(Long sum) {
- out.printf("Total number of pairs : %d%n",sum);
- out.printf("Total number of first reads in pair : %d%n",firstOfPair);
- out.printf("Total number of second reads in pair: %d%n",secondOfPair);
- for(int i = 1; i < pairCountsByType.size(); i++) {
- if(pairCountsByType.get(i) == null)
- continue;
- out.printf("Pairs of size %d: %d%n",i,pairCountsByType.get(i));
- }
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index dba16cf86..8c8e4f257 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -25,7 +25,13 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.TribbleException;
+import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
+import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import org.jgrapht.util.MathUtil;
+
+import java.util.EnumMap;
+import java.util.Map;
public class GenotypeLikelihoods {
public static final boolean CAP_PLS = false;
@@ -94,6 +100,48 @@ public class GenotypeLikelihoods {
return likelihoodsAsString_PLs;
}
+ //Returns the genotype likelihoods as an EnumMap with the HOM_REF, HET and HOM_VAR genotypes as keys and their likelihoods as values
+ //(the log10 values as stored, or the normalized linear-scale values when normalizeFromLog10 is true)
+ //Returns null in case of missing likelihoods, whatever the value of normalizeFromLog10
+ public EnumMap<Genotype.Type,Double> getAsMap(boolean normalizeFromLog10){
+ double[] likelihoods = getAsVector();
+ //Missing likelihoods: bail out BEFORE normalizing so that a null vector is never handed to MathUtils
+ if(likelihoods == null)
+ return null;
+ if(normalizeFromLog10)
+ likelihoods = MathUtils.normalizeFromLog10(likelihoods);
+ //The vector is indexed by ordinal()-1; this assumes exactly one enum constant is declared before HOM_REF, HET, HOM_VAR -- TODO confirm against Genotype.Type
+ EnumMap<Genotype.Type,Double> likelihoodsMap = new EnumMap<Genotype.Type,Double>(Genotype.Type.class);
+ likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]);
+ likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]);
+ likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]);
+ return likelihoodsMap;
+ }
+
+ //Returns the neg log10 Genotype Quality (GQ) of the given genotype, computed from the log10 likelihoods
+ //Returns Double.NEGATIVE_INFINITY when the likelihoods are missing or hold no entry for the genotype
+ //(getAsMap only provides HOM_REF, HET and HOM_VAR)
+ public double getNegLog10GQ(Genotype.Type genotype){
+
+ EnumMap<Genotype.Type,Double> likelihoods = getAsMap(false);
+ //Guard the lookup: unboxing a missing entry would otherwise throw a NullPointerException
+ Double chosenLog10 = (likelihoods == null) ? null : likelihoods.get(genotype);
+ if(chosenLog10 == null)
+ return Double.NEGATIVE_INFINITY;
+
+ //Highest log10 likelihood among the OTHER genotypes
+ double bestOther = Double.NEGATIVE_INFINITY;
+ for(Map.Entry<Genotype.Type,Double> likelihood : likelihoods.entrySet()){
+ if(likelihood.getKey() != genotype && likelihood.getValue() > bestOther)
+ bestOther = likelihood.getValue();
+ }
+
+ //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best)
+ double qual = chosenLog10 - bestOther;
+
+ //A negative difference means another genotype is more likely: quality is then -log10(1-P(G)) on the normalized likelihoods
+ if (qual < 0) {
+ double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
+ double chosenGenotype = normalized[genotype.ordinal()-1];
+ qual = -1.0 * Math.log10(1.0 - chosenGenotype);
+ }
+ return qual;
+ }
+
private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) {
if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) {
String[] strings = likelihoodsAsString_PLs.split(",");
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java
index 462abeba1..5c8fa32a8 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java
@@ -33,7 +33,7 @@ public class SnpEffUnitTest {
@Test
public void testParseWellFormedEffect() {
String effectName = "NON_SYNONYMOUS_CODING";
- String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
+ String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
Assert.assertTrue( effect.isWellFormed() && effect.isCoding() );
@@ -42,7 +42,7 @@ public class SnpEffUnitTest {
@Test
public void testParseInvalidEffectNameEffect() {
String effectName = "MADE_UP_EFFECT";
- String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
+ String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
Assert.assertFalse(effect.isWellFormed());
@@ -51,7 +51,7 @@ public class SnpEffUnitTest {
@Test
public void testParseInvalidEffectImpactEffect() {
String effectName = "NON_SYNONYMOUS_CODING";
- String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
+ String[] effectMetadata = { "MEDIUM", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" };
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
Assert.assertFalse(effect.isWellFormed());
@@ -60,27 +60,27 @@ public class SnpEffUnitTest {
@Test
public void testParseWrongNumberOfMetadataFieldsEffect() {
String effectName = "NON_SYNONYMOUS_CODING";
- String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" };
+ String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; // 8 fields: the trailing exon entry used by the well-formed test is left out
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
Assert.assertFalse(effect.isWellFormed());
}
@Test
- public void testParseSnpEffWarningEffect() {
+ public void testParseSnpEffOneWarningOrErrorEffect() {
String effectName = "NON_SYNONYMOUS_CODING";
- String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" };
+ String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_OR_ERROR_TEXT" }; // the 9 regular fields plus one extra trailing field
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
- Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") );
+ Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning or error: \"SNPEFF_WARNING_OR_ERROR_TEXT\"") ); // the extra field must be reported verbatim, in double quotes
}
@Test
- public void testParseSnpEffErrorEffect() {
+ public void testParseSnpEffBothWarningAndErrorEffect() {
String effectName = "NON_SYNONYMOUS_CODING";
- String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" };
+ String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_TEXT", "SNPEFF_ERROR_TEXT" }; // the 9 regular fields plus two extra trailing fields
SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata);
- Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") );
+ Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: \"SNPEFF_WARNING_TEXT\", and the following error: \"SNPEFF_ERROR_TEXT\"") ); // first extra field is reported as the warning, second as the error
}
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index bde4c4a8f..919e3d9bd 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -148,9 +148,9 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " +
validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation +
- "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429",
+ "snpEff2.0.4.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429",
1,
- Arrays.asList("122321a85e448f21679f6ca15c5e22ad")
+ Arrays.asList("51258f5c880bd1ca3eb45a1711335c66")
);
executeTest("Testing SnpEff annotations", spec);
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java
index c663c1dd7..2cd76e7a5 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java
@@ -6,23 +6,131 @@ import org.testng.annotations.Test;
import java.util.Arrays;
public class PhaseByTransmissionIntegrationTest extends WalkerTest {
- private static String phaseByTransmissionTestDataRoot = validationDataLocation + "/PhaseByTransmission";
- private static String fundamentalTestVCF = phaseByTransmissionTestDataRoot + "/" + "FundamentalsTest.unfiltered.vcf";
+ private static String phaseByTransmissionTestDataRoot = validationDataLocation + "PhaseByTransmission/"; // the root now ends with "/", so the file names below are appended directly
+ private static String goodFamilyFile = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.goodFamilies.ped"; // pedigree passed with -ped in every test
+ private static String TNTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TN.vcf";
+ private static String TPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TP.vcf";
+ private static String FPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.FP.vcf"; // shared by testFalsePositiveMV, testPriorOption and testMVFileOption
+ private static String SpecialTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.Special.vcf";
+ //Tests using PbT on all genotypes with default parameters
+ //And all reporting options
@Test
- public void testBasicFunctionality() {
+ public void testTrueNegativeMV() {
WalkerTestSpec spec = new WalkerTestSpec(
buildCommandLine(
"-T PhaseByTransmission",
"-NO_HEADER",
"-R " + b37KGReference,
- "--variant " + fundamentalTestVCF,
- "-f NA12892+NA12891=NA12878",
+ "--variant " + TNTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s",
+ "-o %s"
+ ),
+ 2, // matches the two %s placeholders above (-mvf and -o)
+ Arrays.asList("16fefda693156eadf1481fd9de23facb","9418a7a6405b78179ca13a67b8bfcc14")
+ );
+ executeTest("testTrueNegativeMV", spec);
+ }
+
+ @Test
+ public void testTruePositiveMV() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + TPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s",
+ "-o %s"
+ ),
+ 2, // matches the two %s placeholders above (-mvf and -o)
+ Arrays.asList("14cf1d21a54d8b9fb506df178b634c56","efc66ae3d036715b721f9bd35b65d556")
+ );
+ executeTest("testTruePositiveMV", spec);
+ }
+
+ @Test
+ public void testFalsePositiveMV() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + FPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s",
+ "-o %s"
+ ),
+ 2, // matches the two %s placeholders above (-mvf and -o)
+ Arrays.asList("f9b0fae9fe1e0f09b883a292b0e70a12","398724bc1e65314cc5ee92706e05a3ee") // the second MD5 is the one testMVFileOption expects for its single -o output
+ );
+ executeTest("testFalsePositiveMV", spec);
+ }
+
+ @Test
+ public void testSpecialCases() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + SpecialTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s",
+ "-o %s"
+ ),
+ 2, // matches the two %s placeholders above (-mvf and -o)
+ Arrays.asList("b8d1aa3789ce77b45430c62d13ee3006","a1a333e08fafb288cda0e7711909e1c3")
+ );
+ executeTest("testSpecialCases", spec);
+ }
+
+ //Test using a different prior
+ //Here the FP file is used but as the prior is lowered, 3 turn to TP
+ @Test
+ public void testPriorOption() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + FPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-prior 1e-4",
+ "-mvf %s",
+ "-o %s"
+ ),
+ 2, // matches the two %s placeholders above (-mvf and -o)
+ Arrays.asList("7201ce7cc47db5840ac6b647709f7c33","c11b5e7cd7459d90d0160f917eff3b1e") // both MD5s differ from testFalsePositiveMV, which uses the same input without -prior
+ );
+ executeTest("testPriorOption", spec);
+ }
+
+ //Test when running without MV reporting option
+ //This is the exact same test file as FP but should not generate a .mvf file
+ @Test
+ public void testMVFileOption() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + FPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
"-o %s"
),
1,
- Arrays.asList("")
+ Arrays.asList("398724bc1e65314cc5ee92706e05a3ee") // same MD5 as the second entry of testFalsePositiveMV: leaving out -mvf must not change the -o output
);
- executeTest("testBasicFunctionality", spec);
+ executeTest("testMVFileOption", spec);
}
+
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
index 3dceb9bd2..102d4715e 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@@ -21,16 +21,16 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-T VariantEval",
"-R " + b37KGReference,
"--dbsnp " + b37dbSNP132,
- "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf",
+ "--eval " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf",
"-noEV",
"-EV TiTvVariantEvaluator",
"-noST",
"-ST FunctionalClass",
- "-L " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf",
+ "-L " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf",
"-o %s"
),
1,
- Arrays.asList("d9dcb352c53106f54fcc981f15d35a90")
+ Arrays.asList("a36414421621b377d6146d58d2fcecd0")
);
executeTest("testFunctionClassWithSnpeff", spec);
}
diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java
index 3f5d05e66..7a2696b7b 100755
--- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java
@@ -41,11 +41,6 @@ public class SimpleTimerUnitTest extends BaseTest {
double t6 = t.getElapsedTime();
Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart");
Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer");
-
- t.stop().start();
- Assert.assertTrue(t.isRunning(), "second started timer isn't running");
- Assert.assertTrue(t.getElapsedTime() >= 0.0, "elapsed time should have been reset");
- Assert.assertTrue(t.getElapsedTime() < t6, "elapsed time isn't less than time before start call"); // we should have effective no elapsed time
}
private final static void idleLoop() {
diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java
index 9243588ab..f3d0dedcd 100755
--- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java
@@ -29,10 +29,13 @@ package org.broadinstitute.sting.utils.variantcontext;
// the imports for unit testing.
+import org.broadinstitute.sting.utils.MathUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import java.util.EnumMap;
+
/**
* Basic unit test for Genotype likelihoods objects
@@ -69,6 +72,50 @@ public class GenotypeLikelihoodsUnitTest {
gl.getAsVector();
}
+ //Checks getAsMap against the source vector in log10 and linear scale, and for missing likelihoods
+ //TestNG's assertEquals takes (actual, expected): the map lookup goes first so failure messages read correctly
+ @Test
+ public void testGetAsMap(){
+ GenotypeLikelihoods gl = new GenotypeLikelihoods(v);
+ //Log scale
+ EnumMap<Genotype.Type,Double> glMap = gl.getAsMap(false);
+ Assert.assertEquals(glMap.get(Genotype.Type.HOM_REF),v[Genotype.Type.HOM_REF.ordinal()-1]);
+ Assert.assertEquals(glMap.get(Genotype.Type.HET),v[Genotype.Type.HET.ordinal()-1]);
+ Assert.assertEquals(glMap.get(Genotype.Type.HOM_VAR),v[Genotype.Type.HOM_VAR.ordinal()-1]);
+
+ //Linear scale
+ glMap = gl.getAsMap(true);
+ double [] vl = MathUtils.normalizeFromLog10(v);
+ Assert.assertEquals(glMap.get(Genotype.Type.HOM_REF),vl[Genotype.Type.HOM_REF.ordinal()-1]);
+ Assert.assertEquals(glMap.get(Genotype.Type.HET),vl[Genotype.Type.HET.ordinal()-1]);
+ Assert.assertEquals(glMap.get(Genotype.Type.HOM_VAR),vl[Genotype.Type.HOM_VAR.ordinal()-1]);
+
+ //Test missing likelihoods
+ gl = new GenotypeLikelihoods(".");
+ glMap = gl.getAsMap(false);
+ Assert.assertNull(glMap);
+
+ }
+
+ @Test
+ public void testGetNegLog10GQ(){
+ final GenotypeLikelihoods called = new GenotypeLikelihoods(vPLString);
+
+ //The best guess genotype is scored against the runner-up
+ Assert.assertEquals(called.getNegLog10GQ(Genotype.Type.HET),3.9);
+
+ //Every other genotype is scored as -log10(1-P(G)) on the normalized likelihoods
+ final double[] linear = MathUtils.normalizeFromLog10(called.getAsVector());
+ for(Genotype.Type other : new Genotype.Type[]{Genotype.Type.HOM_REF, Genotype.Type.HOM_VAR}){
+ final double expected = -1.0 * Math.log10(1.0 - linear[other.ordinal()-1]);
+ Assert.assertEquals(called.getNegLog10GQ(other), expected);
+ }
+
+ //Missing likelihoods yield negative infinity whatever the genotype
+ final GenotypeLikelihoods missing = new GenotypeLikelihoods(".");
+ for(Genotype.Type type : new Genotype.Type[]{Genotype.Type.HOM_REF, Genotype.Type.HET, Genotype.Type.HOM_VAR})
+ Assert.assertEquals(missing.getNegLog10GQ(type),Double.NEGATIVE_INFINITY);
+
+ }
+
private void assertDoubleArraysAreEqual(double[] v1, double[] v2) {
Assert.assertEquals(v1.length, v2.length);
for ( int i = 0; i < v1.length; i++ ) {