From 9818c69df67fe5c930df4b62d895b5b9b15fac07 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 25 Jan 2012 09:32:52 -0500 Subject: [PATCH 01/22] Can now specify active regions to process at the command line, mainly for debugging purposes --- .../traversals/TraverseActiveRegions.java | 4 +-- .../gatk/walkers/ActiveRegionWalker.java | 32 +++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index ebfcc0c29..cf15cc92b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -92,7 +92,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends Walker { + @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false) + protected PrintStream activeRegionOutStream = null; + + @Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false) + protected List> activeRegionBindings = null; + + public GenomeLocSortedSet presetActiveRegions = null; + + @Override + public void initialize() { + if( activeRegionBindings == null ) { return; } + List allIntervals = new ArrayList(0); + for ( IntervalBinding intervalBinding : activeRegionBindings ) { + List intervals = intervalBinding.getIntervals(this.getToolkit()); + + if ( intervals.isEmpty() ) { + logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); + } + + allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, IntervalSetRule.UNION); + } + + presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), 
allIntervals, IntervalMergingRule.ALL); + } + // Do we actually want to operate on the context? public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { return true; // We are keeping all the reads From bbefe4a272f440262c717eef51750b4f6a138c37 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 25 Jan 2012 09:47:06 -0500 Subject: [PATCH 02/22] Added option to be able to write out the active regions to an interval list file --- .../sting/gatk/traversals/TraverseActiveRegions.java | 11 ++++++++++- .../sting/gatk/walkers/ActiveRegionWalker.java | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index cf15cc92b..f5e936a09 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -128,7 +128,16 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = integrateActiveList( isActiveList ); logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); - workQueue.addAll( activeRegions ); + if( walker.activeRegionOutStream == null ) { + workQueue.addAll( activeRegions ); + } else { // Just want to output the active regions to a file, not actually process them + for( final ActiveRegion activeRegion : activeRegions ) { + if( activeRegion.isActive ) { + walker.activeRegionOutStream.println( activeRegion.getLocation() ); + } + } + } + // Since we've sufficiently past this point (or this contig!) 
in the workQueue we can unload those regions and process them if( !workQueue.isEmpty() ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 508aebb5c..98308ee11 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -40,7 +40,7 @@ import java.util.List; public abstract class ActiveRegionWalker extends Walker { @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false) - protected PrintStream activeRegionOutStream = null; + public PrintStream activeRegionOutStream = null; @Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false) protected List> activeRegionBindings = null; From ea3d4d60f2f99e1211ecccbfafc25c4d43ef12b7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 11:35:13 -0500 Subject: [PATCH 04/22] This annotation requires rods and should be annotated as such --- .../sting/gatk/walkers/annotator/MVLikelihoodRatio.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index b9e6a5b2b..889cc634c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -23,7 +24,7 @@ import java.util.*; * Time: 12:24 PM * To change this template use File | Settings | File Templates. */ -public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; private String motherId; From e349b4b14b2f66efd56119dc75014f7a5190ee90 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 11:35:54 -0500 Subject: [PATCH 05/22] Allow appending with the dbSNP ID even if a (different) ID is already present for the variant rod. 
--- .../walkers/annotator/VariantAnnotator.java | 9 +++++++-- .../annotator/VariantAnnotatorEngine.java | 17 +++++++++++++---- .../interfaces/AnnotatorCompatibleWalker.java | 2 +- .../walkers/genotyper/UnifiedGenotyper.java | 2 +- .../VariantAnnotatorIntegrationTest.java | 8 ++++++++ 5 files changed, 30 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 69560c7cb..5312c4136 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.BaseUtils; @@ -84,7 +83,6 @@ public class VariantAnnotator extends RodWalker implements Ann @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - public RodBinding getVariantRodBinding() { return variantCollection.variants; } /** * The INFO field will be annotated with information on the most biologically-significant effect @@ -163,6 +161,13 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit") protected Boolean LIST = false; + /** + * By default, the dbSNP ID is added only when the ID field in the variant VCF is empty. 
+ */ + @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="In conjunction with the dbSNP binding, append the dbSNP ID even when the variant VCF already has the ID field populated") + protected Boolean ALWAYS_APPEND_DBSNP_ID = false; + public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; } + @Hidden @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) protected boolean indelsOnly = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 98d2fe17b..90d0ad740 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -195,11 +195,20 @@ public class VariantAnnotatorEngine { private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { - String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); + final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); + + // put the DB key into the INFO field infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null); - // annotate dbsnp id if available and not already there - if ( rsID != null && vc.emptyID() ) - vc = new VariantContextBuilder(vc).id(rsID).make(); + + // add the ID if appropriate + if ( rsID != null ) { + if ( vc.emptyID() ) { + vc = new VariantContextBuilder(vc).id(rsID).make(); + } else if ( walker.alwaysAppendDbsnpId() && vc.getID().indexOf(rsID) == -1 ) { + final String newRsID = 
vc.getID() + VCFConstants.ID_FIELD_SEPARATOR + rsID; + vc = new VariantContextBuilder(vc).id(newRsID).make(); + } + } } else { boolean overlapsComp = false; for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java index 7200f841b..1331ad5df 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java @@ -8,9 +8,9 @@ import java.util.List; public interface AnnotatorCompatibleWalker { // getter methods for various used bindings - public abstract RodBinding getVariantRodBinding(); public abstract RodBinding getSnpEffRodBinding(); public abstract RodBinding getDbsnpRodBinding(); public abstract List> getCompRodBindings(); public abstract List> getResourceRodBindings(); + public abstract boolean alwaysAppendDbsnpId(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 369c2d0c6..5a269087c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -126,10 +126,10 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; } - public RodBinding getVariantRodBinding() { return null; } public RodBinding getSnpEffRodBinding() { return null; } public List> getCompRodBindings() { return Collections.emptyList(); } public List> getResourceRodBindings() { return Collections.emptyList(); } + public boolean alwaysAppendDbsnpId() { return false; } /** * A raw, 
unfiltered, highly specific callset in VCF format. diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 14f7457b8..0d9d9bcd8 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -110,6 +110,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { executeTest("getting DB tag with dbSNP", spec); } + @Test + public void testMultipleIdsWithDbsnp() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3withIDs.vcf -L " + validationDataLocation + "vcfexample3withIDs.vcf", 1, + Arrays.asList("cd7e3d43b8f5579c461b3e588a295fa8")); + executeTest("adding multiple IDs with dbSNP", spec); + } + @Test public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( From fb863dc6a70ca1b23d8e5c07ab1ad237c840e63a Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 11:50:12 -0500 Subject: [PATCH 06/22] Warn user when trying to run with EMIT_ALL_SITES with indels; better docs for that option. 
--- .../sting/gatk/walkers/genotyper/UnifiedGenotyper.java | 6 ++++++ .../gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 5a269087c..5f84f62ec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -205,6 +205,12 @@ public class UnifiedGenotyper extends LocusWalker samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index ee5aed3e5..ba4b22445 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -54,8 +54,9 @@ public class UnifiedGenotyperEngine { EMIT_VARIANTS_ONLY, /** produces calls at variant sites and confident reference sites */ EMIT_ALL_CONFIDENT_SITES, - /** produces calls at any callable site regardless of confidence; this argument is intended for point - * mutations (SNPs) only and while some indel calls may be produced they are by no means comprehensive */ + /** produces calls at any callable site regardless of confidence; this argument is intended only for point + * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by + * no means produce a comprehensive set of indels in DISCOVERY mode */ EMIT_ALL_SITES } From 96b62daff39a21d478db8d2e443c13e27b863792 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 11:55:33 -0500 Subject: [PATCH 07/22] 
Minor tweak to the warning message. --- .../sting/gatk/walkers/genotyper/UnifiedGenotyper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 5f84f62ec..b1495ac7d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -209,7 +209,7 @@ public class UnifiedGenotyper extends LocusWalker samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); From 2799a1b686763543b95e7815809aa898cc2de101 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 12:15:51 -0500 Subject: [PATCH 08/22] Catch exception for bad type and throw as a TribbleException --- .../sting/utils/codecs/vcf/VCFCompoundHeaderLine.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index bb822f2ed..97166833b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Arrays; @@ -149,7 +150,11 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF count = Integer.valueOf(numberStr); } - type = VCFHeaderLineType.valueOf(mapping.get("Type")); + try { + type = VCFHeaderLineType.valueOf(mapping.get("Type")); + } catch (Exception e) { + throw new TribbleException(mapping.get("Type") + " is not a valid type in 
the VCF specification (note that types are case-sensitive)"); + } if (type == VCFHeaderLineType.Flag && !allowFlagValues()) throw new IllegalArgumentException("Flag is an unsupported type for this kind of field"); From 05816955aa7945bc0422c8d3d117691b9a43bb52 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 14:28:21 -0500 Subject: [PATCH 09/22] It was possible that we'd clean up a matrix column too early when a dependent column aborted early (with not enough probability mass) because we weren't being smart about the order in which we created dependencies. Fixed. --- .../genotyper/ExactAFCalculationModel.java | 79 +++++++++++-------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 1594c92cb..24d7696b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; @@ -177,11 +176,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { ACqueue.add(zeroSet); indexesToACset.put(zeroSet.ACcounts, zeroSet); - // optimization: create the temporary storage for computing L(j,k) just once - final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1; - final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies]; - for ( int i = 0; i < maxPossibleDependencies; i++ ) - 
tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY; + // optimization: create the temporary storage for computing L(j,k) just once + final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1; + final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies]; + for ( int i = 0; i < maxPossibleDependencies; i++ ) + tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY; // keep processing while we have AC conformations that need to be calculated double maxLog10L = Double.NEGATIVE_INFINITY; @@ -204,7 +203,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result, - final double[][] tempLog10ConformationLikelihoods) { + final double[][] tempLog10ConformationLikelihoods) { //if ( DEBUG ) // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); @@ -256,14 +255,24 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different if ( ACwiggle > 1 ) { - for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { - for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { + // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering + for ( int allele_i = 0; allele_i < numAltAlleles - 1; allele_i++ ) { + for ( int allele_j = allele_i + 1; allele_j < numAltAlleles; allele_j++ ) { + if ( allele_i == allele_j ) + continue; final int[] ACcountsClone = set.ACcounts.getCounts().clone(); ACcountsClone[allele_i]++; ACcountsClone[allele_j]++; lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset); } } + + // now we can deal with the case where the 2 new alleles are 
the same + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele_i] += 2; + lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset); + } } // if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate @@ -298,6 +307,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } // add the given dependency to the set + //if ( DEBUG ) + // System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts); final ExactACset set = indexesToACset.get(index); set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex); return wasInQueue ? null : set; @@ -317,7 +328,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result, - final double[][] tempLog10ConformationLikelihoods) { + final double[][] tempLog10ConformationLikelihoods) { set.log10Likelihoods[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -329,40 +340,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } // k > 0 for at least one k else { - // deal with the non-AA possible conformations - int conformationIndex = 1; - for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) { - //if ( DEBUG ) - // System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey()); + // deal with the non-AA possible conformations + int conformationIndex = 1; + for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) { + //if ( DEBUG ) + // System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey()); - ExactACset dependent = indexesToACset.get(mapping.getKey()); + ExactACset dependent = indexesToACset.get(mapping.getKey()); - for ( 
int j = 1; j < set.log10Likelihoods.length; j++ ) { + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - if ( totalK <= 2*j ) { // skip impossible conformations - final double[] gl = genotypeLikelihoods.get(j); - tempLog10ConformationLikelihoods[j][conformationIndex] = - determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()]; + if ( totalK <= 2*j ) { // skip impossible conformations + final double[] gl = genotypeLikelihoods.get(j); + tempLog10ConformationLikelihoods[j][conformationIndex] = + determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()]; } else { - tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY; - } + tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY; + } } - conformationIndex++; - } + conformationIndex++; + } - // finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + // finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value final int numPaths = set.ACsetIndexToPLIndex.size() + 1; - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - if ( totalK < 2*j-1 ) { - final double[] gl = genotypeLikelihoods.get(j); - tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; - } else { - tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY; - } + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + } else { + tempLog10ConformationLikelihoods[j][0] = 
Double.NEGATIVE_INFINITY; + } - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths); set.log10Likelihoods[j] = log10Max - logDenominator; } From 8e2d372ab0649a003745ffbd319e64e6f5df2e25 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 14:41:34 -0500 Subject: [PATCH 10/22] Use remove instead of setting the value to null --- .../sting/gatk/walkers/genotyper/ExactAFCalculationModel.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 24d7696b5..aee003089 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -214,7 +214,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // clean up memory if ( !preserveData ) { for ( ExactACcounts index : set.dependentACsetsToDelete ) { - indexesToACset.put(index, null); + indexesToACset.remove(index); //if ( DEBUG ) // System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts); } @@ -229,7 +229,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // no reason to keep this data around because nothing depends on it if ( !preserveData ) - indexesToACset.put(set.ACcounts, null); + indexesToACset.remove(set.ACcounts); return log10LofK; } From ef335a5812e6f6e18ea8227b95361bde9c3826df Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 15:15:42 -0500 Subject: [PATCH 11/22] Better implementation of the fix; PL index 
is now traversed in order. --- .../genotyper/ExactAFCalculationModel.java | 38 +++++++++++++------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index aee003089..d75be23be 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; @@ -194,6 +195,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } + private static final class DependentSet { + public final int[] ACcounts; + public final int PLindex; + + public DependentSet(final int[] ACcounts, final int PLindex) { + this.ACcounts = ACcounts; + this.PLindex = PLindex; + } + } + private static double calculateAlleleCountConformation(final ExactACset set, final ArrayList genotypeLikelihoods, final double maxLog10L, @@ -255,24 +266,27 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different if ( ACwiggle > 1 ) { - // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering - for ( int allele_i = 0; allele_i < numAltAlleles - 1; allele_i++ ) { - for ( int allele_j = allele_i + 1; allele_j < numAltAlleles; allele_j++ ) { - if ( allele_i == allele_j ) - continue; + final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); + final 
ArrayList sameAlleles = new ArrayList(numAltAlleles); + + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { + for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { final int[] ACcountsClone = set.ACcounts.getCounts().clone(); ACcountsClone[allele_i]++; ACcountsClone[allele_j]++; - lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset); + + if ( allele_i == allele_j ) + sameAlleles.add(new DependentSet(ACcountsClone, ++PLindex)); + else + differentAlleles.add(new DependentSet(ACcountsClone, ++PLindex)); } } - // now we can deal with the case where the 2 new alleles are the same - for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); - ACcountsClone[allele_i] += 2; - lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset); - } + // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering + for ( DependentSet dependent : differentAlleles ) + lastSet = updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); + for ( DependentSet dependent : sameAlleles ) + lastSet = updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); } // if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate From db645a94ca0e5f533402284367b5ce042de61ad7 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Wed, 25 Jan 2012 16:10:59 -0500 Subject: [PATCH 14/22] Added options to make the batch-merger more all-inclusive: keep all indels, SNPs (even filtered ones) but maintain their annotations. 
Also, VariantContextUtils.simpleMerge can now merge variants of all types using the Hidden non-default enum MultipleAllelesMergeType=MIX_TYPES --- .../genotyper/ExactAFCalculationModel.java | 5 +- .../walkers/genotyper/UGCalcLikelihoods.java | 114 ------------- .../walkers/genotyper/UGCallVariants.java | 152 ------------------ .../walkers/variantutils/CombineVariants.java | 29 +++- .../variantcontext/VariantContextUtils.java | 13 ++ 5 files changed, 38 insertions(+), 275 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 1594c92cb..a91928bc3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; @@ -39,6 +38,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 + // TODO: PERMITS WALKER USED TO HAVE A TEMPORARY FIX to prevent NullPointerException caused by bug: + public static boolean PRESERVE_AC_DATA = false; protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); @@ -51,7 +52,7 @@ public class 
ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final int numAlleles = alleles.size(); //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); - linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false); + linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, PRESERVE_AC_DATA); } private static final ArrayList getGLs(GenotypesContext GLs) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java deleted file mode 100755 index c7e577393..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.DownsampleType; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.HashSet; -import java.util.Set; - - -/** - * Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs). - * Absolutely not supported or recommended for public use. - * Run this as you would the UnifiedGenotyper, except that you must additionally pass in a VCF bound to - * the name 'allele' so we know which alternate allele to use at each site. 
- */ -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) -@Reference(window=@Window(start=-200,stop=200)) -@By(DataSource.READS) -@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) -public class UGCalcLikelihoods extends LocusWalker implements TreeReducible { - - @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - - // control the output - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter writer = null; - - // the calculation arguments - private UnifiedGenotyperEngine UG_engine = null; - - // enable deletions in the pileup - public boolean includeReadsWithDeletionAtLoci() { return true; } - - // enable extended events for indels - public boolean generateExtendedEvents() { return UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP; } - - public void initialize() { - // get all of the unique sample names - Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); - - // initialize the header - Set headerInfo = new HashSet(); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); - - writer.writeHeader(new VCFHeader(headerInfo, samples)) ; - } - - public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, 
AlignmentContext rawContext) { - VariantContext call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext); - return call == null ? null : new VariantCallContext(call, true); - } - - public Integer reduceInit() { return 0; } - - public Integer treeReduce(Integer lhs, Integer rhs) { - return lhs + rhs; - } - - public Integer reduce(VariantCallContext value, Integer sum) { - if ( value == null ) - return sum; - - try { - writer.add(value); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); - } - - return sum + 1; - } - - public void onTraversalDone(Integer sum) { - logger.info(String.format("Visited bases: %d", sum)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java deleted file mode 100755 index 97f7b21eb..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.util.*; - -/** - * Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs). - * Absolutely not supported or recommended for public use. - * Run this as you would the UnifiedGenotyper, except that instead of '-I reads' it expects any number - * of GL/PL-annotated VCFs bound to a name starting with 'variant'. 
- */ -public class UGCallVariants extends RodWalker { - - @ArgumentCollection - private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - - @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) - public List> variants; - - // control the output - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter writer = null; - - // the calculation arguments - private UnifiedGenotyperEngine UG_engine = null; - - // variant track names - private Set trackNames = new HashSet(); - - public void initialize() { - - for ( RodBinding rb : variants ) - trackNames.add(rb.getName()); - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames); - - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); - - Set headerInfo = new HashSet(); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); - 
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ) - headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); - - // initialize the header - writer.writeHeader(new VCFHeader(headerInfo, samples)); - } - - public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return null; - - List VCs = tracker.getValues(variants, context.getLocation()); - - VariantContext mergedVC = mergeVCsWithGLs(VCs); - if ( mergedVC == null ) - return null; - - return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC); - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(VariantCallContext value, Integer sum) { - if ( value == null ) - return sum; - - try { - VariantContextBuilder builder = new VariantContextBuilder(value); - VariantContextUtils.calculateChromosomeCounts(builder, true); - writer.add(builder.make()); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); - } - - return sum + 1; - } - - public void onTraversalDone(Integer result) { - logger.info(String.format("Visited sites: %d", result)); - } - - private static VariantContext mergeVCsWithGLs(List VCs) { - // we can't use the VCUtils classes because our VCs can all be no-calls - if ( VCs.size() == 0 ) - return null; - - VariantContext variantVC = null; - GenotypesContext genotypes = GenotypesContext.create(); - for ( VariantContext vc : VCs ) { - if ( variantVC == null && vc.isVariant() ) - variantVC = vc; - genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes())); - } - - if ( variantVC == null ) { - VariantContext vc = VCs.get(0); - throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart()); - } - - return new 
VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make(); - } - - private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) { - GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size()); - for ( final Genotype g : genotypes ) { - if ( g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null ) - genotypesWithGLs.add(g); - } - return genotypesWithGLs; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index af05c0dc4..684b9102a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -120,6 +120,10 @@ public class CombineVariants extends RodWalker { @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; + @Hidden + @Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false) + public VariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = VariantContextUtils.MultipleAllelesMergeType.BY_TYPE; + /** * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. 
*/ @@ -236,13 +240,24 @@ public class CombineVariants extends RodWalker { return 0; List mergedVCs = new ArrayList(); - Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); - // iterate over the types so that it's deterministic - for ( VariantContext.Type type : VariantContext.Type.values() ) { - if ( VCsByType.containsKey(type) ) - mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), - priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + + if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { + Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); + // iterate over the types so that it's deterministic + for (VariantContext.Type type : VariantContext.Type.values()) { + if (VCsByType.containsKey(type)) + mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), + priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } + } + else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { + mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs, + priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } + else { + logger.warn("Ignoring all records at site " + ref.getLocus()); } for ( VariantContext mergedVC : mergedVCs ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 39045ea21..179c91660 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -29,6 +29,7 @@ import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -471,6 +472,18 @@ public class VariantContextUtils { KEEP_UNCONDITIONAL } + @Hidden + public enum MultipleAllelesMergeType { + /** + * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. + */ + BY_TYPE, + /** + * Merge all allele types at the same start position into the same VCF record. + */ + MIX_TYPES + } + /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with From add6918f32d1322debba7bd5dcfc5d22a27c6c2f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 16:21:10 -0500 Subject: [PATCH 15/22] Cleaner, more efficient way of determining the last dependent set in the queue. 
--- .../genotyper/ExactAFCalculationModel.java | 56 +++++++++---------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index d75be23be..363b74ceb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; @@ -166,7 +166,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final int numChr = 2*numSamples; // queue of AC conformations to process - final Queue ACqueue = new LinkedList(); + final LinkedList ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects final HashMap indexesToACset = new HashMap(numChr+1); @@ -210,7 +210,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final double maxLog10L, final int numChr, final boolean preserveData, - final Queue ACqueue, + final LinkedList ACqueue, final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result, @@ -250,7 +250,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies return log10LofK; - ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing 
final int numAltAlleles = set.ACcounts.getCounts().length; // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. @@ -261,7 +260,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { for ( int allele = 0; allele < numAltAlleles; allele++ ) { final int[] ACcountsClone = set.ACcounts.getCounts().clone(); ACcountsClone[allele]++; - lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset); + updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different @@ -284,20 +283,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering for ( DependentSet dependent : differentAlleles ) - lastSet = updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); for ( DependentSet dependent : sameAlleles ) - lastSet = updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); } - // if the last dependent set was not at the back of the queue (i.e. 
not just added), then we need to iterate - // over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early) - if ( !preserveData && lastSet == null ) { - //if ( DEBUG ) - // System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts); - lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue); + // determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column + if ( !preserveData ) { + final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue); + if ( lastSet != null ) + lastSet.dependentACsetsToDelete.add(set.ACcounts); } - if ( lastSet != null ) - lastSet.dependentACsetsToDelete.add(set.ACcounts); return log10LofK; } @@ -305,19 +301,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // also adds it as a dependency to the given callingSetIndex. // returns the ExactACset if that set was not already in the queue and null otherwise. 
- private static ExactACset updateACset(final int[] ACcounts, - final int numChr, - final ExactACset callingSet, - final int PLsetIndex, - final Queue ACqueue, - final HashMap indexesToACset) { + private static void updateACset(final int[] ACcounts, + final int numChr, + final ExactACset callingSet, + final int PLsetIndex, + final Queue ACqueue, + final HashMap indexesToACset) { final ExactACcounts index = new ExactACcounts(ACcounts); - boolean wasInQueue = true; if ( !indexesToACset.containsKey(index) ) { ExactACset set = new ExactACset(numChr/2 +1, index); indexesToACset.put(index, set); ACqueue.add(set); - wasInQueue = false; } // add the given dependency to the set @@ -325,16 +319,18 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts); final ExactACset set = indexesToACset.get(index); set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex); - return wasInQueue ? 
null : set; } - private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final Queue ACqueue) { - ExactACset set = null; - for ( ExactACset queued : ACqueue ) { - if ( queued.dependentACsetsToDelete.contains(callingSetIndex) ) - set = queued; + private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList ACqueue) { + Iterator reverseIterator = ACqueue.descendingIterator(); + while ( reverseIterator.hasNext() ) { + final ExactACset queued = reverseIterator.next(); + if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) ) + return queued; } - return set; + + // shouldn't get here + throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!"); } private static void computeLofK(final ExactACset set, From ddaf51a50ffea902068ad554367eceadcf37c86b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 19:18:51 -0500 Subject: [PATCH 16/22] Updated one integration test for indels --- .../gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index e9b4fc211..7285b0fb8 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + 
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("877de5b0cc61dc54636062df6399b978")); + Arrays.asList("1d1956fd7b0f0d30935674b2f5019860")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); } From 9a60887567d677dacc43fb4aced8a93ed308e54e Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 19:41:41 -0500 Subject: [PATCH 17/22] Lost an import in the merge --- .../gatk/walkers/genotyper/ExactAFCalculationModel.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 560ade2d8..d604e8d62 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; @@ -38,9 +39,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - // TODO: PERMITS WALKER USED TO HAVE A TEMPORARY FIX to prevent NullPointerException caused by bug: - public static boolean PRESERVE_AC_DATA = false; - protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); } @@ -52,7 +50,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final int numAlleles = alleles.size(); //linearExact(GLs, 
log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); - linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, PRESERVE_AC_DATA); + linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false); } private static final ArrayList getGLs(GenotypesContext GLs) { From 702a2d768fc437411f55b2a95e37d4845ad3c24f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 25 Jan 2012 19:42:55 -0500 Subject: [PATCH 18/22] Initial version of multi-allelic summary module in VariantEval --- .../evaluators/MultiallelicSummary.java | 181 ++++++++++++++++++ .../variantcontext/VariantContextUtils.java | 8 + 2 files changed, 189 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java new file mode 100644 index 000000000..835f6ca8c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +@Analysis(description = "Evaluation summary for multi-allelic variants") +public class MultiallelicSummary extends VariantEvaluator implements StandardEval { + final protected static Logger logger = Logger.getLogger(MultiallelicSummary.class); + + public enum Type { + SNP, INDEL + } + + // basic counts on various rates found + @DataPoint(description = "Number of processed loci") + public long nProcessedLoci = 0; + + @DataPoint(description = "Number of SNPs") + public int nSNPs = 0; + @DataPoint(description = "Number of multi-allelic SNPs") + public int nMultiSNPs = 0; + @DataPoint(description = "% processed sites that are multi-allelic 
SNPs", format = "%.5f") + public double processedMultiSnpRatio = 0; + @DataPoint(description = "% SNP sites that are multi-allelic", format = "%.3f") + public double variantMultiSnpRatio = 0; + + @DataPoint(description = "Number of Indels") + public int nIndels = 0; + @DataPoint(description = "Number of multi-allelic Indels") + public int nMultiIndels = 0; + @DataPoint(description = "% processed sites that are multi-allelic Indels", format = "%.5f") + public double processedMultiIndelRatio = 0; + @DataPoint(description = "% Indel sites that are multi-allelic", format = "%.3f") + public double variantMultiIndelRatio = 0; + + @DataPoint(description = "Number of Transitions") + public int nTi = 0; + @DataPoint(description = "Number of Transversions") + public int nTv = 0; + @DataPoint(description = "Overall TiTv ratio", format = "%.2f") + public double TiTvRatio = 0; + + @DataPoint(description = "Multi-allelic SNPs partially known") + public int knownSNPsPartial = 0; + @DataPoint(description = "Multi-allelic SNPs completely known") + public int knownSNPsComplete = 0; + @DataPoint(description = "Multi-allelic SNP Novelty Rate") + public String SNPNoveltyRate = "NA"; + + @DataPoint(description = "Multi-allelic Indels partially known") + public int knownIndelsPartial = 0; + @DataPoint(description = "Multi-allelic Indels completely known") + public int knownIndelsComplete = 0; + @DataPoint(description = "Multi-allelic Indel Novelty Rate") + public String indelNoveltyRate = "NA"; + + // TODO -- Also, AF distributions (pairwise like TiTv) + + public void initialize(VariantEvalWalker walker) {} + + @Override public boolean enabled() { return true; } + + public int getComparisonOrder() { + return 2; + } + + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + nProcessedLoci += context.getSkippedBases() + (ref == null ? 
0 : 1); + } + + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null || eval.isMonomorphicInSamples() ) + return null; + + // update counts + switch ( eval.getType() ) { + case SNP: + nSNPs++; + if ( !eval.isBiallelic() ) { + nMultiSNPs++; + calculatePairwiseTiTv(eval); + calculateSNPPairwiseNovelty(eval, comp); + } + break; + case INDEL: + nIndels++; + if ( !eval.isBiallelic() ) { + nMultiIndels++; + calculateIndelPairwiseNovelty(eval, comp); + } + break; + default: + throw new UserException.BadInput("Unexpected variant context type: " + eval); + } + + return null; // we don't capture any interesting sites + } + + private void calculatePairwiseTiTv(VariantContext vc) { + for ( Allele alt : vc.getAlternateAlleles() ) { + if ( VariantContextUtils.isTransition(vc.getReference(), alt) ) + nTi++; + else + nTv++; + } + } + + private void calculateSNPPairwiseNovelty(VariantContext eval, VariantContext comp) { + if ( comp == null ) + return; + + int knownAlleles = 0; + for ( Allele alt : eval.getAlternateAlleles() ) { + if ( comp.getAlternateAlleles().contains(alt) ) + knownAlleles++; + } + + if ( knownAlleles == eval.getAlternateAlleles().size() ) + knownSNPsComplete++; + else if ( knownAlleles > 0 ) + knownSNPsPartial++; + } + + private void calculateIndelPairwiseNovelty(VariantContext eval, VariantContext comp) { + } + + private final String noveltyRate(final int all, final int known) { + final int novel = all - known; + final double rate = (novel / (1.0 * all)); + return all == 0 ? 
"NA" : String.format("%.2f", rate); + } + + public void finalizeEvaluation() { + processedMultiSnpRatio = (double)nMultiSNPs / (double)nProcessedLoci; + variantMultiSnpRatio = (double)nMultiSNPs / (double)nSNPs; + processedMultiIndelRatio = (double)nMultiIndels / (double)nProcessedLoci; + variantMultiIndelRatio = (double)nMultiIndels / (double)nIndels; + + TiTvRatio = (double)nTi / (double)nTv; + + SNPNoveltyRate = noveltyRate(nMultiSNPs, knownSNPsPartial + knownSNPsComplete); + indelNoveltyRate = noveltyRate(nMultiSNPs, knownIndelsPartial + knownIndelsComplete); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 179c91660..c79bbaace 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -1073,6 +1073,14 @@ public class VariantContextUtils { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; } + public static boolean isTransition(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + public static boolean isTransversion(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + /** * create a genome location, given a variant context * @param genomeLocParser parser From c5e81be9781881cc9a79f99e696b47cb578b8ba2 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 26 Jan 2012 00:37:06 -0500 Subject: [PATCH 21/22] Adding pairwise AF table. Not polished at all, but usable none-the-less. 
--- .../evaluators/MultiallelicSummary.java | 138 ++++++++++++------ 1 file changed, 95 insertions(+), 43 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java index 835f6ca8c..6094385e6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java @@ -31,10 +31,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -90,7 +90,59 @@ public class MultiallelicSummary extends VariantEvaluator implements StandardEva @DataPoint(description = "Multi-allelic Indel Novelty Rate") public String indelNoveltyRate = "NA"; - // TODO -- Also, AF distributions (pairwise like TiTv) + @DataPoint(description="Histogram of allele frequencies") + AFHistogram AFhistogram = new AFHistogram(); + + /* + * AF histogram table object + */ + static class AFHistogram implements TableType { + private Object[] colKeys, rowKeys = {"pairwise_AF"}; + private int[] AFhistogram; + + private static final double AFincrement = 0.01; + private static final int numBins = (int)(1.00 / AFincrement); + + 
public AFHistogram() { + colKeys = initColKeys(); + AFhistogram = new int[colKeys.length]; + } + + public Object[] getColumnKeys() { + return colKeys; + } + + public Object[] getRowKeys() { + return rowKeys; + } + + public Object getCell(int row, int col) { + return AFhistogram[col]; + } + + private static Object[] initColKeys() { + ArrayList keyList = new ArrayList(numBins + 1); + for ( double a = 0.00; a <= 1.01; a += AFincrement ) { + keyList.add(String.format("%.2f", a)); + } + return keyList.toArray(); + } + + public String getName() { return "AFHistTable"; } + + public void update(VariantContext vc) { + final Object obj = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY, null); + if ( obj == null || !(obj instanceof List) ) + return; + + List list = (List)obj; + for ( String str : list ) { + final double AF = Double.valueOf(str); + final int bin = (int)(numBins * MathUtils.round(AF, 2)); + AFhistogram[bin]++; + } + } + } public void initialize(VariantEvalWalker walker) {} @@ -104,58 +156,58 @@ public class MultiallelicSummary extends VariantEvaluator implements StandardEva nProcessedLoci += context.getSkippedBases() + (ref == null ? 
0 : 1); } - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( eval == null || eval.isMonomorphicInSamples() ) - return null; + return null; // update counts switch ( eval.getType() ) { - case SNP: - nSNPs++; - if ( !eval.isBiallelic() ) { - nMultiSNPs++; - calculatePairwiseTiTv(eval); - calculateSNPPairwiseNovelty(eval, comp); - } - break; + case SNP: + nSNPs++; + if ( !eval.isBiallelic() ) { + nMultiSNPs++; + calculatePairwiseTiTv(eval); + calculateSNPPairwiseNovelty(eval, comp); + } + break; case INDEL: - nIndels++; - if ( !eval.isBiallelic() ) { - nMultiIndels++; - calculateIndelPairwiseNovelty(eval, comp); - } - break; + nIndels++; + if ( !eval.isBiallelic() ) { + nMultiIndels++; + calculateIndelPairwiseNovelty(eval, comp); + } + break; default: throw new UserException.BadInput("Unexpected variant context type: " + eval); } - + AFhistogram.update(eval); + return null; // we don't capture any interesting sites } private void calculatePairwiseTiTv(VariantContext vc) { - for ( Allele alt : vc.getAlternateAlleles() ) { - if ( VariantContextUtils.isTransition(vc.getReference(), alt) ) - nTi++; - else - nTv++; - } + for ( Allele alt : vc.getAlternateAlleles() ) { + if ( VariantContextUtils.isTransition(vc.getReference(), alt) ) + nTi++; + else + nTv++; + } } private void calculateSNPPairwiseNovelty(VariantContext eval, VariantContext comp) { - if ( comp == null ) - return; + if ( comp == null ) + return; - int knownAlleles = 0; - for ( Allele alt : eval.getAlternateAlleles() ) { - if ( comp.getAlternateAlleles().contains(alt) ) - knownAlleles++; - } + int knownAlleles = 0; + for ( Allele alt : eval.getAlternateAlleles() ) { + if ( comp.getAlternateAlleles().contains(alt) ) + knownAlleles++; + } - if ( knownAlleles == eval.getAlternateAlleles().size() ) - knownSNPsComplete++; - else if ( knownAlleles > 0 ) - knownSNPsPartial++; + if ( knownAlleles == 
eval.getAlternateAlleles().size() ) + knownSNPsComplete++; + else if ( knownAlleles > 0 ) + knownSNPsPartial++; } private void calculateIndelPairwiseNovelty(VariantContext eval, VariantContext comp) { @@ -168,14 +220,14 @@ public class MultiallelicSummary extends VariantEvaluator implements StandardEva } public void finalizeEvaluation() { - processedMultiSnpRatio = (double)nMultiSNPs / (double)nProcessedLoci; - variantMultiSnpRatio = (double)nMultiSNPs / (double)nSNPs; - processedMultiIndelRatio = (double)nMultiIndels / (double)nProcessedLoci; - variantMultiIndelRatio = (double)nMultiIndels / (double)nIndels; + processedMultiSnpRatio = (double)nMultiSNPs / (double)nProcessedLoci; + variantMultiSnpRatio = (double)nMultiSNPs / (double)nSNPs; + processedMultiIndelRatio = (double)nMultiIndels / (double)nProcessedLoci; + variantMultiIndelRatio = (double)nMultiIndels / (double)nIndels; TiTvRatio = (double)nTi / (double)nTv; - SNPNoveltyRate = noveltyRate(nMultiSNPs, knownSNPsPartial + knownSNPsComplete); - indelNoveltyRate = noveltyRate(nMultiSNPs, knownIndelsPartial + knownIndelsComplete); + SNPNoveltyRate = noveltyRate(nMultiSNPs, knownSNPsPartial + knownSNPsComplete); + indelNoveltyRate = noveltyRate(nMultiIndels, knownIndelsPartial + knownIndelsComplete); } } From 859dd882c90ababb92f165479d6171b8b8ec9ce7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 26 Jan 2012 00:38:16 -0500 Subject: [PATCH 22/22] Don't make it standard for now --- .../walkers/varianteval/evaluators/MultiallelicSummary.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java index 6094385e6..9113e7538 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java @@ -40,7 +40,7 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @Analysis(description = "Evaluation summary for multi-allelic variants") -public class MultiallelicSummary extends VariantEvaluator implements StandardEval { +public class MultiallelicSummary extends VariantEvaluator { // implements StandardEval { final protected static Logger logger = Logger.getLogger(MultiallelicSummary.class); public enum Type {