From 9c088fe3fee42a9a4b81e4b449bd4105cce53b9d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 19 Oct 2012 12:41:24 -0400 Subject: [PATCH 01/26] Actually a better implementation of GATKSAMRecord.getSoftStart(). Last commit was all wrong. Oops. --- .../broadinstitute/sting/utils/sam/GATKSAMRecord.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 8c3d83874..1feb76517 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -388,9 +388,14 @@ public class GATKSAMRecord extends BAMRecord { public int getSoftStart() { if (softStart < 0) { softStart = getAlignmentStart(); - final CigarElement firstCig = getCigar().getCigarElement(0); - if (firstCig.getOperator() == CigarOperator.HARD_CLIP) - softStart -= firstCig.getLength(); + for (final CigarElement cig : getCigar().getCigarElements()) { + final CigarOperator op = cig.getOperator(); + + if (op == CigarOperator.SOFT_CLIP) + softStart -= cig.getLength(); + else if (op != CigarOperator.HARD_CLIP) + break; + } } return softStart; } From 4622896312de494ed876a8c29dd00b8d5f22351b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 19 Oct 2012 13:04:05 -0400 Subject: [PATCH 02/26] Oops, killed contracts --- .../sting/gatk/walkers/compression/reducereads/BaseCounts.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 3a3905710..778b8300a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -227,7 +227,7 @@ import com.google.java.contract.Requires; * @param base base * @return the proportion of this base over all other bases except indels */ - @Requires("index.isNucleotide()") + @Requires("base.isNucleotide()") @Ensures({"result >=0.0", "result<= 1.0"}) public double baseCountProportionWithoutIndels(final BaseIndex base) { final int total = totalCountWithoutIndels(); From 2ef456d51a024b551e8a6ba761b3e36191b6f20f Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Fri, 19 Oct 2012 13:19:56 -0400 Subject: [PATCH 03/26] Added explicit @ClassType annotations to @Argument for Option[Int] or Option[Double] since scala seems to change the reflected type to Option[Object] on some systems. Changed ReflectionUtils.getGenericTypes' order of looking for @ClassType since the primitive generic wasn't completely erased, only changed to Object which is incorrect. More fixes to @Arguments labeled as java.io.File via incorrect @Input annotation. Put in a default undocumented implementation of @Argument doc() to match the one added to @Input. --- .../broadinstitute/sting/commandline/Argument.java | 2 +- .../queue/qscripts/PacbioProcessingPipeline.scala | 12 ++++++------ .../org/broadinstitute/sting/queue/QSettings.scala | 7 ++++++- .../sting/queue/util/ReflectionUtils.scala | 9 ++++----- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/Argument.java b/public/java/src/org/broadinstitute/sting/commandline/Argument.java index 33592287d..67ce8a863 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/Argument.java +++ b/public/java/src/org/broadinstitute/sting/commandline/Argument.java @@ -62,7 +62,7 @@ public @interface Argument { * --help argument is specified. * @return Doc string associated with this command-line argument. */ - String doc(); + String doc() default "Undocumented option"; /** * Is this argument required. If true, the command-line argument system will diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index a4a6636fe..ef73840b3 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -27,28 +27,28 @@ class PacbioProcessingPipeline extends QScript { @Input(doc="dbsnp VCF file to use ", shortName="D", required=true) var dbSNP: File = _ - @Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false) + @Argument(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false) var threads: Int = 0 - @Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) + @Argument(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) var sample: String = "NA" @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ - @Input(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false) + @Argument(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false) var BLASR_BAM: Boolean = false @Hidden - @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false) + @Argument(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false) var dbq: Int = 20 @Hidden - @Input(shortName="bwastring", required=false) + @Argument(shortName="bwastring", required=false) var bwastring: String = "" @Hidden - @Input(shortName = "test", fullName = "test_mode", required = false) + @Argument(shortName = "test", fullName = "test_mode", required = false) var testMode: Boolean = false val queueLogDir: String = ".qlog/" diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 429428c4c..2c0f43bac 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -25,7 +25,7 @@ package org.broadinstitute.sting.queue import java.io.File -import org.broadinstitute.sting.commandline.Argument +import org.broadinstitute.sting.commandline.{ClassType, Argument} /** * Default settings settable on the command line and passed to CommandLineFunctions. @@ -41,6 +41,7 @@ class QSettings { var jobQueue: String = _ @Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs. Min = 0, Max = 100", required=false) + @ClassType(classOf[Int]) var jobPriority: Option[Int] = None @Argument(fullName="job_native_arg", shortName="jobNative", doc="Native arguments to pass to the job runner.", required=false) @@ -53,15 +54,19 @@ class QSettings { var jobEnvironmentNames: Seq[String] = Nil @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes. If not set defaults to 2GB.", required=false) + @ClassType(classOf[Double]) var memoryLimit: Option[Double] = Some(2) @Argument(fullName="memory_limit_threshold", shortName="memLimitThresh", doc="After passing this threshold stop increasing memory limit for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) var memoryLimitThreshold: Option[Double] = None @Argument(fullName="resident_memory_limit", shortName="resMemLimit", doc="Default resident memory limit for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) var residentLimit: Option[Double] = None @Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) var residentRequest: Option[Double] = None @Argument(fullName="resident_memory_request_parameter", shortName="resMemReqParam", doc="Parameter for resident memory requests. By default not requested.", required=false) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala index 980a22e8e..15101fd75 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala @@ -159,12 +159,11 @@ object ReflectionUtils { private def getGenericTypes(field: Field): Option[Array[Class[_]]] = { // TODO: Refactor: based on java code in org.broadinstitute.sting.commandline.ArgumentTypeDescriptor // If this is a parameterized collection, find the contained type. If blow up if only one type exists. - if (field.getGenericType.isInstanceOf[ParameterizedType]) { + if (hasAnnotation(field, classOf[ClassType])) { + Some(Array(getAnnotation(field, classOf[ClassType]).value)) + } else if (field.getGenericType.isInstanceOf[ParameterizedType]) { val parameterizedType = field.getGenericType.asInstanceOf[ParameterizedType] Some(parameterizedType.getActualTypeArguments.map(_.asInstanceOf[Class[_]])) - } else if (hasAnnotation(field, classOf[ClassType])) { - Some(Array(getAnnotation(field, classOf[ClassType]).value)) - } - else None + } else None } } From 637e0cf1512c68e8964b3e1d5bce6d0755cad61b Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 11 Oct 2012 13:59:21 -0400 Subject: [PATCH 06/26] CountReads does not permit the use of output files --- .../src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 301fa5b9b..1d2c6c9cc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -33,7 +33,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T CountReads \ - * -o output.txt \ * -I input.bam \ * [-L input.intervals] * From 45f64425a3f4401761845791e25693476c065afe Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Fri, 19 Oct 2012 14:29:02 -0400 Subject: [PATCH 07/26] Update read metrics per shard rather than locus --- .../sting/gatk/traversals/TraverseActiveRegions.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 2b7b2f9f5..5d38df0f5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -104,10 +104,11 @@ public class TraverseActiveRegions extends TraversalEngine Date: Sat, 20 Oct 2012 16:38:18 -0400 Subject: [PATCH 09/26] Refactoring the PairHMM util class to allow for multiple implementations which can be specified by the callers via an enum argument. Adding an optimized PairHMM implementation which caches per-read calculations as well as a logless implementation which drastically reduces the runtime of the HMM while also increasing the precision of the result. In the HaplotypeCaller we now lexicographically sort the haplotypes to take maximal benefit of the haplotype offset optimization which only recalculates the HMM matrices after the first differing base in the haplotype. Many thanks to Mauricio for all the initial groundwork for these optimizations. The change to the one HC integration test is in the fourth decimal of HaplotypeScore. --- .../gatk/walkers/genotyper/ErrorModel.java | 2 +- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 12 +- .../LikelihoodCalculationEngine.java | 53 ++-- .../sting/utils/pairhmm/CachingPairHMM.java | 181 ++++++++++++ .../utils/pairhmm/LoglessCachingPairHMM.java | 187 +++++++++++++ .../HaplotypeCallerIntegrationTest.java | 2 +- .../sting/utils/pairhmm}/PairHMMUnitTest.java | 242 ++++++---------- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../genotyper/UnifiedArgumentCollection.java | 13 +- .../indels/PairHMMIndelErrorModel.java | 42 +-- .../broadinstitute/sting/utils/Haplotype.java | 16 ++ .../broadinstitute/sting/utils/PairHMM.java | 259 ------------------ .../sting/utils/pairhmm/ExactPairHMM.java | 107 ++++++++ .../sting/utils/pairhmm/OriginalPairHMM.java | 105 +++++++ .../sting/utils/pairhmm/PairHMM.java | 45 +++ 16 files changed, 813 insertions(+), 457 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java create mode 100644 protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java rename {public/java/test/org/broadinstitute/sting/utils => protected/java/test/org/broadinstitute/sting/utils/pairhmm}/PairHMMUnitTest.java (56%) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/PairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index f76225134..8042c15d8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -72,7 +72,7 @@ public class ErrorModel { haplotypeMap = new LinkedHashMap(); if (refSampleVC.isIndel()) { pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index fc0c526bc..f09a1ea3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); haplotypeMap = new LinkedHashMap(); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 71e4f5f8a..5f2b5775c 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -114,6 +115,12 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) protected PrintStream graphWriter = null; + /** + * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + @Hidden @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; @@ -287,7 +294,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); - likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false ); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); } @@ -400,6 +407,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! + // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM + Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); + // evaluate each sample's reads against all haplotypes final HashMap> perSampleReadList = splitReadsBySample( activeRegion.getReads() ); final HashMap> perSampleFilteredReadList = splitReadsBySample( filteredReads ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 14c1cd59d..62554c4ab 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -30,6 +30,9 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -44,8 +47,25 @@ public class LikelihoodCalculationEngine { private final boolean DEBUG; private final PairHMM pairHMM; - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) { - pairHMM = new PairHMM( noBanded ); + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) { + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + pairHMM = new CachingPairHMM(); + break; + case LOGLESS_CACHING: + pairHMM = new LoglessCachingPairHMM(); + break; + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING."); + } + this.constantGCP = constantGCP; DEBUG = debug; } @@ -69,23 +89,18 @@ public class LikelihoodCalculationEngine { X_METRIC_LENGTH += 2; Y_METRIC_LENGTH += 2; - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); // for each sample's reads for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray ); + computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() ); } } - private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample ) { final int numHaplotypes = haplotypes.size(); final int numReads = reads.size(); @@ -113,9 +128,8 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(), - readQuals, readInsQuals, readDelQuals, overallGCP, - haplotypeStart, matchMetricArray, XMetricArray, YMetricArray); + readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), + readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0); readCounts[jjj][iii] = readCount; } } @@ -130,7 +144,7 @@ public class LikelihoodCalculationEngine { return iii; } } - return b1.length; + return Math.min(b1.length, b2.length); } @Requires({"haplotypes.size() > 0"}) @@ -280,7 +294,7 @@ public class LikelihoodCalculationEngine { final int numHaplotypes = haplotypes.size(); final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final ArrayList bestHaplotypesIndexList = new ArrayList(); - bestHaplotypesIndexList.add(0); // always start with the reference haplotype + bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype // set up the default 1-to-1 haplotype mapping object final ArrayList> haplotypeMapping = new ArrayList>(); for( final Haplotype h : haplotypes ) { @@ -322,6 +336,13 @@ public class LikelihoodCalculationEngine { return bestHaplotypes; } + public static int findReferenceIndex( final List haplotypes ) { + for( final Haplotype h : haplotypes ) { + if( h.isReference() ) { return haplotypes.indexOf(h); } + } + throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); + } + public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java new file mode 100644 index 000000000..282db45d5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ + +public class CachingPairHMM extends OriginalPairHMM { + + double[][] constantMatrix = null; // The cache in the CachingPairHMM + double[][] distanceMatrix = null; // The cache in the CachingPairHMM + + protected static final double [] firstRowConstantMatrix = { + QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)), + QualityUtils.qualToProbLog10(DEFAULT_GCP), + QualityUtils.qualToErrorProbLog10(DEFAULT_GOP), + QualityUtils.qualToErrorProbLog10(DEFAULT_GCP), + 0.0, + 0.0 + }; + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + constantMatrix = new double[X_METRIC_LENGTH][6]; + distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + // fill in the first row + for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) { + updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + } + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + if( recacheReadValues ) { + initializeConstants( insertionGOP, deletionGOP, overallGCP ); + } + initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + for (int i = 2; i < X_METRIC_LENGTH; i++) { + for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) { + updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializeDistanceMatrix( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final int startIndex ) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + public void initializeConstants( final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP ) { + + final int l = insertionGOP.length; + constantMatrix[1] = firstRowConstantMatrix; + for (int i = 0; i < l; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP); + constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]); + constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]); + constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]); + constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + } + constantMatrix[l+1][4] = 0.0; + constantMatrix[l+1][5] = 0.0; + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param constants an array with the six constants relevant to this location + * @param matchMetricArray the matches likelihood matrix + * @param XMetricArray the insertions likelihood matrix + * @param YMetricArray the deletions likelihood matrix + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + matchMetricArray[indI][indJ] = prior + + MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0], + XMetricArray[indI - 1][indJ - 1] + constants[1], + YMetricArray[indI - 1][indJ - 1] + constants[1] ); + XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2], + XMetricArray[indI - 1][indJ] + constants[3]); + YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4], + YMetricArray[indI][indJ - 1] + constants[5]); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java new file mode 100644 index 000000000..d2aef5bb5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ + +public class LoglessCachingPairHMM extends CachingPairHMM { + + protected static final double SCALE_FACTOR_LOG10 = 300.0; + + protected static final double [] firstRowConstantMatrix = { + QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)), + QualityUtils.qualToProb(DEFAULT_GCP), + QualityUtils.qualToErrorProb(DEFAULT_GOP), + QualityUtils.qualToErrorProb(DEFAULT_GCP), + 1.0, + 1.0 + }; + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], 0.0); + Arrays.fill(XMetricArray[iii], 0.0); + Arrays.fill(YMetricArray[iii], 0.0); + } + + // the initial condition + matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // Math.log10(1.0); + + constantMatrix = new double[X_METRIC_LENGTH][6]; + distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + // fill in the first row + for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) { + updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + } + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + if( recacheReadValues ) { + initializeConstants( insertionGOP, deletionGOP, overallGCP ); + } + initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + for (int i = 2; i < X_METRIC_LENGTH; i++) { + for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) { + updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializeDistanceMatrix( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final int startIndex ) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + public void initializeConstants( final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP ) { + + final int l = insertionGOP.length; + constantMatrix[1] = firstRowConstantMatrix; + for (int i = 0; i < l; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP); + constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]); + constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]); + constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]); + constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]); + constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]); + } + constantMatrix[l+1][4] = 1.0; + constantMatrix[l+1][5] = 1.0; + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param constants an array with the six constants relevant to this location + * @param matchMetricArray the matches likelihood matrix + * @param XMetricArray the insertions likelihood matrix + * @param YMetricArray the deletions likelihood matrix + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] + + XMetricArray[indI - 1][indJ - 1] * constants[1] + + YMetricArray[indI - 1][indJ - 1] * constants[1] ); + XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3]; + YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5]; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a8ea4b7da..a441e6c77 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -70,7 +70,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("fa5c5eb996e95aed12c50d70e6dd74d7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c54c0c9411054bf629bfd98b616e53fc")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java similarity index 56% rename from public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java rename to protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index 22bcb1bbf..6281054b1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -23,24 +23,26 @@ */ // our package -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.pairhmm; // the imports for unit testing. - import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; - public class PairHMMUnitTest extends BaseTest { final static boolean EXTENSIVE_TESTING = true; - PairHMM hmm = new PairHMM( false ); // reference implementation - PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding + PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation + PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation + PairHMM cachingHMM = new CachingPairHMM(); + PairHMM loglessHMM = new LoglessCachingPairHMM(); // -------------------------------------------------------------------------------- // @@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest { final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; - public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) { this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); } @@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest { } public double expectedLogL() { - return expectedQual / -10.0; + return (expectedQual / -10.0) + 0.03 ; } - public double tolerance() { - return 0.1; // TODO FIXME arbitrary + public double toleranceFromTheoretical() { + return 0.2; } - public double calcLogL() { + public double toleranceFromReference() { + return 1E-4; + } - double logL = hmm.computeReadLikelihoodGivenHaplotype( + public double toleranceFromExact() { + return 1E-9; + } + + public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) { + pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length); + return pairHMM.computeReadLikelihoodGivenHaplotypeLog10( refBasesWithContext, readBasesWithContext, - qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true), - qualAsBytes(gcp, false)); - - return logL; + qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel), + qualAsBytes(gcp, false, anchorIndel), 0, true); } private final byte[] asBytes(final String bases, final boolean left, final boolean right) { return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); } - private byte[] qualAsBytes(final int phredQual, final boolean doGOP) { + private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) { final byte phredQuals[] = new byte[readBasesWithContext.length]; - // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM - Arrays.fill(phredQuals, (byte)100); - // update just the bases corresponding to the provided micro read with the quality scores - if( doGOP ) { - phredQuals[0 + CONTEXT.length()] = (byte)phredQual; - } else { - for ( int i = 0; i < read.length(); i++) - phredQuals[i + CONTEXT.length()] = (byte)phredQual; - } + if( anchorIndel ) { + // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM + Arrays.fill(phredQuals, (byte)100); - return phredQuals; - } - } - - final Random random = new Random(87865573); - private class BandedLikelihoodTestProvider extends TestDataProvider { - final String ref, read; - final byte[] refBasesWithContext, readBasesWithContext; - final int baseQual, insQual, delQual, gcp; - final int expectedQual; - final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC"; - final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA"; - final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT"; - final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA"; - final byte[] baseQuals, insQuals, delQuals, gcps; - - public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { - this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); - } - - public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { - super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); - this.baseQual = baseQual; - this.delQual = delQual; - this.insQual = insQual; - this.gcp = gcp; - this.read = read; - this.ref = ref; - this.expectedQual = expectedQual; - - refBasesWithContext = asBytes(ref, left, right); - readBasesWithContext = asBytes(read, false, false); - baseQuals = qualAsBytes(baseQual); - insQuals = qualAsBytes(insQual); - delQuals = qualAsBytes(delQual); - gcps = qualAsBytes(gcp, false); - } - - public double expectedLogL() { - double logL = hmm.computeReadLikelihoodGivenHaplotype( - refBasesWithContext, readBasesWithContext, - baseQuals, insQuals, delQuals, gcps); - - return logL; - } - - public double tolerance() { - return 0.2; // TODO FIXME arbitrary - } - - public double calcLogL() { - - double logL = bandedHMM.computeReadLikelihoodGivenHaplotype( - refBasesWithContext, readBasesWithContext, - baseQuals, insQuals, delQuals, gcps); - - return logL; - } - - private final byte[] asBytes(final String bases, final boolean left, final boolean right) { - return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); - } - - private byte[] qualAsBytes(final int phredQual) { - return qualAsBytes(phredQual, true); - } - - private byte[] qualAsBytes(final int phredQual, final boolean addRandom) { - final byte phredQuals[] = new byte[readBasesWithContext.length]; - Arrays.fill(phredQuals, (byte)phredQual); - if(addRandom) { - for( int iii = 0; iii < phredQuals.length; iii++) { - phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3)); + // update just the bases corresponding to the provided micro read with the quality scores + if( doGOP ) { + phredQuals[0 + CONTEXT.length()] = (byte)phredQual; + } else { + for ( int i = 0; i < read.length(); i++) + phredQuals[i + CONTEXT.length()] = (byte)phredQual; } + } else { + Arrays.fill(phredQuals, (byte)phredQual); } + return phredQuals; } } @@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest { // test all combinations final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30); final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40); - final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); - final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2); for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { @@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest { for ( boolean insertionP : Arrays.asList(true, false)) { final String small = Utils.dupString((char)base, 1); - final String big = Utils.dupString((char)base, size); + final String big = Utils.dupString((char) base, size); final String ref = insertionP ? small : big; final String read = insertionP ? big : small; @@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest { return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); } - @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) - public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { - double calculatedLogL = cfg.calcLogL(); - double expectedLogL = cfg.expectedLogL(); - logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); - Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); - } - - @DataProvider(name = "BandedLikelihoodTestProvider") - public Object[][] makeBandedLikelihoodTests() { + final Random random = new Random(87860573); + @DataProvider(name = "OptimizedLikelihoodTestProvider") + public Object[][] makeOptimizedLikelihoodTests() { // context on either side is ACGTTGCA REF ACGTTGCA // test all combinations - final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30); - final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40); - final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10); - final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2); for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { for ( final int gcp : gcps ) { - - // test substitutions - for ( final byte refBase : BaseUtils.BASES ) { - for ( final byte readBase : BaseUtils.BASES ) { - final String ref = new String(new byte[]{refBase}); - final String read = new String(new byte[]{readBase}); - final int expected = refBase == readBase ? 0 : baseQual; - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - } - } - - // test insertions and deletions - for ( final int size : sizes ) { - for ( final byte base : BaseUtils.BASES ) { - final int expected = indelQual + (size - 2) * gcp; - - for ( boolean insertionP : Arrays.asList(true, false)) { - final String small = Utils.dupString((char)base, 1); - final String big = Utils.dupString((char)base, size); - - final String ref = insertionP ? small : big; - final String read = insertionP ? big : small; - - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + for ( final int refSize : sizes ) { + for ( final int readSize : sizes ) { + String ref = ""; + String read = ""; + for( int iii = 0; iii < refSize; iii++) { + ref += (char) BaseUtils.BASES[random.nextInt(4)]; } + for( int iii = 0; iii < readSize; iii++) { + read += (char) BaseUtils.BASES[random.nextInt(4)]; + } + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, false, true); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true); } } } } } - return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class); + return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); } - @Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true) - public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) { - double calculatedLogL = cfg.calcLogL(); + @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) + public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { + double exactLogL = cfg.calcLogL( exactHMM, true ); + double calculatedLogL = cfg.calcLogL( originalHMM, true ); + double optimizedLogL = cfg.calcLogL( cachingHMM, true ); + double loglessLogL = cfg.calcLogL( loglessHMM, true ); double expectedLogL = cfg.expectedLogL(); - logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); - Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical()); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical()); + Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference()); + Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact()); + } + + @Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true) + public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) { + double exactLogL = cfg.calcLogL( exactHMM, false ); + double calculatedLogL = cfg.calcLogL( originalHMM, false ); + double optimizedLogL = cfg.calcLogL( cachingHMM, false ); + double loglessLogL = cfg.calcLogL( loglessHMM, false ); + //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference()); + Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact()); } @Test @@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - double res1 = hmm.computeReadLikelihoodGivenHaplotype( + originalHMM.initialize(mread.length, haplotype1.length); + double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, - gcp); - + gcp, 0, false); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); @@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - double res1 = hmm.computeReadLikelihoodGivenHaplotype( + originalHMM.initialize(mread.length, haplotype1.length); + double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, - gcp); - + gcp, 0, false); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index ebfbc49fe..e0ffb2ba6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -57,7 +57,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 885463fcb..3eda2017c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -65,6 +66,12 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + /** + * The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL; + /** * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value. @@ -112,10 +119,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) public int INDEL_HAPLOTYPE_SIZE = 80; - @Hidden - @Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false) - public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false; - @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; @@ -221,10 +224,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; uac.exactCallsLog = exactCallsLog; + uac.pairHMM = pairHMM; // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION; return uac; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 9234a9fe8..3d287057c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -30,8 +30,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.PairHMM; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.ExactPairHMM; +import org.broadinstitute.sting.utils.pairhmm.OriginalPairHMM; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -48,7 +51,6 @@ public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; private boolean DEBUG = false; - private boolean bandedLikelihoods = false; private static final int MAX_CACHED_QUAL = 127; @@ -67,6 +69,8 @@ public class PairHMMIndelErrorModel { private final byte[] GAP_OPEN_PROB_TABLE; private final byte[] GAP_CONT_PROB_TABLE; + private final PairHMM pairHMM; + ///////////////////////////// // Private Member Variables ///////////////////////////// @@ -85,15 +89,26 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) { + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) { this.DEBUG = deb; - this.bandedLikelihoods = bandedLikelihoods; + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + case LOGLESS_CACHING: + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL and EXACT."); + } // fill gap penalty table, affine naive model: this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - for (int i = 0; i < START_HRUN_GAP_IDX; i++) { GAP_OPEN_PROB_TABLE[i] = indelGOP; GAP_CONT_PROB_TABLE[i] = indelGCP; @@ -190,7 +205,6 @@ public class PairHMMIndelErrorModel { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; - final PairHMM pairHMM = new PairHMM(bandedLikelihoods); int readIdx=0; for (PileupElement p: pileup) { @@ -303,8 +317,6 @@ public class PairHMMIndelErrorModel { final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases); int j=0; - // initialize path metric and traceback memories for likelihood computation - double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; @@ -341,14 +353,9 @@ public class PairHMMIndelErrorModel { final int X_METRIC_LENGTH = readBases.length+2; final int Y_METRIC_LENGTH = haplotypeBases.length+2; - if (matchMetricArray == null) { + if (previousHaplotypeSeen == null) { //no need to reallocate arrays for each new haplotype, as length won't change - matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); } int startIndexInHaplotype = 0; @@ -356,11 +363,10 @@ public class PairHMMIndelErrorModel { startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); previousHaplotypeSeen = haplotypeBases.clone(); - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), (read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), - contextLogGapContinuationProbabilities, - startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); + contextLogGapContinuationProbabilities, startIndexInHaplotype, false); if (DEBUG) { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index befd24307..b30d47074 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.Serializable; import java.util.*; public class Haplotype { @@ -184,6 +185,21 @@ public class Haplotype { return new Haplotype(newHaplotypeBases); } + public static class HaplotypeBaseComparator implements Comparator, Serializable { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + final byte[] arr1 = hap1.getBases(); + final byte[] arr2 = hap2.getBases(); + // compares byte arrays using lexical ordering + final int len = Math.min(arr1.length, arr2.length); + for( int iii = 0; iii < len; iii++ ) { + final int cmp = arr1[iii] - arr2[iii]; + if (cmp != 0) { return cmp; } + } + return arr2.length - arr1.length; + } + } + public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java deleted file mode 100644 index 15f7a7869..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.*; - -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. - * User: rpoplin - * Date: 3/1/12 - */ - -public class PairHMM { - private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; - private static final byte DEFAULT_GOP = (byte) 45; - private static final byte DEFAULT_GCP = (byte) 10; - private static final double BANDING_TOLERANCE = 22.0; - private static final int BANDING_CLUSTER_WINDOW = 12; - private final boolean noBanded; - - public PairHMM() { - noBanded = false; - } - - public PairHMM( final boolean noBanded ) { - this.noBanded = noBanded; - } - - - public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, - final int X_METRIC_LENGTH) { - - for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { - Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); - } - - // the initial condition - matchMetricArray[1][1] = 0.0; // Math.log10(1.0); - - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); - - return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // ensure that all the qual scores have valid values - for( int iii = 0; iii < readQuals.length; iii++ ) { - readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); - } - - if( false ) { - final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step - final ArrayList workToBeAdded = new ArrayList(); - final ArrayList calculatedValues = new ArrayList(); - final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; - workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype - - for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over - //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order - int el = 1; - for( int work : workQueue ) { - // choose the appropriate diagonal baseline location - int iii = 0; - int jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work; - jjj -= work; - while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { - iii--; - jjj++; - work--; - } - if( !detectClusteredStartLocations(workToBeAdded, work ) ) { - workToBeAdded.add(work); // keep this thread going once it has started - } - - if( work >= el - 3 ) { - // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value - double maxElement = Double.NEGATIVE_INFINITY; - for( el = work; el < numDiags + 1; el++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - calculatedValues.add(bestMetric); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix - break; - } - if( --jjj <= 0 ) { // don't walk off the edge of the matrix - break; - } - } - - // find a local maximum to start a new band in the work queue - double localMaxElement = Double.NEGATIVE_INFINITY; - int localMaxElementIndex = 0; - for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { - final double bestMetric = calculatedValues.get(kkk); - if( bestMetric > localMaxElement ) { - localMaxElement = bestMetric; - localMaxElementIndex = kkk; - } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum - if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { - workToBeAdded.add( work + localMaxElementIndex ); - } - break; - } - } - calculatedValues.clear(); - - // reset iii and jjj to the appropriate diagonal baseline location - iii = 0; - jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work-1; - jjj -= work-1; - - // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value - for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - } - } - } - workQueue.clear(); - workQueue.addAll(workToBeAdded); - workToBeAdded.clear(); - } - } else { - // simple rectangular version of update loop, slow - for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { - for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { - if( (iii == 1 && jjj == 1) ) { continue; } - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, - matchMetricArray, XMetricArray, YMetricArray); - } - } - } - - // final probability is the log10 sum of the last element in all three state arrays - final int endI = X_METRIC_LENGTH - 1; - final int endJ = Y_METRIC_LENGTH - 1; - return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); - } - - private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, - final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions - final int im1 = indI - 1; - final int jm1 = indJ - 1; - - // update the match array - double pBaseReadLog10 = 0.0; // Math.log10(1.0); - if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state - final byte x = readBases[im1-1]; - final byte y = haplotypeBases[jm1-1]; - final byte qual = readQuals[im1-1]; - pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); - } - final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); - final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); - final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); - matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); - - // update the X (insertion) array - final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); - final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); - - // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype - final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); - final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); - } - - // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other - private boolean detectClusteredStartLocations( final ArrayList list, int loc ) { - for(int x : list) { - if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { - return true; - } - } - return false; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java new file mode 100644 index 000000000..17089ee81 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java @@ -0,0 +1,107 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 10/16/12 + */ + +public class ExactPairHMM extends PairHMM { + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + } + + // the initial condition + matchMetricArray[1][1] = 0.0; // Math.log10(1.0); + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.log10sumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]}); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0}); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1}); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2}); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java new file mode 100644 index 000000000..cd946cdf1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * User: rpoplin + * Date: 3/1/12 + */ + +public class OriginalPairHMM extends ExactPairHMM { + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java new file mode 100644 index 000000000..7a1399c32 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 10/16/12 + */ + +public abstract class PairHMM { + protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; + protected static final byte DEFAULT_GOP = (byte) 45; + protected static final byte DEFAULT_GCP = (byte) 10; + + public enum HMM_IMPLEMENTATION { + /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ + EXACT, + /* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */ + ORIGINAL, + /* Optimized version of the PairHMM which caches per-read computations */ + CACHING, + /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ + LOGLESS_CACHING + } + + protected double[][] matchMetricArray = null; + protected double[][] XMetricArray = null; + protected double[][] YMetricArray = null; + + public abstract void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ); + + @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", + "readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 likelihood + public abstract double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ); +} From 2c624f76c83f8cfec61001f89928ebe39ab8d713 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 20 Oct 2012 20:35:54 -0400 Subject: [PATCH 10/26] Refactoring the Unified (and Standard) Argument Collections because it was really ugly that the subclass had to do all the cloning for the super class. The clone() method is really not recommended best practice in Java anyways, so I changed it so that we use standard overloaded constructors. Confirmed that the Haplotype Caller --help docs do not include UG-specific arguments. --- .../haplotypecaller/HaplotypeCaller.java | 4 +- .../StandardCallerArgumentCollection.java | 26 ++++++ .../genotyper/UnifiedArgumentCollection.java | 90 +++++++------------ 3 files changed, 62 insertions(+), 58 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 5f2b5775c..6d6351fc5 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -241,14 +241,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested - UnifiedArgumentCollection simpleUAC = UAC.clone(); + UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 085a60191..9b9f04228 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -69,7 +69,33 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false) public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2; + /** + * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. + * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we + * will try to remove (N * contamination fraction) bases for each alternate allele. + */ + @Hidden + @Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false) + public double CONTAMINATION_PERCENTAGE = 0.0; + @Hidden @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; + + + public StandardCallerArgumentCollection() { } + + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) { + this.alleles = SCAC.alleles; + this.GenotypingMode = SCAC.GenotypingMode; + this.heterozygosity = SCAC.heterozygosity; + this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; + this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; + this.OutputMode = SCAC.OutputMode; + this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; + this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE; + this.exactCallsLog = SCAC.exactCallsLog; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 3eda2017c..17137c5e9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -186,63 +186,41 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! - public UnifiedArgumentCollection clone() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - - uac.GLmodel = GLmodel; - uac.AFmodel = AFmodel; - uac.heterozygosity = heterozygosity; - uac.PCR_error = PCR_error; - uac.GenotypingMode = GenotypingMode; - uac.OutputMode = OutputMode; - uac.NO_SLOD = NO_SLOD; - uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; - uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; - uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; - uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; - uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; - uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; - uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; - uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; - uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; - uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; - uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; - uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; - uac.alleles = alleles; - uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES; - uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS; - uac.GLmodel = GLmodel; - uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL; - uac.referenceSampleRod = referenceSampleRod; - uac.referenceSampleName = referenceSampleName; - uac.samplePloidy = samplePloidy; - uac.maxQualityScore = minQualityScore; - uac.phredScaledPrior = phredScaledPrior; - uac.minPower = minPower; - uac.minReferenceDepth = minReferenceDepth; - uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; - uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; - uac.exactCallsLog = exactCallsLog; - uac.pairHMM = pairHMM; - - // todo- arguments to remove - uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - return uac; - } - public UnifiedArgumentCollection() { } - public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) { - super(); - this.alleles = SCAC.alleles; - this.GenotypingMode = SCAC.GenotypingMode; - this.heterozygosity = SCAC.heterozygosity; - this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; - this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; - this.OutputMode = SCAC.OutputMode; - this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; - this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; - this.exactCallsLog = SCAC.exactCallsLog; + public UnifiedArgumentCollection(final StandardCallerArgumentCollection SCAC) { + super(SCAC); + } + + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) { + this.GLmodel = uac.GLmodel; + this.AFmodel = uac.AFmodel; + this.PCR_error = uac.PCR_error; + this.NO_SLOD = uac.NO_SLOD; + this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; + this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE; + this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION; + this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING; + this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE; + this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY; + this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY; + this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY; + this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO; + this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE; + this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL; + this.referenceSampleRod = uac.referenceSampleRod; + this.referenceSampleName = uac.referenceSampleName; + this.samplePloidy = uac.samplePloidy; + this.maxQualityScore = uac.minQualityScore; + this.phredScaledPrior = uac.phredScaledPrior; + this.minPower = uac.minPower; + this.minReferenceDepth = uac.minReferenceDepth; + this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; + this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; + this.pairHMM = uac.pairHMM; + + // todo- arguments to remove + this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } } From 841a906f210ac363817fe3e84794281501e4b30f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 20 Oct 2012 23:31:56 -0400 Subject: [PATCH 11/26] Adding a hidden (for now) argument to UG (and HC) that tells the caller that the incoming samples are contaminated by N% and to fix it by aggressively down-sampling all alleles. This actually works. Yes, you read that right: given that we know what N is, we can make good calls on bams that have N% contamination. Only hooked up for SNPS right now. No tests added yet. --- ...NPGenotypeLikelihoodsCalculationModel.java | 67 +++++++++++++++++-- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 76ba72017..d053e13a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -41,19 +41,20 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.*; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; +import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final boolean useAlleleFromVCF; private final double[] likelihoodSums = new double[4]; - + private final ArrayList[] alleleStratifiedElements = new ArrayList[4]; + protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i] = new ArrayList(); } public VariantContext getLikelihoods(final RefMetaDataTracker tracker, @@ -78,8 +79,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ArrayList GLs = new ArrayList(contexts.size()); for ( Map.Entry sample : contexts.entrySet() ) { ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 ) + pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE); if ( useBAQedPileup ) - pileup = createBAQedPileup( pileup ); + pileup = createBAQedPileup(pileup); // create the GenotypeLikelihoods object final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.PCR_error); @@ -150,8 +153,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // create the genotypes; no-call everyone for now final GenotypesContext genotypes = GenotypesContext.create(); - final List noCall = new ArrayList(); - noCall.add(Allele.NO_CALL); for ( SampleGenotypeData sampleData : GLs ) { final double[] allLikelihoods = sampleData.GL.getLikelihoods(); @@ -202,6 +203,42 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC return allelesToUse; } + public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) { + // special case removal of all reads + if ( contaminationPercentage >= 1.0 ) + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); + + // start by stratifying the reads by the alleles they represent at this position + for( final PileupElement pe : pileup ) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } + + // Down-sample *each* allele by the contamination fraction applied to the entire pileup. + // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. + int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage); + final TreeSet elementsToKeep = new TreeSet(new Comparator() { + @Override + public int compare(PileupElement element1, PileupElement element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + + for ( int i = 0; i < 4; i++ ) { + final ArrayList alleleList = alleleStratifiedElements[i]; + if ( alleleList.size() > numReadsToRemove ) + elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove)); + } + + // clean up pointers so memory can be garbage collected if needed + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i].clear(); + + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); + } + public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { final List BAQedElements = new ArrayList(); for( final PileupElement PE : pileup ) { @@ -220,6 +257,22 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); } } + private List downsampleElements(final ArrayList elements, final int numElementsToRemove) { + final int pileupSize = elements.size(); + final BitSet itemsToRemove = new BitSet(pileupSize); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + ArrayList elementsToKeep = new ArrayList(pileupSize - numElementsToRemove); + for ( int i = 0; i < pileupSize; i++ ) { + if ( !itemsToRemove.get(i) ) + elementsToKeep.add(elements.get(i)); + } + + return elementsToKeep; + } + private static class SampleGenotypeData { public final String name; From d44d5b827538789bae68c47c1634080e6c7d04c9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 21 Oct 2012 01:29:59 -0400 Subject: [PATCH 12/26] Fix RawHapMapCodec so that it can build indexes. Minor fixes to VCF codec. --- .../sting/utils/codecs/hapmap/RawHapMapCodec.java | 9 +++++++++ .../broadinstitute/sting/utils/codecs/vcf/VCFCodec.java | 4 +--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java index 916fb43ea..6b3fce966 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -25,8 +25,11 @@ package org.broadinstitute.sting.utils.codecs.hapmap; import org.broad.tribble.AsciiFeatureCodec; +import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.annotation.Strand; +import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; +import org.broad.tribble.readers.PositionalBufferedStream; import java.io.IOException; import java.util.Arrays; @@ -116,4 +119,10 @@ public class RawHapMapCodec extends AsciiFeatureCodec { } return headerLine; } + + @Override + public FeatureCodecHeader readHeader(final PositionalBufferedStream stream) throws IOException { + final AsciiLineReader br = new AsciiLineReader(stream); + return new FeatureCodecHeader(readHeader(br), br.getPosition()); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 4df1efee7..f12f13dc7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.*; import java.io.IOException; import java.util.*; @@ -119,7 +117,7 @@ public class VCFCodec extends AbstractVCFCodec { // empty set for passes filters List fFields = new LinkedList(); // otherwise we have to parse and cache the value - if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) + if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) fFields.add(filterString); else fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); From 0616b98551e7b3dbfa571ff93c65c9a1ef645029 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 21 Oct 2012 08:26:26 -0400 Subject: [PATCH 13/26] Not sure why we were setting the UAC variables instead of the simpleUAC ones when that's what we wanted. --- .../gatk/walkers/haplotypecaller/HaplotypeCaller.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 6d6351fc5..a08614deb 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -242,13 +242,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); - UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); From 67b9e7319e9be26d49ae1d75ea68aedbb0a0cfd0 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Sun, 21 Oct 2012 12:38:33 -0400 Subject: [PATCH 14/26] Fix for integration tests: new criterion in AF exact calculation model to trim alleles based on likelihoods does produce better results and resulting alleles changed in 2 sites at integration tests (and all subsequent sites after this had minor annotation differences due to RankSum dithering) --- .../UnifiedGenotyperGeneralPloidyIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 989f06ec5..652489a71 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -70,12 +70,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9acfe0019efdc91217ee070acb071228"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","06a512271631c5b511314a2618de82d7"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","c1d4dd793f61710a1b1fc5d82803210f"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","36a383adfdbf1f59656138b538a9920d"); } @Test(enabled = true) From 9c63cee9fcdb69a7a8e8d77a771ddb2afa18f7cd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Oct 2012 18:36:14 -0400 Subject: [PATCH 15/26] Moving pnrm to UnifiedArgumentCollection so it's available with the HaplotypeCaller --- .../arguments/StandardCallerArgumentCollection.java | 10 +++++++++- .../walkers/genotyper/UnifiedArgumentCollection.java | 11 +++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 9b9f04228..a511364f9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.arguments; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; @@ -82,7 +83,6 @@ public class StandardCallerArgumentCollection { @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; - public StandardCallerArgumentCollection() { } // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! @@ -97,5 +97,13 @@ public class StandardCallerArgumentCollection { this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE; this.exactCallsLog = SCAC.exactCallsLog; + this.AFmodel = SCAC.AFmodel; } + + /** + * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. + */ + @Advanced + @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) + public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 17137c5e9..abf0b4420 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +<<<<<<< HEAD import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.pairhmm.PairHMM; +======= +>>>>>>> 19181ee... Moving pnrm to UnifiedArgumentCollection so it's available with the HaplotypeCaller import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -38,13 +41,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - /** - * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. - */ - @Advanced - @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; - /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily * distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it @@ -219,7 +215,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; this.pairHMM = uac.pairHMM; - // todo- arguments to remove this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } From 99c9031cb4c20cf996202329ca17978ea1fde59e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Oct 2012 20:41:33 -0400 Subject: [PATCH 16/26] Merge AFCalcResultTracker into StateTracker, cleanup -- These two classes were really the same, and now they are actually the same! -- Cleanuped the interfaces, removed duplicate data -- Added lots of contracts, some of which found numerical issues with GeneralPloidyExactAFCalc (which have been patched over but not fixed) -- Moved goodProbability and goodProbabilityVector utilities to MathUtils. Very useful for contracts! --- .../afcalc/GeneralPloidyExactAFCalc.java | 320 ++++++------------ ...neralPloidyAFCalculationModelUnitTest.java | 9 +- .../gatk/walkers/genotyper/afcalc/AFCalc.java | 30 +- .../genotyper/afcalc/AFCalcResult.java | 55 +-- .../genotyper/afcalc/AFCalcResultTracker.java | 256 -------------- .../genotyper/afcalc/DiploidExactAFCalc.java | 55 ++- .../afcalc/OriginalDiploidExactAFCalc.java | 4 - .../genotyper/afcalc/StateTracker.java | 304 ++++++++++++++--- .../broadinstitute/sting/utils/MathUtils.java | 33 ++ 9 files changed, 440 insertions(+), 626 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 0e97c090c..3916c2549 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; -import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -69,8 +68,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { @Override public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { - combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker()); - return resultFromTracker(vc, log10AlleleFrequencyPriors); + combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } /** @@ -171,13 +170,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param numAlleles Number of alternate alleles * @param ploidyPerPool Number of samples per pool * @param log10AlleleFrequencyPriors Frequency priors - * @param resultTracker object to fill with output values */ - protected static void combineSinglePools(final GenotypesContext GLs, - final int numAlleles, - final int ploidyPerPool, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + protected void combineSinglePools(final GenotypesContext GLs, + final int numAlleles, + final int ploidyPerPool, + final double[] log10AlleleFrequencyPriors) { final ArrayList genotypeLikelihoods = getGLs(GLs, true); @@ -196,24 +193,24 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { if ( genotypeLikelihoods.size() <= 1 ) { // no meaningful GLs at all, just set the tracker to non poly values - resultTracker.reset(); // just mimic-ing call below - resultTracker.setLog10LikelihoodOfAFzero(0.0); + getStateTracker().reset(); // just mimic-ing call below + getStateTracker().setLog10LikelihoodOfAFzero(0.0); } else { for (int p=1; p ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects final HashMap indexesToACset = new HashMap(); @@ -230,16 +227,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { indexesToACset.put(zeroSet.getACcounts(), zeroSet); // keep processing while we have AC conformations that need to be calculated - StateTracker stateTracker = new StateTracker(); while ( !ACqueue.isEmpty() ) { - resultTracker.incNEvaluations(); + getStateTracker().incNEvaluations(); // compute log10Likelihoods final ExactACset ACset = ACqueue.remove(); - final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset); - - // adjust max likelihood seen if needed - if ( log10LofKs > stateTracker.getMaxLog10L()) - stateTracker.update(log10LofKs, ACset.getACcounts()); + final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); // clean up memory indexesToACset.remove(ACset.getACcounts()); @@ -260,39 +252,32 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param log10AlleleFrequencyPriors Prior object * @param originalPloidy Total ploidy of original combined pool * @param newGLPloidy Ploidy of GL vector - * @param resultTracker AFResult object - * @param stateTracker max likelihood observed so far * @param ACqueue Queue of conformations to compute * @param indexesToACset AC indices of objects in queue * @return max log likelihood */ - private static double calculateACConformationAndUpdateQueue(final ExactACset set, - final CombinedPoolLikelihoods newPool, - final CombinedPoolLikelihoods originalPool, - final double[] newGL, - final double[] log10AlleleFrequencyPriors, - final int originalPloidy, - final int newGLPloidy, - final AFCalcResultTracker resultTracker, - final StateTracker stateTracker, - final LinkedList ACqueue, - final HashMap indexesToACset) { + private double calculateACConformationAndUpdateQueue(final ExactACset set, + final CombinedPoolLikelihoods newPool, + final CombinedPoolLikelihoods originalPool, + final double[] newGL, + final double[] log10AlleleFrequencyPriors, + final int originalPloidy, + final int newGLPloidy, + final LinkedList ACqueue, + final HashMap indexesToACset) { // compute likeihood in "set" of new set based on original likelihoods final int numAlleles = set.getACcounts().getCounts().length; final int newPloidy = set.getACsum(); - final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker); + final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy); // add to new pool if (!Double.isInfinite(log10LofK)) newPool.add(set); - // TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) - //if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) { - if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) { - if ( VERBOSE ) - System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L()); + // TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) + if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) { return log10LofK; } @@ -323,67 +308,67 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { } - /** - * Naive combiner of two multiallelic pools - number of alt alleles must be the same. - * Math is generalization of biallelic combiner. - * - * For vector K representing an allele count conformation, - * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) - * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) - * @param originalPool First log-likelihood pool GL vector - * @param yy Second pool GL vector - * @param ploidy1 Ploidy of first pool (# of chromosomes in it) - * @param ploidy2 Ploidy of second pool - * @param numAlleles Number of alleles - * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param resultTracker Af calculation result object - */ - public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { -/* - final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); - final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); - - if (dim1 != originalPool.getLength() || dim2 != yy.length) - throw new ReviewedStingException("BUG: Inconsistent vector length"); - - if (ploidy2 == 0) - return; - - final int newPloidy = ploidy1 + ploidy2; - - // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) - // and L2(K) = Pr(D|AC2=K) * choose(m2,K) - GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); - final double[] x = originalPool.getLikelihoodsAsVector(true); - while(firstIterator.hasNext()) { - x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); - firstIterator.next(); - } - - GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); - final double[] y = yy.clone(); - while(secondIterator.hasNext()) { - y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); - secondIterator.next(); - } - - // initialize output to -log10(choose(m1+m2,[k1 k2...]) - final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); - final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); - - - // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K - while(outputIterator.hasNext()) { - final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); - double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); - - originalPool.add(likelihood, set, outputIterator.getLinearIndex()); - outputIterator.next(); - } -*/ - } +// /** +// * Naive combiner of two multiallelic pools - number of alt alleles must be the same. +// * Math is generalization of biallelic combiner. +// * +// * For vector K representing an allele count conformation, +// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) +// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) +// * @param originalPool First log-likelihood pool GL vector +// * @param yy Second pool GL vector +// * @param ploidy1 Ploidy of first pool (# of chromosomes in it) +// * @param ploidy2 Ploidy of second pool +// * @param numAlleles Number of alleles +// * @param log10AlleleFrequencyPriors Array of biallelic priors +// * @param resultTracker Af calculation result object +// */ +// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, +// final double[] log10AlleleFrequencyPriors, +// final AFCalcResultTracker resultTracker) { +///* +// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); +// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); +// +// if (dim1 != originalPool.getLength() || dim2 != yy.length) +// throw new ReviewedStingException("BUG: Inconsistent vector length"); +// +// if (ploidy2 == 0) +// return; +// +// final int newPloidy = ploidy1 + ploidy2; +// +// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) +// // and L2(K) = Pr(D|AC2=K) * choose(m2,K) +// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); +// final double[] x = originalPool.getLikelihoodsAsVector(true); +// while(firstIterator.hasNext()) { +// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); +// firstIterator.next(); +// } +// +// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); +// final double[] y = yy.clone(); +// while(secondIterator.hasNext()) { +// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); +// secondIterator.next(); +// } +// +// // initialize output to -log10(choose(m1+m2,[k1 k2...]) +// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); +// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); +// +// +// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K +// while(outputIterator.hasNext()) { +// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); +// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); +// +// originalPool.add(likelihood, set, outputIterator.getLinearIndex()); +// outputIterator.next(); +// } +//*/ +// } /** * Compute likelihood of a particular AC conformation and update AFresult object @@ -394,15 +379,13 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param numAlleles Number of alleles (including ref) * @param ploidy1 Ploidy of original pool (combined) * @param ploidy2 Ploidy of new pool - * @param resultTracker AFResult object * @return log-likehood of requested conformation */ - private static double computeLofK(final ExactACset set, - final CombinedPoolLikelihoods firstGLs, - final double[] secondGL, - final double[] log10AlleleFrequencyPriors, - final int numAlleles, final int ploidy1, final int ploidy2, - final AFCalcResultTracker resultTracker) { + private double computeLofK(final ExactACset set, + final CombinedPoolLikelihoods firstGLs, + final double[] secondGL, + final double[] log10AlleleFrequencyPriors, + final int numAlleles, final int ploidy1, final int ploidy2) { final int newPloidy = ploidy1 + ploidy2; @@ -420,8 +403,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; set.getLog10Likelihoods()[0] = log10Lof0; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return log10Lof0; } else { @@ -464,14 +447,14 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); - resultTracker.updateMLEifNeeded(log10LofK, altCounts); + getStateTracker().updateMLEifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - resultTracker.updateMAPifNeeded(log10LofK, altCounts); + getStateTracker().updateMAPifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); return log10LofK; } @@ -494,99 +477,6 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { return (sum == ploidy); } - /** - * Combines naively two biallelic pools (of arbitrary size). - * For two pools of size m1 and m2, we can compute the combined likelihood as: - * Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k) - * @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i - * @param newPLVector Second GL vector - * @param ploidy1 Ploidy of first pool (# of chromosomes in it) - * @param ploidy2 Ploidy of second pool - * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param resultTracker Af calculation result object - * @return Combined likelihood vector - */ - public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector, - final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { - - final int newPloidy = ploidy1 + ploidy2; - - final double[] combinedLikelihoods = new double[1+newPloidy]; - - /** Pre-fill result array and incorporate weights into input vectors - * Say L1(k) = Pr(D|AC1=k) * choose(m1,k) - * and L2(k) = Pr(D|AC2=k) * choose(m2,k) - * equation reduces to - * Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j) - * which is just plain convolution of L1 and L2 (with pre-existing vector) - */ - - // intialize result vector to -infinity - Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY); - - final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1); - for (int k=originalPool.getProbabilityVector().length; k< x.length; k++) - x[k] = Double.NEGATIVE_INFINITY; - - final double[] y = newPLVector.clone(); - - - final double log10Lof0 = x[0]+y[0]; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - - double maxElement = log10Lof0; - int maxElementIdx = 0; - int[] alleleCounts = new int[1]; - for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) { - double[] acc = new double[k+1]; - Arrays.fill(acc,Double.NEGATIVE_INFINITY); - double innerMax = Double.NEGATIVE_INFINITY; - - for (int j=0; j <=k; j++) { - double x1,y1; - - - if (k-j>=0 && k-j < y.length) - y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j); - else - continue; - - if (j < x.length) - x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j); - else - continue; - - if (Double.isInfinite(x1) || Double.isInfinite(y1)) - continue; - acc[j] = x1 + y1; - if (acc[j] > innerMax) - innerMax = acc[j]; - else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY) - break; - } - combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k); - maxElementIdx = k; - double maxDiff = combinedLikelihoods[k] - maxElement; - if (maxDiff > 0) - maxElement = combinedLikelihoods[k]; - else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) { - break; - } - - alleleCounts[0] = k; - resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts); - resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts); - - - } - - - return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true)); - } - - /** * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, * including updating the PL's, and assign genotypes accordingly @@ -675,10 +565,10 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * * @return genotype */ - private static void assignGenotype(final GenotypeBuilder gb, - final double[] newLikelihoods, - final List allelesToUse, - final int numChromosomes) { + private void assignGenotype(final GenotypeBuilder gb, + final double[] newLikelihoods, + final List allelesToUse, + final int numChromosomes) { final int numNewAltAlleles = allelesToUse.size() - 1; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java index 48f282901..1b3a4c0c0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java @@ -137,18 +137,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - - final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles); final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker); + final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, 1 + cfg.numAltAlleles, cfg.ploidy); + calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele]; - -// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount); + int calculatedAlleleCount = calc.getStateTracker().getAlleleCountsOfMAP()[allele]; Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index 07f88c9e3..927fadd94 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -50,7 +50,7 @@ public abstract class AFCalc implements Cloneable { protected Logger logger = defaultLogger; private SimpleTimer callTimer = new SimpleTimer(); - private final AFCalcResultTracker resultTracker; + private final StateTracker stateTracker; private ExactCallLogger exactCallLogger = null; protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { @@ -62,7 +62,7 @@ public abstract class AFCalc implements Cloneable { this.nSamples = nSamples; this.maxAlternateAllelesToGenotype = maxAltAlleles; this.maxAlternateAllelesForIndels = maxAltAllelesForIndels; - this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); + this.stateTracker = new StateTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); } public void enableProcessLog(final File exactCallsLog) { @@ -83,10 +83,10 @@ public abstract class AFCalc implements Cloneable { public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null"); - if ( resultTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); + if ( stateTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); // reset the result, so we can store our new result there - resultTracker.reset(); + stateTracker.reset(); final VariantContext vcWorking = reduceScope(vc); @@ -100,10 +100,20 @@ public abstract class AFCalc implements Cloneable { return result; } - @Deprecated - protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { - resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); - return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors); + /** + * Convert the final state of the state tracker into our result as an AFCalcResult + * + * Assumes that stateTracker has been updated accordingly + * + * @param vcWorking the VariantContext we actually used as input to the calc model (after reduction) + * @param log10AlleleFrequencyPriors the priors by AC vector + * @return a AFCalcResult describing the result of this calculation + */ + @Requires("stateTracker.getnEvaluations() > 0") + @Ensures("result != null") + protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { + stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); + return stateTracker.toAFCalcResult(log10AlleleFrequencyPriors); } // --------------------------------------------------------------------------- @@ -162,8 +172,8 @@ public abstract class AFCalc implements Cloneable { return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); } - public AFCalcResultTracker getResultTracker() { - return resultTracker; + public StateTracker getStateTracker() { + return stateTracker; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index 7cacb2060..a65772444 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -83,8 +83,8 @@ public class AFCalcResult { if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null"); if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); - if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); - if ( ! goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); this.alleleCountsOfMLE = alleleCountsOfMLE; this.nEvaluations = nEvaluations; @@ -147,7 +147,7 @@ public class AFCalcResult { * Due to computational / implementation constraints this may be smaller than * the actual list of alleles requested * - * @return a non-empty list of alleles used during genotyping + * @return a non-empty list of alleles used during genotyping, the first of which is the reference allele */ @Ensures({"result != null", "! result.isEmpty()"}) public List getAllelesUsedInGenotyping() { @@ -159,7 +159,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PosteriorOfAFEq0() { return log10PosteriorsOfAC[AF0]; } @@ -169,7 +169,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PosteriorOfAFGT0() { return log10PosteriorsOfAC[AF1p]; } @@ -179,7 +179,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10LikelihoodOfAFEq0() { return log10LikelihoodsOfAC[AF0]; } @@ -189,7 +189,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10LikelihoodOfAFGT0() { return log10LikelihoodsOfAC[AF1p]; } @@ -199,7 +199,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PriorOfAFEq0() { return log10PriorsOfAC[AF0]; } @@ -209,7 +209,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PriorOfAFGT0() { return log10PriorsOfAC[AF1p]; } @@ -263,7 +263,7 @@ public class AFCalcResult { * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping * @return the log10 probability that allele is segregating at this site */ - @Ensures("goodLog10Probability(result)") + @Ensures("MathUtils.goodLog10Probability(result)") public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { final Double log10pNonRef = log10pNonRefByAllele.get(allele); if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele); @@ -279,7 +279,7 @@ public class AFCalcResult { * @return freshly allocated log10 normalized posteriors vector */ @Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length") - @Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") + @Ensures("MathUtils.goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) { final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length]; for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ ) @@ -287,29 +287,6 @@ public class AFCalcResult { return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); } - /** - * Check that the log10 prob vector vector is well formed - * - * @param vector - * @param expectedSize - * @param shouldSumToOne - * - * @return true if vector is well-formed, false otherwise - */ - private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { - if ( vector.length != expectedSize ) return false; - - for ( final double pr : vector ) { - if ( ! goodLog10Probability(pr) ) - return false; - } - - if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 ) - return false; - - return true; // everything is good - } - /** * Computes the offset into linear vectors indexed by alt allele for allele * @@ -331,14 +308,4 @@ public class AFCalcResult { else return index - 1; } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value - * @return true if result is really well formed - */ - private static boolean goodLog10Probability(final double result) { - return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); - } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java deleted file mode 100644 index 5c926a4d8..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import com.google.java.contract.Ensures; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. - * User: ebanks - * Date: Dec 14, 2011 - * - * Useful helper class to communicate the results of the allele frequency calculation - * - * TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF? - */ -class AFCalcResultTracker { - protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; - - // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles - protected double log10MLE; - protected double log10MAP; - private final int[] alleleCountsOfMLE; - private final int[] alleleCountsOfMAP; - - // The posteriors seen, not including that of AF=0 - private static final int LIKELIHOODS_CACHE_SIZE = 5000; - private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE]; - private int currentLikelihoodsCacheIndex = 0; - protected Double log10LikelihoodsMatrixSum = null; - - // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) - private double log10LikelihoodOfAFzero; - private double log10PosteriorOfAFzero; - private int[] AClimits; - - int nEvaluations = 0; - - /** - * The list of alleles actually used in computing the AF - */ - private List allelesUsedInGenotyping = null; - - /** - * Create a results object capability of storing results for calls with up to maxAltAlleles - * - * @param maxAltAlleles an integer >= 1 - */ - public AFCalcResultTracker(final int maxAltAlleles) { - if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); - - alleleCountsOfMLE = new int[maxAltAlleles]; - alleleCountsOfMAP = new int[maxAltAlleles]; - - reset(); - } - - /** - * Returns a vector with maxAltAlleles values containing AC values at the MLE - * - * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, - * starting from index 0 (i.e., the first alt allele is at 0). The vector is always - * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values - * are meaningful. - * - * @return a vector with allele counts, not all of which may be meaningful - */ - @Ensures("result != null") - public int[] getAlleleCountsOfMLE() { - return alleleCountsOfMLE; - } - - /** - * Returns a vector with maxAltAlleles values containing AC values at the MAP - * - * @see #getAlleleCountsOfMLE() for the encoding of results in this vector - * - * @return a non-null vector of ints - */ - @Ensures("result != null") - public int[] getAlleleCountsOfMAP() { - return alleleCountsOfMAP; - } - - /** - * Returns the likelihoods summed across all AC values for AC > 0 - * - * @return - */ - public double getLog10LikelihoodOfAFNotZero() { - if ( log10LikelihoodsMatrixSum == null ) { - if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have - log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO; - else - log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); - } - return log10LikelihoodsMatrixSum; - } - - public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { - return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY); - } - - /** - * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should - * - * @return - */ - public double getLog10LikelihoodOfAFzero() { - return log10LikelihoodOfAFzero; - } - - /** - * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should - * - * @return - */ - public double getLog10PosteriorOfAFzero() { - return log10PosteriorOfAFzero; - } - - protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { - final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); - final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; - final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - - // TODO -- replace with more meaningful computation - // TODO -- refactor this calculation into the ref calculation - final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); - for ( int i = 0; i < subACOfMLE.length; i++ ) { - final Allele allele = allelesUsedInGenotyping.get(i+1); - final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was - log10pNonRefByAllele.put(allele, log10PNonRef); - } - - return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); - } - - // -------------------------------------------------------------------------------- - // - // Protected mutational methods only for use within the calculation models themselves - // - // -------------------------------------------------------------------------------- - - /** - * Reset the data in this results object, so that it can be used in a subsequent AF calculation - * - * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer - */ - protected void reset() { - log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED; - for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { - alleleCountsOfMLE[i] = 0; - alleleCountsOfMAP[i] = 0; - } - currentLikelihoodsCacheIndex = 0; - log10LikelihoodsMatrixSum = null; - allelesUsedInGenotyping = null; - nEvaluations = 0; - Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); - } - - /** - * Tell this result we used one more evaluation cycle - */ - protected void incNEvaluations() { - nEvaluations++; - } - - protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { - addToLikelihoodsCache(log10LofK); - - if ( log10LofK > log10MLE ) { - log10MLE = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMLE[i] = alleleCountsForK[i]; - } - } - - protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { - if ( log10LofK > log10MAP ) { - log10MAP = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMAP[i] = alleleCountsForK[i]; - } - } - - private void addToLikelihoodsCache(final double log10LofK) { - // add to the cache - log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK; - - // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell - if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) { - final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); - Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); - log10LikelihoodsMatrixValues[0] = temporarySum; - currentLikelihoodsCacheIndex = 1; - } - } - - protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { - this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; - if ( log10LikelihoodOfAFzero > log10MLE ) { - log10MLE = log10LikelihoodOfAFzero; - Arrays.fill(alleleCountsOfMLE, 0); - } - } - - protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { - this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; - if ( log10PosteriorOfAFzero > log10MAP ) { - log10MAP = log10PosteriorOfAFzero; - Arrays.fill(alleleCountsOfMAP, 0); - } - } - - protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { - if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) - throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); - - this.allelesUsedInGenotyping = allelesUsedInGenotyping; - } - - protected void setAClimits(int[] AClimits) { - this.AClimits = AClimits; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 49915c515..6b345dcf5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -36,10 +36,6 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); } - protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) { - return new StateTracker(); - } - @Override protected AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { @@ -60,29 +56,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { ACqueue.add(zeroSet); indexesToACset.put(zeroSet.getACcounts(), zeroSet); - // keep processing while we have AC conformations that need to be calculated - final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker()); - while ( !ACqueue.isEmpty() ) { - getResultTracker().incNEvaluations(); // keep track of the number of evaluations + getStateTracker().incNEvaluations(); // keep track of the number of evaluations // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - if ( stateTracker.withinMaxACs(set.getACcounts()) ) { - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker()); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); - // adjust max likelihood seen if needed - stateTracker.update(log10LofKs, set.getACcounts()); - - // clean up memory - indexesToACset.remove(set.getACcounts()); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s%n", set.ACcounts); - } + // clean up memory + indexesToACset.remove(set.getACcounts()); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); } - return resultFromTracker(vc, log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } @Override @@ -153,23 +141,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { private double calculateAlleleCountConformation(final ExactACset set, final ArrayList genotypeLikelihoods, - final StateTracker stateTracker, final int numChr, final LinkedList ACqueue, final HashMap indexesToACset, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + final double[] log10AlleleFrequencyPriors) { //if ( DEBUG ) // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker); + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors); final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // can we abort early because the log10Likelihoods are so small? - if ( stateTracker.abort(log10LofK, set.getACcounts()) ) { + if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) { //if ( DEBUG ) // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); return log10LofK; @@ -188,7 +174,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { ACcountsClone[allele]++; // to get to this conformation, a sample would need to be AB (remember that ref=0) final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); - updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different @@ -213,9 +199,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering for ( DependentSet dependent : differentAlleles ) - updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); for ( DependentSet dependent : sameAlleles ) - updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } return log10LofK; @@ -223,8 +209,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // also pushes its value to the given callingSetIndex. - private void updateACset(final StateTracker stateTracker, - final int[] newSetCounts, + private void updateACset(final int[] newSetCounts, final int numChr, final ExactACset dependentSet, final int PLsetIndex, @@ -246,8 +231,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { private void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + final double[] log10AlleleFrequencyPriors) { set.getLog10Likelihoods()[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -258,8 +242,8 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return; } @@ -281,14 +265,15 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // update the MLE if necessary - resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); + getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); // apply the priors over each alternate allele for ( final int ACcount : set.getACcounts().getCounts() ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); + + getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); } private void pushData(final ExactACset targetSet, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index 093bf47d5..88f5e06e6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -16,10 +16,6 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); } - protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { - return new StateTracker(); - } - @Override protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 19e253277..3eb32d35e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -1,35 +1,85 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + /** - * Keeps track of the best state seen by the exact model and the max states to visit - * allowing us to abort the search before we visit the entire matrix of AC x samples + * Keeps track of the state information during the exact model AF calculation. + * + * Tracks things like the MLE and MAP AC values, their corresponding likelhood and posterior + * values, the likelihood of the AF == 0 state, and the number of evaluations needed + * by the calculation to compute the P(AF == 0) */ final class StateTracker { - public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - - final private int[] maxACsToConsider; - - private ExactACcounts ACsAtMax = null; - private double maxLog10L = Double.NEGATIVE_INFINITY; - - public StateTracker() { - this(null); - } - - public StateTracker(final int[] maxACsToConsider) { - this.maxACsToConsider = maxACsToConsider; - } + protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; + protected final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 /** - * Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state - * - * @param log10LofKs the likelihood of our current configuration state + * These variables are intended to contain the MLE and MAP (and their corresponding allele counts) + * of the site over all alternate alleles */ - public void update(final double log10LofKs, final ExactACcounts ACs) { - if ( log10LofKs > getMaxLog10L()) { - this.setMaxLog10L(log10LofKs); - this.ACsAtMax = ACs; - } + protected double log10MLE; + protected double log10MAP; + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MLE + * + * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, + * starting from index 0 (i.e., the first alt allele is at 0). The vector is always + * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values + * are meaningful. + */ + private final int[] alleleCountsOfMLE; + private final int[] alleleCountsOfMAP; + + /** + * A vector of log10 likelihood values seen, for future summation. When the size of the + * vector is exceeed -- because we've pushed more posteriors than there's space to hold + * -- we simply sum up the existing values, make that the first value, and continue. + */ + private final double[] log10LikelihoodsForAFGt0 = new double[LIKELIHOODS_CACHE_SIZE]; + private static final int LIKELIHOODS_CACHE_SIZE = 5000; + private int log10LikelihoodsForAFGt0CacheIndex = 0; + + /** + * The actual sum of the likelihoods. Null if the sum hasn't been computed yet + */ + protected Double log10LikelihoodsForAFGt0Sum = null; + + /** + * Contains the likelihood for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + */ + private double log10LikelihoodOfAFzero = 0.0; + + /** + * The number of evaluates we've gone through in the AFCalc + */ + private int nEvaluations = 0; + + /** + * The list of alleles actually used in computing the AF + */ + private List allelesUsedInGenotyping = null; + + /** + * Create a results object capability of storing results for calls with up to maxAltAlleles + * + * @param maxAltAlleles an integer >= 1 + */ + public StateTracker(final int maxAltAlleles) { + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); + + alleleCountsOfMLE = new int[maxAltAlleles]; + alleleCountsOfMAP = new int[maxAltAlleles]; + + reset(); } /** @@ -39,58 +89,200 @@ final class StateTracker { * @param log10LofK the log10 likelihood of the configuration we're considering analyzing * @return true if the configuration cannot meaningfully contribute to our likelihood sum */ - public boolean tooLowLikelihood(final double log10LofK) { - return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY; + private boolean tooLowLikelihood(final double log10LofK) { + return log10LofK < log10MLE - MAX_LOG10_ERROR_TO_STOP_EARLY; } /** - * Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider? + * @return true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + */ + private boolean isLowerAC(final ExactACcounts otherACs) { + final int[] otherACcounts = otherACs.getCounts(); + + for ( int i = 0; i < otherACcounts.length; i++ ) { + if ( alleleCountsOfMLE[i] > otherACcounts[i] ) + return false; + } + + return true; + } + + /** + * Should we stop exploring paths from ACs, given it's log10LofK * - * @param otherACs the set of otherACs that we want to know if we should consider analyzing - * @return true if otherACs is a state worth considering, or false otherwise + * @param log10LofK the log10LofK of these ACs + * @param ACs the ACs of this state + * @return return true if there's no reason to continue with subpaths of AC, or false otherwise */ - public boolean withinMaxACs(final ExactACcounts otherACs) { - if ( maxACsToConsider == null ) - return true; + protected boolean abort( final double log10LofK, final ExactACcounts ACs, final boolean enforceLowerACs ) { + return tooLowLikelihood(log10LofK) && (!enforceLowerACs || isLowerAC(ACs)); + } - final int[] otherACcounts = otherACs.getCounts(); + @Ensures("result != null") + protected int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; + } - for ( int i = 0; i < maxACsToConsider.length; i++ ) { - // consider one more than the max AC to collect a bit more likelihood mass - if ( otherACcounts[i] > maxACsToConsider[i] + 1 ) - return false; - } - - return true; + @Ensures("result >= 0") + protected int getnEvaluations() { + return nEvaluations; } /** - * returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + * Returns the likelihoods summed across all AC values for AC > 0 + * + * @return */ - public boolean isLowerAC(final ExactACcounts otherACs) { - if ( ACsAtMax == null ) - return true; + private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { + if ( log10LikelihoodsForAFGt0Sum == null ) { + if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have + log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO; + else + log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); + } + return Math.min(log10LikelihoodsForAFGt0Sum, capAt0 ? 0.0 : Double.POSITIVE_INFINITY); + } - final int[] myACcounts = this.ACsAtMax.getCounts(); - final int[] otherACcounts = otherACs.getCounts(); + /** + * @return + */ + private double getLog10LikelihoodOfAFzero() { + return log10LikelihoodOfAFzero; + } - for ( int i = 0; i < myACcounts.length; i++ ) { - if ( myACcounts[i] > otherACcounts[i] ) - return false; + /** + * Convert this state to an corresponding AFCalcResult. + * + * Assumes that the values in this state have been filled in with meaningful values during the calculation. + * For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful + * values, etc. + * + * @param log10PriorsByAC + * + * @return + */ + @Requires("allelesUsedInGenotyping != null") + protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { + final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); + final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; + final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); + + // TODO -- replace with more meaningful computation + // TODO -- refactor this calculation into the ref calculation + final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); + for ( int i = 0; i < subACOfMLE.length; i++ ) { + final Allele allele = allelesUsedInGenotyping.get(i+1); + final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was + log10pNonRefByAllele.put(allele, log10PNonRef); } - return true; + return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); } - public boolean abort( final double log10LofK, final ExactACcounts ACs ) { - return tooLowLikelihood(log10LofK) && isLowerAC(ACs); + // -------------------------------------------------------------------------------- + // + // Protected mutational methods only for use within the calculation models themselves + // + // -------------------------------------------------------------------------------- + + /** + * Reset the data in this results object, so that it can be used in a subsequent AF calculation + * + * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer + */ + protected void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = VALUE_NOT_CALCULATED; + log10LikelihoodsForAFGt0CacheIndex = 0; + log10LikelihoodsForAFGt0Sum = null; + allelesUsedInGenotyping = null; + nEvaluations = 0; + Arrays.fill(alleleCountsOfMLE, 0); + Arrays.fill(alleleCountsOfMAP, 0); + Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); } - public double getMaxLog10L() { - return maxLog10L; + /** + * Tell this result we used one more evaluation cycle + */ + protected void incNEvaluations() { + nEvaluations++; } - public void setMaxLog10L(double maxLog10L) { - this.maxLog10L = maxLog10L; + /** + * Update the maximum log10 likelihoods seen, if log10LofKs is higher, and the corresponding ACs of this state + * + * @param log10LofK the likelihood of our current configuration state, cannot be the 0 state + * @param alleleCountsForK the allele counts for this state + */ + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10LofK)"}) + @Ensures("log10MLE == Math.max(log10LofK, log10MLE)") + protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + addToLikelihoodsCache(log10LofK); + + if ( log10LofK > log10MLE ) { + log10MLE = log10LofK; + System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length); + } + } + + /** + * Update the maximum log10 posterior seen, if log10PofKs is higher, and the corresponding ACs of this state + * + * @param log10PofK the posterior of our current configuration state + * @param alleleCountsForK the allele counts for this state + */ + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10PofK)"}) + @Ensures("log10MAP == Math.max(log10PofK, log10MAP)") + protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) { + if ( log10PofK > log10MAP ) { + log10MAP = log10PofK; + System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length); + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LofK)"}) + private void addToLikelihoodsCache(final double log10LofK) { + // add to the cache + log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK; + + // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell + if ( log10LikelihoodsForAFGt0CacheIndex == LIKELIHOODS_CACHE_SIZE) { + final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); + Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); + log10LikelihoodsForAFGt0[0] = temporarySum; + log10LikelihoodsForAFGt0CacheIndex = 1; + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; + if ( log10LikelihoodOfAFzero > log10MLE ) { + log10MLE = log10LikelihoodOfAFzero; + Arrays.fill(alleleCountsOfMLE, 0); + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + if ( log10PosteriorOfAFzero > log10MAP ) { + log10MAP = log10PosteriorOfAFzero; + Arrays.fill(alleleCountsOfMAP, 0); + } + } + + /** + * Set the list of alleles used in genotyping + * + * @param allelesUsedInGenotyping the list of alleles, where the first allele is reference + */ + @Requires({"allelesUsedInGenotyping != null", "allelesUsedInGenotyping.size() > 1"}) + protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { + if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) + throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); + if ( allelesUsedInGenotyping.get(0).isNonReference() ) + throw new IllegalArgumentException("The first element of allelesUsedInGenotyping must be the reference allele"); + + this.allelesUsedInGenotyping = allelesUsedInGenotyping; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 3740d5d7c..ff153a85c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1194,6 +1194,39 @@ public class MathUtils { return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } + /** + * Check that the log10 prob vector vector is well formed + * + * @param vector + * @param expectedSize + * @param shouldSumToOne + * + * @return true if vector is well-formed, false otherwise + */ + public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { + if ( vector.length != expectedSize ) return false; + + for ( final double pr : vector ) { + if ( ! goodLog10Probability(pr) ) + return false; + } + + if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) + return false; + + return true; // everything is good + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result) { + return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); + } + /** * A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that From 695cf8367598d3cb312e590b2cb3fb6a0cd65faf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 18 Oct 2012 07:38:32 -0400 Subject: [PATCH 17/26] More docs and contracts for classes in genotyper.afcalc -- Future protection of the output of GeneralPloidyExactAFCalc, which produces in some cases bad likelihoods (positive values) --- .../afcalc/GeneralPloidyExactAFCalc.java | 6 ++-- .../gatk/walkers/genotyper/afcalc/AFCalc.java | 28 +++++++++++++++++-- .../IndependentAllelesDiploidExactAFCalc.java | 14 ++++++++-- .../afcalc/OriginalDiploidExactAFCalc.java | 2 +- .../genotyper/afcalc/StateTracker.java | 12 +++----- 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 3916c2549..9c7883ab8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -447,14 +447,16 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); - getStateTracker().updateMLEifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMLEifNeeded(MathUtils.goodLog10Probability(log10LofK) ? log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - getStateTracker().updateMAPifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMAPifNeeded(MathUtils.goodLog10Probability(log10LofK) ? log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); return log10LofK; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index 927fadd94..e3abdeb24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -53,6 +53,16 @@ public abstract class AFCalc implements Cloneable { private final StateTracker stateTracker; private ExactCallLogger exactCallLogger = null; + /** + * Create a new AFCalc object capable of calculating the prob. that alleles are + * segregating among nSamples with up to maxAltAlleles for SNPs and maxAltAllelesForIndels + * for indels for samples with ploidy + * + * @param nSamples number of samples, must be > 0 + * @param maxAltAlleles maxAltAlleles for SNPs + * @param maxAltAllelesForIndels for indels + * @param ploidy the ploidy, must be > 0 + */ protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); @@ -65,10 +75,20 @@ public abstract class AFCalc implements Cloneable { this.stateTracker = new StateTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); } + /** + * Enable exact call logging to file + * + * @param exactCallsLog the destination file + */ public void enableProcessLog(final File exactCallsLog) { exactCallLogger = new ExactCallLogger(exactCallsLog); } + /** + * Use this logger instead of the default logger + * + * @param logger + */ public void setLogger(Logger logger) { this.logger = logger; } @@ -109,7 +129,7 @@ public abstract class AFCalc implements Cloneable { * @param log10AlleleFrequencyPriors the priors by AC vector * @return a AFCalcResult describing the result of this calculation */ - @Requires("stateTracker.getnEvaluations() > 0") + @Requires("stateTracker.getnEvaluations() >= 0") @Ensures("result != null") protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); @@ -144,11 +164,13 @@ public abstract class AFCalc implements Cloneable { * @param log10AlleleFrequencyPriors priors * @return a AFCalcResult object describing the results of this calculation */ - // TODO -- add consistent requires among args + @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) protected abstract AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors); /** + * Subset VC to the just allelesToUse, updating genotype likelihoods + * * Must be overridden by concrete subclasses * * @param vc variant context with alleles and genotype likelihoods @@ -172,7 +194,7 @@ public abstract class AFCalc implements Cloneable { return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); } - public StateTracker getStateTracker() { + protected StateTracker getStateTracker() { return stateTracker; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index c0edee291..2f85a5246 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -303,7 +303,13 @@ import java.util.*; /** * Take the independent estimates of pNonRef for each alt allele and combine them into a single result * - * TODO -- add more docs + * Given n independent calculations for each of n alternate alleles create a single + * combined AFCalcResult with: + * + * priors for AF == 0 equal to theta^N for the nth least likely allele + * posteriors that reflect the combined chance that any alleles are segregating and corresponding + * likelihoods + * combined MLEs in the order of the alt alleles in vc * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ @@ -350,8 +356,10 @@ import java.util.*; }; return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), - MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // necessary to ensure all values < 0 - MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized + // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), + // priors incorporate multiple alt alleles, must be normalized + MathUtils.normalizeFromLog10(log10PriorsOfAC, true), log10pNonRefByAllele, sortedResultsWithThetaNPriors); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index 88f5e06e6..dea38e46c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -11,7 +11,7 @@ import java.util.Map; /** * Original bi-allelic ~O(N) implementation. Kept here for posterity and reference */ -public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { +class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 3eb32d35e..301891a99 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -129,9 +129,7 @@ final class StateTracker { } /** - * Returns the likelihoods summed across all AC values for AC > 0 - * - * @return + * @return the likelihoods summed across all AC values for AC > 0 */ private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { if ( log10LikelihoodsForAFGt0Sum == null ) { @@ -144,7 +142,7 @@ final class StateTracker { } /** - * @return + * @return the log10 likelihood of AF == 0 */ private double getLog10LikelihoodOfAFzero() { return log10LikelihoodOfAFzero; @@ -157,9 +155,9 @@ final class StateTracker { * For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful * values, etc. * - * @param log10PriorsByAC + * @param log10PriorsByAC the priors by AC * - * @return + * @return an AFCalcResult summarizing the final results of this calculation */ @Requires("allelesUsedInGenotyping != null") protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { @@ -167,8 +165,6 @@ final class StateTracker { final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - // TODO -- replace with more meaningful computation - // TODO -- refactor this calculation into the ref calculation final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); for ( int i = 0; i < subACOfMLE.length; i++ ) { final Allele allele = allelesUsedInGenotyping.get(i+1); From 326f42927050ad326ec41a64f6d8c72c17ecc22f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 14:06:41 -0400 Subject: [PATCH 18/26] Bugfixes to make new AFCalc system pass integrationtests -- GeneralPloidyExactAFCalc turns -Infinity values into -Double.MAX_VALUE, so our calculations pass unit tests -- Bugfix for GeneralPloidyGenotypeLikelihoodsCalculationModel, return a null VC when the only allele we get from our final alleles to use method is the reference base -- Fix calculation of reference posteriors when P(AF == 0) = 0.0 and P(AF == 0) = X for some meaningful value of X. Added unit test to ensure this behavior is correct -- Fix horrible sorting bug in IndependentAllelesDiploidExactAFCalc that applied the theta^N priors in the wrong order. Add contract to ensure this doesn't ever happen again -- Bugfix in GLBasedSampleSelector, where VCs without any polymorphic alleles were being sent to the exact model -- --- ...dyGenotypeLikelihoodsCalculationModel.java | 2 +- .../afcalc/GeneralPloidyExactAFCalc.java | 4 +-- .../afcalc/AFCalcResultUnitTest.java | 5 ++++ .../genotyper/afcalc/AFCalcUnitTest.java | 2 +- .../IndependentAllelesDiploidExactAFCalc.java | 28 +++++++++++++++++-- .../GLBasedSampleSelector.java | 3 ++ 6 files changed, 37 insertions(+), 7 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java index 4c20700ac..2522fc16e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java @@ -245,7 +245,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G // find the alternate allele(s) that we should be using final List alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs); - if (alleles == null || alleles.isEmpty()) + if (alleles == null || alleles.isEmpty() || (alleles.size() == 1 && alleles.get(0).isReference())) return null; // start making the VariantContext final GenomeLoc loc = ref.getLocus(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 9c7883ab8..51b7fb633 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -448,7 +448,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMLEifNeeded(MathUtils.goodLog10Probability(log10LofK) ? log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); + getStateTracker().updateMLEifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { @@ -456,7 +456,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { log10LofK += log10AlleleFrequencyPriors[ACcount]; } // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMAPifNeeded(MathUtils.goodLog10Probability(log10LofK) ? log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); + getStateTracker().updateMAPifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); return log10LofK; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index 1070642e9..cbe2eb268 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -56,6 +56,11 @@ public class AFCalcResultUnitTest extends BaseTest { tests.add(new Object[]{new MyTest(new double[]{-1e-9, badL}, new double[]{0.0, badL})}); } + // test that a non-ref site gets reasonable posteriors with an ~0.0 value doesn't get lost + for ( final double nonRefL : Arrays.asList(-100.0, -50.0, -10.0, -9.0, -8.0, -7.0, -6.0, -5.0)) { + tests.add(new Object[]{new MyTest(new double[]{0.0, nonRefL}, new double[]{0.0, nonRefL})}); + } + return tests.toArray(new Object[][]{}); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 25df0f6d2..9c6c8e8ab 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -185,7 +185,7 @@ public class AFCalcUnitTest extends BaseTest { testResultSimple(cfg); } - @Test(enabled = true, dataProvider = "badGLs") + @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") public void testBadGLs(GetGLsTest cfg) { testResultSimple(cfg); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index 2f85a5246..ea89d3802 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -100,7 +100,7 @@ import java.util.*; private final static class CompareAFCalcResultsByPNonRef implements Comparator { @Override public int compare(AFCalcResult o1, AFCalcResult o2) { - return Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); + return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); } } @@ -313,6 +313,7 @@ import java.util.*; * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ + @Requires("sortedByPosteriorGT(sortedResultsWithThetaNPriors)") protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, final List sortedResultsWithThetaNPriors) { int nEvaluations = 0; @@ -321,8 +322,9 @@ import java.util.*; final double[] log10PriorsOfAC = new double[2]; final Map log10pNonRefByAllele = new HashMap(nAltAlleles); - // this value is a sum in log space + // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs double log10PosteriorOfACEq0Sum = 0.0; + double log10PosteriorOfACGt0Sum = 0.0; for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); @@ -337,6 +339,7 @@ import java.util.*; // the AF > 0 case requires us to store the normalized likelihood for later summation if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); + log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); @@ -348,7 +351,16 @@ import java.util.*; // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 - final double log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + // + // note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we + // use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where + // AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO + final double log10PosteriorOfACGt0; + if ( log10PosteriorOfACEq0Sum == 0.0 ) + log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum; + else + log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + final double[] log10LikelihoodsOfAC = new double[] { // L + prior = posterior => L = poster - prior log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], @@ -362,4 +374,14 @@ import java.util.*; MathUtils.normalizeFromLog10(log10PriorsOfAC, true), log10pNonRefByAllele, sortedResultsWithThetaNPriors); } + + private static boolean sortedByPosteriorGT(final List sortedVCs) { + double lastPosteriorGt0 = sortedVCs.get(0).getLog10PosteriorOfAFGT0(); + for ( final AFCalcResult vc : sortedVCs ) { + if ( vc.getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + return false; + lastPosteriorGt0 = vc.getLog10PosteriorOfAFGT0(); + } + return true; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index f8c871e7d..48a2d2700 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -48,6 +48,9 @@ public class GLBasedSampleSelector extends SampleSelector { // first subset to the samples VariantContext subContext = vc.subContextFromSamples(samples); + if ( ! subContext.isPolymorphicInSamples() ) + return false; + // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? if ( flatPriors == null ) { From eaffb814d381e19aa8d522338f726c481634ab98 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 18 Oct 2012 13:34:51 -0400 Subject: [PATCH 19/26] IndependentExactAFCalc is now the default EXACT model implementation -- Changed UG / HC to use this one via the StandardCallerArgumentCollection -- Update the AFCalcFactory.Calculation to have a getDefault() value instead of having a duplicate entry in the enums --- .../gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java | 2 +- .../arguments/StandardCallerArgumentCollection.java | 2 +- .../gatk/walkers/genotyper/afcalc/AFCalcFactory.java | 11 +++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 9c6c8e8ab..ab967fbe1 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -495,7 +495,7 @@ public class AFCalcUnitTest extends BaseTest { // list of all high-quality models in the system final List models = Arrays.asList( - AFCalcFactory.Calculation.EXACT, + AFCalcFactory.Calculation.getDefaultModel(), AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcFactory.Calculation.EXACT_INDEPENDENT); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index a511364f9..84dfa694b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -105,5 +105,5 @@ public class StandardCallerArgumentCollection { */ @Advanced @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; + public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index 7d67815cf..80de555ca 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -24,15 +24,12 @@ public class AFCalcFactory { * the needs of the request (i.e., considering ploidy). */ public enum Calculation { - /** The default implementation */ - EXACT(ReferenceDiploidExactAFCalc.class, 2, -1), - - /** reference implementation of multi-allelic EXACT model */ - EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), - /** expt. implementation -- for testing only */ EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1), + /** reference implementation of multi-allelic EXACT model. Extremely slow for many alternate alleles */ + EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), + /** original biallelic exact model, for testing only */ EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2), @@ -60,6 +57,8 @@ public class AFCalcFactory { return (requiredPloidy == -1 || requiredPloidy == requestedPloidy) && (maxAltAlleles == -1 || maxAltAlleles >= requestedMaxAltAlleles); } + + public static Calculation getDefaultModel() { return EXACT_INDEPENDENT; } } private static final Map> afClasses; From 0fb82745077af0d1795253a1b3c2e5c03551645d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 17:11:31 -0400 Subject: [PATCH 20/26] Fix contact on sorting of AFCalcResults -- The thing that must be sorted is the pre-theta^N list, which is not checked in the routine that applies the theta^N prior. --- .../IndependentAllelesDiploidExactAFCalc.java | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index ea89d3802..804b560b4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -285,10 +285,14 @@ import java.util.*; // sort the results, so the most likely allele is first Collections.sort(sorted, compareAFCalcResultsByPNonRef); + double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0(); final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); for ( int i = 0; i < sorted.size(); i++ ) { - final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; + if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0()); + + final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; @@ -313,7 +317,6 @@ import java.util.*; * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ - @Requires("sortedByPosteriorGT(sortedResultsWithThetaNPriors)") protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, final List sortedResultsWithThetaNPriors) { int nEvaluations = 0; @@ -374,14 +377,4 @@ import java.util.*; MathUtils.normalizeFromLog10(log10PriorsOfAC, true), log10pNonRefByAllele, sortedResultsWithThetaNPriors); } - - private static boolean sortedByPosteriorGT(final List sortedVCs) { - double lastPosteriorGt0 = sortedVCs.get(0).getLog10PosteriorOfAFGT0(); - for ( final AFCalcResult vc : sortedVCs ) { - if ( vc.getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) - return false; - lastPosteriorGt0 = vc.getLog10PosteriorOfAFGT0(); - } - return true; - } } From 0fcd358ace4c232236e577d53627ea06959b4766 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 19:33:47 -0400 Subject: [PATCH 21/26] Original EXACT model implementation lives, providing another reference (bi-allelic only) EXACT model -- Potentially a very fast implementation (it's very clean) but restricted to the biallelic case -- A starting point for future bi-allelic only optimized (logless) or generalized (bi-allelic general ploidy) implementations -- Added systematic unit tests covering this implementation, and comparing it to others -- Uncovered a nasty normalization bug in StateTracker that was capping our likelihoods at 0, even after summing up multiple likelihoods, which is just not safe to do and was causing us to lose likelihood in some cases -- Removed the restriction that a likelihood be <= 0 in StateTracker, and the protection for these cases in GeneralPloidyExactAFCalc which just wasn't right --- .../afcalc/GeneralPloidyExactAFCalc.java | 4 +- .../genotyper/afcalc/AFCalcUnitTest.java | 149 ++++++++++++++---- .../afcalc/OriginalDiploidExactAFCalc.java | 38 +++-- .../genotyper/afcalc/StateTracker.java | 14 +- 4 files changed, 149 insertions(+), 56 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 51b7fb633..2b247430c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -448,7 +448,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMLEifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); + getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { @@ -456,7 +456,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { log10LofK += log10AlleleFrequencyPriors[ACcount]; } // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMAPifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); + getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); return log10LofK; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index ab967fbe1..ef4318a40 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; @@ -124,12 +125,7 @@ public class AFCalcUnitTest extends BaseTest { final List triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2); for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) { - List calcs = AFCalcFactory.createAFCalcs( - Arrays.asList( - AFCalcFactory.Calculation.EXACT_REFERENCE, - AFCalcFactory.Calculation.EXACT_INDEPENDENT, - AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY - ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2, 2); final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors @@ -146,7 +142,7 @@ public class AFCalcUnitTest extends BaseTest { new GetGLsTest(model, 1, genotypes, priors, priorName); // tri-allelic - if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) ) + if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) && ! ( model instanceof OriginalDiploidExactAFCalc) ) // || model != generalCalc ) ) for ( List genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) ) new GetGLsTest(model, 2, genotypes, priors, priorName); } @@ -156,22 +152,28 @@ public class AFCalcUnitTest extends BaseTest { return GetGLsTest.getTests(GetGLsTest.class); } - @DataProvider(name = "badGLs") - public Object[][] createBadGLs() { - final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); - final int nSamples = genotypes.size(); +// @DataProvider(name = "badGLs") +// public Object[][] createBadGLs() { +// final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); +// final int nSamples = genotypes.size(); +// +// final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); +// +// final int nPriorValues = 2*nSamples+1; +// final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors +// for ( AFCalc model : Arrays.asList(indCalc) ) { +// final String priorName = "flat"; +// new GetGLsTest(model, 2, genotypes, priors, priorName); +// } +// +// return GetGLsTest.getTests(GetGLsTest.class); +// } - final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); - - final int nPriorValues = 2*nSamples+1; - final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors - for ( AFCalc model : Arrays.asList(indCalc) ) { - final String priorName = "flat"; - new GetGLsTest(model, 2, genotypes, priors, priorName); - } - - return GetGLsTest.getTests(GetGLsTest.class); - } +// +// @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") +// public void testBadGLs(GetGLsTest cfg) { +// testResultSimple(cfg); +// } @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs") public void testBiallelicGLs(GetGLsTest cfg) { @@ -185,11 +187,6 @@ public class AFCalcUnitTest extends BaseTest { testResultSimple(cfg); } - @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") - public void testBadGLs(GetGLsTest cfg) { - testResultSimple(cfg); - } - private static class NonInformativeData { final Genotype nonInformative; final List called; @@ -218,16 +215,14 @@ public class AFCalcUnitTest extends BaseTest { samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative)); final int nSamples = samples.size(); - List calcs = AFCalcFactory.createAFCalcs( - Arrays.asList( - AFCalcFactory.Calculation.EXACT_REFERENCE, - AFCalcFactory.Calculation.EXACT_INDEPENDENT, - AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY - ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2, 2); final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors for ( AFCalc model : calcs ) { + if ( testData.nAltAlleles > 1 && model instanceof OriginalDiploidExactAFCalc ) + continue; + final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat"); for ( int rotation = 0; rotation < nSamples; rotation++ ) { @@ -428,6 +423,94 @@ public class AFCalcUnitTest extends BaseTest { "Actual pNonRef not within tolerance " + tolerance + " of expected"); } + @DataProvider(name = "PNonRefBiallelicSystematic") + public Object[][] makePNonRefBiallelicSystematic() { + List tests = new ArrayList(); + + final List bigNonRefPLs = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 1000); + final List> bigDiploidPLs = removeBadPLs(Utils.makePermutations(bigNonRefPLs, 3, true)); + + for ( AFCalcFactory.Calculation modelType : AFCalcFactory.Calculation.values() ) { + + if ( false ) { // for testing only + tests.add(new Object[]{modelType, toGenotypes(Arrays.asList(Arrays.asList(0,100,0)))}); + } else { + if ( modelType == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) continue; // TODO -- GENERAL_PLOIDY DOESN'T WORK + + // test all combinations of PLs for 1 sample + for ( final List> PLsPerSample : Utils.makePermutations(bigDiploidPLs, 1, true) ) { + tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)}); + } + + + final List> smallDiploidPLs = new LinkedList>(); + for ( final int nonRefPL : Arrays.asList(5, 10, 20, 30) ) { + for ( int i = 0; i < 2; i++ ) { + List pls = new ArrayList(Collections.nCopies(3, nonRefPL)); + pls.set(i, 0); + smallDiploidPLs.add(pls); + } + } + + for ( final List> PLsPerSample : Utils.makePermutations(smallDiploidPLs, 5, false) ) { + tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + final List> removeBadPLs(List> listOfPLs) { + List> clean = new LinkedList>(); + + for ( final List PLs : listOfPLs ) { + int x = PLs.get(0); + boolean bad = false; + for ( int pl1 : PLs ) + if ( pl1 > x ) + bad = true; + else + x = pl1; + if ( ! bad ) clean.add(PLs); + } + + return clean; + } + + private List toGenotypes(final List> PLsPerSample) { + final List nocall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + final List genotypes = new ArrayList(PLsPerSample.size()); + + for ( final List PLs : PLsPerSample ) { + final int[] pls = ArrayUtils.toPrimitive(PLs.toArray(new Integer[3])); + final int min = MathUtils.arrayMin(pls); + for ( int i = 0; i < pls.length; i++ ) pls[i] -= min; + genotypes.add(makePL(nocall, pls)); + } + + return genotypes; + } + + @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRefBiallelicSystematic") + private void PNonRefBiallelicSystematic(AFCalcFactory.Calculation modelType, final List genotypes) { + //logger.warn("Running " + modelType + " with " + genotypes); + final AFCalcTestBuilder refBuilder = new AFCalcTestBuilder(genotypes.size(), 1, AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcTestBuilder.PriorType.human); + final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(genotypes.size(), 1, modelType, AFCalcTestBuilder.PriorType.human); + + final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A, C)); + vcb.genotypes(genotypes); + + final AFCalcResult refResult = refBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + final AFCalcResult testResult = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + + final double tolerance = 1e-3; + Assert.assertEquals(testResult.getLog10PosteriorOfAFGT0(), refResult.getLog10PosteriorOfAFGT0(), tolerance, + "Actual pNonRef not within tolerance " + tolerance + " of expected"); + Assert.assertEquals(testResult.getAlleleCountsOfMLE(), refResult.getAlleleCountsOfMLE(), + "Actual MLE " + Utils.join(",", testResult.getAlleleCountsOfMLE()) + " not equal to expected " + Utils.join(",", refResult.getAlleleCountsOfMLE())); + } + // -------------------------------------------------------------------------------- // // Test priors diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index dea38e46c..ac4634666 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -20,15 +21,22 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length]; - final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final Pair result = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final int lastK = result.getFirst(); + final int mleK = result.getSecond(); - final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)}; + final double log10LikelihoodAFGt0 = lastK == 0 ? MathUtils.LOG10_P_OF_ZERO : MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1, lastK+1); + final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], log10LikelihoodAFGt0}; final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; + final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors); - final double pNonRef = lastK > 0 ? 0.0 : -1000.0; - final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef); + final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO; + final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef); - return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele); + return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(), + MathUtils.normalizeFromLog10(log10Likelihoods, true), + MathUtils.normalizeFromLog10(log10Priors, true), + log10pNonRefByAllele); } /** @@ -68,11 +76,11 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { } } - public int linearExact(final VariantContext vc, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyLikelihoods, - double[] log10AlleleFrequencyPosteriors) { - final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), false); + public Pair linearExact(final VariantContext vc, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyLikelihoods, + double[] log10AlleleFrequencyPosteriors) { + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -81,7 +89,7 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { double maxLog10L = Double.NEGATIVE_INFINITY; boolean done = false; - int lastK = -1; + int lastK = -1, mleK = -1; for (int k=0; k <= numChr && ! done; k++ ) { final double[] kMinus0 = logY.getkMinus0(); @@ -127,7 +135,11 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { // can we abort early? lastK = k; - maxLog10L = Math.max(maxLog10L, log10LofK); + if ( log10LofK > maxLog10L ) { + maxLog10L = log10LofK; + mleK = k; + } + if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) { //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); done = true; @@ -136,6 +148,6 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { logY.rotate(); } - return lastK; + return new Pair(lastK, mleK); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 301891a99..b82ec1d29 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -131,14 +131,14 @@ final class StateTracker { /** * @return the likelihoods summed across all AC values for AC > 0 */ - private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { + private double getLog10LikelihoodOfAFNotZero() { if ( log10LikelihoodsForAFGt0Sum == null ) { if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO; else log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); } - return Math.min(log10LikelihoodsForAFGt0Sum, capAt0 ? 0.0 : Double.POSITIVE_INFINITY); + return log10LikelihoodsForAFGt0Sum; } /** @@ -162,7 +162,7 @@ final class StateTracker { @Requires("allelesUsedInGenotyping != null") protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); - final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; + final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true); final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); @@ -210,7 +210,7 @@ final class StateTracker { * @param log10LofK the likelihood of our current configuration state, cannot be the 0 state * @param alleleCountsForK the allele counts for this state */ - @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10LofK)"}) + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"}) @Ensures("log10MLE == Math.max(log10LofK, log10MLE)") protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { addToLikelihoodsCache(log10LofK); @@ -227,7 +227,7 @@ final class StateTracker { * @param log10PofK the posterior of our current configuration state * @param alleleCountsForK the allele counts for this state */ - @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10PofK)"}) + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"}) @Ensures("log10MAP == Math.max(log10PofK, log10MAP)") protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) { if ( log10PofK > log10MAP ) { @@ -236,7 +236,6 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LofK)"}) private void addToLikelihoodsCache(final double log10LofK) { // add to the cache log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK; @@ -250,7 +249,6 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; if ( log10LikelihoodOfAFzero > log10MLE ) { @@ -259,7 +257,7 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + @Requires({"MathUtils.goodLog10Probability(log10PosteriorOfAFzero)"}) protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { if ( log10PosteriorOfAFzero > log10MAP ) { log10MAP = log10PosteriorOfAFzero; From 6b6caf8e3a299553daa1dc5b64b33e0de14144a3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 14:42:00 -0400 Subject: [PATCH 22/26] Bugfix for indel DP calculations using reduced reads -- Adding tests for SNP and indel calling on reduced BAM --- ...delGenotypeLikelihoodsCalculationModel.java | 2 +- .../UnifiedGenotyperIntegrationTest.java | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index e0ffb2ba6..1761f9cec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -231,7 +231,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood int count = 0; for (PileupElement p : pileup) { if (p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase())) - count++; + count += p.getRepresentativeCount(); } return count; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 72724e46a..3bb630d18 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -452,4 +452,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { Arrays.asList("bbf16e1873e525ee5975021cfb8988cf")); executeTest("test calling on a ReducedRead BAM", spec); } + + @Test + public void testReducedBamSNPs() { + testReducedCalling("SNP", ""); + } + + @Test + public void testReducedBamINDELs() { + testReducedCalling("INDEL", ""); + } + + + private void testReducedCalling(final String model, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1, + Arrays.asList(md5)); + executeTest("test calling on a ReducedRead BAM with " + model, spec); + } } From d21e42608ac43b58ade9984436d8c2b428eb6037 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 08:10:43 -0400 Subject: [PATCH 23/26] Updating integration tests for minor changes due to switching to EXACT_INDEPENDENT model by default --- ...GenotyperGeneralPloidyIntegrationTest.java | 6 +-- .../HaplotypeCallerIntegrationTest.java | 8 ++-- .../UnifiedGenotyperIntegrationTest.java | 44 +++++++++---------- .../SelectVariantsIntegrationTest.java | 4 +- .../NanoSchedulerIntegrationTest.java | 2 +- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 652489a71..b447a1113 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -60,12 +60,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","67dabdbf1e6ed8a83d2e85766558a20a"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","9ce24f4ff787aed9d3754519a60ef49f"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","d4bfae27f1b07923f381d708d8a34cf4"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","492c8ba9a80a902097ff15bbeb031592"); } @Test(enabled = true) @@ -80,7 +80,7 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da84bf45f7080a46a7a78542b3a0629d"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","0a8c3b06243040b743dd90d497bb3f83"); } @Test(enabled = true) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a441e6c77..870967f09 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,7 +21,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "75013fa6a884104f0b1797502b636698"); + HCTest(CEUTRIO_BAM, "", "ee866a8694a6f6c77242041275350ab9"); } @Test @@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "3cd3363976b1937d801f9f82996f4abe"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "53caa950535749f99d3c5b9bb61c7b60"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "6eb9c1026225b38ba7bd3c4c218f8269"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "b4ea70a446e4782bd3700ca14dd726ff"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -64,7 +64,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "98d82d74e8d6a778290bee6c0df6d092"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8"); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 3bb630d18..044d70fc2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("b3abf320f7d02d0e3b2883833419130e")); + Arrays.asList("847605f4efafef89529fe0e496315edd")); executeTest("test MultiSample Pilot1", spec); } @@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("57e409dbb12e0d85cd8af73db221b1fc")); + Arrays.asList("afb8768f31ab57eb43f75c1115eadc99")); executeTest("test SingleSample Pilot2", spec); } @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("48b4f4b05461be276bffc91350f08cbc")); + Arrays.asList("73c9b926c5e971a113de347a64fdcf20")); executeTest("test Multiple SNP alleles", spec); } @@ -76,7 +76,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("04affcc9d720ee17bc221759707e0cd2")); + Arrays.asList("7f8d13690cb7d4173787afa00c496f12")); executeTest("test reverse trim", spec); } @@ -84,7 +84,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("112e7bedfd284d4d9390aa006118c733")); + Arrays.asList("3c006b06b17bbe8e787d64eff6a63a19")); executeTest("test mismatched PLs", spec); } @@ -94,7 +94,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "367c0355b4e7b10c2988e5c41f44b3d2"; + private final static String COMPRESSED_OUTPUT_MD5 = "fd236bd635d514e4214d364f45ec4d10"; @Test public void testCompressedOutput() { @@ -115,7 +115,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "360d1274c1072a1ae9868e4e106c2650"; + String md5 = "d408b4661b820ed86272415b8ea08780"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -147,7 +147,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("6ae4a219c7b9c837fcbf12edeeac3c0c")); + Arrays.asList("839ecd30d354a36b5dfa2b5e99859765")); executeTest("test min_base_quality_score 26", spec); } @@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f9ea04d96eeef29e71d37e60518c2579"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "8800c58715c2bb434b69e1873cb77de6"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "67739a3ccf30975bcaef8a563e4b80cf"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "639df9f4c029792ac2e46069efc82b20"); } private void testOutputParameters(final String args, final String md5) { @@ -220,12 +220,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "f1c4c8e701b2334bf3c4f12fc395fec8" ); + testHeterozosity( 0.01, "986923de51c71635d47e3d06fe3794a1" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "7fbbf4a21d6bf0026bfdadbb3c086fbe" ); + testHeterozosity( 1.0 / 1850, "fb12b1553f813004a394a391a8540873" ); } private void testHeterozosity(final double arg, final String md5) { @@ -268,7 +268,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("950fb032cc9902ae48bd21f272d2fd52")); + Arrays.asList("98058fc913b61c22d44875da1f5ea89c")); executeTest(String.format("test calling with BAQ"), spec); } @@ -287,7 +287,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("b3df138254ed141b61a758df87757e0d")); + Arrays.asList("650c53774afacfc07a595675e8cdde17")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -302,7 +302,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("63fd9488daadd4baaef0a98f02916996")); + Arrays.asList("6a0c2a3a7bcc56ad01428c71408055aa")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -315,7 +315,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("52b5a432092995c92fe71e1942689ba8")); + Arrays.asList("5f2721c3323de5390d2d47446139f32b")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -343,13 +343,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("863ee56b3594f09795644127f2f9539f")); + Arrays.asList("a4761d7f25e7a62f34494801c98a0da7")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("503ca1b75cc7b2679eaa80f7b5e7ef1c")); + Arrays.asList("c526c234947482d1cd2ffc5102083a08")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -371,7 +371,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("945a2f994eaced8efdf8de24b58f2680")); + Arrays.asList("1e0d2c15546c3b0959b00ffb75488b56")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -449,18 +449,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("bbf16e1873e525ee5975021cfb8988cf")); + Arrays.asList("da9c05f87bd6415e97f90c49cf68ed19")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", ""); + testReducedCalling("SNP", "1d4a826b144723ff0766c36aa0239287"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", ""); + testReducedCalling("INDEL", "68ef51d5c98480e0c0192e0eecb95bca"); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 58d3677c7..f29ac8cad 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("549321a2543608f214ab4893ab478be6") + Arrays.asList("46ff472fc7ef6734ad01170028d5924a") ); executeTest("testRegenotype--" + testFile, spec); @@ -216,7 +216,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("549321a2543608f214ab4893ab478be6") + Arrays.asList("46ff472fc7ef6734ad01170028d5924a") ); executeTest("testRemoveMLEAndRegenotype--" + testFile, spec); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index c1b28314c..37614f15f 100755 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "8cad82c3a5f5b932042933f136663c8a", nt, nct }); + tests.add(new Object[]{ "BOTH", "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct }); } return tests.toArray(new Object[][]{}); From 5296de8251ddb1cd9036bf9b7e57fc362a78f92b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 08:20:39 -0400 Subject: [PATCH 24/26] Fix UnifiedArgumentCollection constructor logic error -- The old way of overloading constructors and calling super didn't work (might have been a consequence of merge). This is the right way to do the copy constructor with the call to super() --- .../genotyper/UnifiedArgumentCollection.java | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index abf0b4420..e50f25fe8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,15 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; -<<<<<<< HEAD -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.pairhmm.PairHMM; -======= ->>>>>>> 19181ee... Moving pnrm to UnifiedArgumentCollection so it's available with the HaplotypeCaller import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - public class UnifiedArgumentCollection extends StandardCallerArgumentCollection { @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) @@ -182,14 +177,30 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - public UnifiedArgumentCollection() { } - - public UnifiedArgumentCollection(final StandardCallerArgumentCollection SCAC) { - super(SCAC); + /** + * Create a new UAC with defaults for all UAC arguments + */ + public UnifiedArgumentCollection() { + super(); } - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + /** + * Create a new UAC based on the information only our in super-class scac and defaults for all UAC arguments + * @param scac + */ + public UnifiedArgumentCollection(final StandardCallerArgumentCollection scac) { + super(scac); + } + + /** + * Create a new UAC with all parameters having the values in uac + * + * @param uac + */ public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) { + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + super(uac); + this.GLmodel = uac.GLmodel; this.AFmodel = uac.AFmodel; this.PCR_error = uac.PCR_error; @@ -215,6 +226,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; this.pairHMM = uac.pairHMM; + // todo- arguments to remove this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } From eb6c9a1a79b8979353d85723d26138be2d52ff6a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 09:57:31 -0400 Subject: [PATCH 25/26] Disable EfficiencyMonitoringThreadFactoryUnitTest -- This is no longer a core GATK activity, and the tests need to run for so long (2 min each) that it's just too painful to run them. Should be re-eabled if we come to care about this capability again, or if we can run these tests all in parallel in the future. --- .../threading/EfficiencyMonitoringThreadFactoryUnitTest.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index 7381bebc4..c072c808d 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -130,7 +130,10 @@ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { return StateTest.getTests(StateTest.class); } - @Test(enabled = true, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) + // NOTE this test takes an unreasonably long time to run, and so it's been disabled as these monitoring threads + // aren't a core GATK feature any longer. Should be reabled if we come to care about this capability again + // in the future, or we can run these in parallel + @Test(enabled = false, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) public void testStateTest(final StateTest test) throws InterruptedException { // allows us to test blocking final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates()); From 9f2851d769e3f14b19f4e92c07c25d4b4689e250 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 20:23:11 -0400 Subject: [PATCH 26/26] Updating UnifiedGenotyperGeneralPloidyIntegrationTest following rebasing -- Created a JIRA ticket https://jira.broadinstitute.org/browse/GSA-623 for Guillermo to look at the differences as the multi-allelic nature of many sites seems to change with the new more protected infrastructure. This may be due to implementation issues in the pooled caller, problems with my interface, or could be a genuine improvement. --- .../UnifiedGenotyperGeneralPloidyIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index b447a1113..ae3befe66 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -70,12 +70,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","06a512271631c5b511314a2618de82d7"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","848e1092b5cd57b0da5f1187e67134e7"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","36a383adfdbf1f59656138b538a9920d"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","51a7b51d82a341adec0e6510f5dfadd8"); } @Test(enabled = true)