From 9a9a68409e279799469038770e14b8a4cca7bee7 Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Thu, 31 Jul 2014 15:47:46 -0400 Subject: [PATCH] ReadLikelihoods class introduction final changes before merging Stories: https://www.pivotaltracker.com/story/show/70222086 https://www.pivotaltracker.com/story/show/67961652 Changes: Done some changes that I missed in relation with making sure that all PairHMM implementations use the same interface; as a consequence we were always running the standard PairHMM. Fixed some additional bugs detected when running it on full wgs single sample and exome multi-sample data set. Updated some integration test md5s. Fixing GraphBased bugs with new master code Fixed ReadLikelihoods.changeReads difficult to spot bug. Changed PairHMM interface to fix a bug Fixed missing changes for various PairHMM implementations to get them to use the new structure. Fixed various bugs only detectable when running with full sample(s). Believed to have fixed the lack of annotations in UG runs Fixed integration test MD5s Updating some md5s Fixed yet another md5 probably left out by mistake --- ...GraphBasedLikelihoodCalculationEngine.java | 7 +- .../HaplotypeCallerGenotypingEngine.java | 5 +- .../PairHMMLikelihoodCalculationEngine.java | 13 ++- .../indels/PairHMMIndelErrorModel.java | 37 ++++--- .../gatk/utils/pairhmm/CnyPairHMM.java | 45 ++++----- .../utils/pairhmm/DebugJNILoglessPairHMM.java | 55 +++++------ .../utils/pairhmm/VectorLoglessPairHMM.java | 50 ++++------ ...perGeneralPloidySuite1IntegrationTest.java | 2 +- ...perGeneralPloidySuite2IntegrationTest.java | 2 +- ...dGenotyperIndelCallingIntegrationTest.java | 14 +-- ...GenotyperNormalCallingIntegrationTest.java | 13 +-- ...lexAndSymbolicVariantsIntegrationTest.java | 4 +- .../HaplotypeCallerIntegrationTest.java | 2 +- .../genotyper/ReadLikelihoodsUnitTest.java | 52 +++++++++- .../gatk/utils/genotyper/ReadLikelihoods.java | 62 +++++++++++- .../gatk/utils/pairhmm/PairHMM.java | 97 
+++---------------- 16 files changed, 245 insertions(+), 215 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java index 7ba81ca8c..ace4f22d2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java @@ -119,9 +119,12 @@ public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalc hmm,log10GlobalReadMismappingRate,heterogeneousKmerSizeResolution); final List haplotypes = assemblyResultSet.getHaplotypeList(); final List supportedHaplotypes = graphLikelihoodEngine.getHaplotypeList(); - if (supportedHaplotypes.size() != haplotypes.size()) + final ReadLikelihoods result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes, samples, perSampleReadList); + if (supportedHaplotypes.size() != haplotypes.size()) { logger.warn("Some haplotypes were drop due to missing route on the graph (supported / all): " + supportedHaplotypes.size() + "/" + haplotypes.size()); - final ReadLikelihoods result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes,samples,perSampleReadList); + result.addMissingAlleles(haplotypes,Double.NEGATIVE_INFINITY); + } + if (debugMode != DebugMode.NONE) graphLikelihoodDebugDumps(assemblyResultSet.getRegionForGenotyping(), graphLikelihoodEngine,result); return result; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 32f54436c..a1bf09043 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -338,7 +338,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine= eventsAtThisLoc.size() + 1"}) @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) - protected static Map> createEventMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { + protected static Map> createEventMapper( final int loc, final List eventsAtThisLoc, final List haplotypes) { final Map> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1); final Event refEvent = new Event(null); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java index e662806fe..4feed71d9 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -275,13 +275,24 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities final List processedReads = modifyReadQualities(likelihoods.reads()); + final Map gapContinuationPenalties = buildGapContinuationPenalties(processedReads,constantGCP); // Run the PairHMM to 
calculate the log10 likelihood of each (processed) reads' arising from each haplotype - pairHMMThreadLocal.get().computeLikelihoods(likelihoods,processedReads,constantGCP); + pairHMMThreadLocal.get().computeLikelihoods(likelihoods,processedReads,gapContinuationPenalties); if (WRITE_LIKELIHOODS_TO_FILE) writeDebugLikelihoods(likelihoods); } + private Map buildGapContinuationPenalties(final List processedReads, final byte gcp) { + final Map result = new HashMap<>(processedReads.size()); + for (final GATKSAMRecord read : processedReads) { + final byte[] readGcpArray = new byte[read.getReadLength()]; + Arrays.fill(readGcpArray,gcp); + result.put(read,readGcpArray); + } + return result; + } + private void writeDebugLikelihoods(final ReadLikelihoods.Matrix likelihoods) { final List reads = likelihoods.reads(); final List haplotypes = likelihoods.alleles(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java index 32e1dcd0e..a2422c544 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java @@ -47,11 +47,13 @@ package org.broadinstitute.gatk.tools.walkers.indels; import com.google.java.contract.Ensures; +import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.engine.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.clipping.ReadClipper; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import 
org.broadinstitute.gatk.utils.pairhmm.ArrayLoglessPairHMM; import org.broadinstitute.gatk.utils.pairhmm.Log10PairHMM; @@ -61,12 +63,8 @@ import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; -import htsjdk.variant.variantcontext.Allele; -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.Map; +import java.util.*; public class PairHMMIndelErrorModel { @@ -297,10 +295,8 @@ public class PairHMMIndelErrorModel { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; - final LinkedList readList = new LinkedList<>(); - final Map readGCPArrayMap = new LinkedHashMap<>(); int readIdx=0; - for (PileupElement p: pileup) { + for (final PileupElement p: pileup) { // check if we've already computed likelihoods for this pileup element (i.e. 
for this read at this location) if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { @@ -439,28 +435,31 @@ public class PairHMMIndelErrorModel { // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); - readList.add(processedRead); // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM - readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); + final Map readGCPArrayMap = Collections.singletonMap(processedRead,contextLogGapContinuationProbabilities); // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations final Map trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); + // Apparently more than one allele can map to the same haplotype after trimming + final Set distinctHaplotypesSet = new LinkedHashSet<>(trimmedHaplotypeMap.values()); + final List distinctHaplotypesList = Arrays.asList(distinctHaplotypesSet.toArray(new Haplotype[distinctHaplotypesSet.size()])); // Get the likelihoods for our clipped read against each of our trimmed haplotypes. 
- final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); + final ReadLikelihoods rl = new ReadLikelihoods<>( + + Collections.singletonList("DUMMY_SAMPLE"),distinctHaplotypesList,Collections.singletonMap("DUMMY_SAMPLE",Collections.singletonList(processedRead))); + final ReadLikelihoods.Matrix dummySampleLikelihoods = rl.sampleMatrix(0); + pairHMM.computeLikelihoods(rl.sampleMatrix(0), Collections.singletonList(processedRead), readGCPArrayMap); // Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array - for (Allele a: trimmedHaplotypeMap.keySet()){ - double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); - perReadAlleleLikelihoodMap.add(p, a, readLikelihood); + for (final Allele a: trimmedHaplotypeMap.keySet()){ + final Haplotype h = trimmedHaplotypeMap.get(a); + final int hIndex = rl.alleleIndex(h); + final double readLikelihood = dummySampleLikelihoods.get(hIndex,0); readLikelihoods[readIdx][j++] = readLikelihood; + perReadAlleleLikelihoodMap.add(read,a,readLikelihood); } - // The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually - readList.remove(processedRead); - - // The same is true for the read/GCP-array map - readGCPArrayMap.remove(processedRead); } } readIdx++; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/CnyPairHMM.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/CnyPairHMM.java index 80be36f02..f506b7ba2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/CnyPairHMM.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/CnyPairHMM.java @@ -46,10 +46,10 @@ package org.broadinstitute.gatk.utils.pairhmm; -import 
org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.io.File; import java.lang.reflect.Field; @@ -192,47 +192,42 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { * {@inheritDoc} */ @Override - public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap){ + public void computeLikelihoods(final ReadLikelihoods.Matrix likelihoods, final List processedReads, final Map gcp){ // initialize the pairHMM if necessary if (! initialized) { - int readMaxLength = findMaxReadLength(reads); - int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + int readMaxLength = findMaxReadLength(processedReads); + int haplotypeMaxLength = findMaxHaplotypeLength(likelihoods.alleles()); initialize(readMaxLength, haplotypeMaxLength); } // Pass the read bases/quals, and the haplotypes as a list into the HMM - performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap); + performBatchAdditions(processedReads, likelihoods.alleles(), gcp); - // Get the log10-likelihoods for each read/haplotype ant pack into the results map - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap); - - return likelihoodMap; + // Get the log10-likelihoods for each read/haplotype ant pack into the read-likelihoods matrix. 
+ collectLikelihoodResults(likelihoods); } - private void collectLikelihoodResults(List reads, Map alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) { - for(final GATKSAMRecord read : reads){ - final double[] likelihoods = batchGetResult(); - int jjj = 0; - for (Allele allele : alleleHaplotypeMap.keySet()){ - final double log10l = likelihoods[jjj]; - likelihoodMap.add(read, allele, log10l); - jjj++; - } + private void collectLikelihoodResults(ReadLikelihoods.Matrix likelihoods) { + final int numHaplotypes = likelihoods.alleleCount(); + final int numReads = likelihoods.readCount(); + for(int r = 0; r < numReads; r++) { + final double[] likelihoodValues = batchGetResult(); + + for (int h = 0; h < numHaplotypes; h++) + likelihoods.set(h,r, likelihoodValues[h]); } } - private void performBatchAdditions(List reads, Map alleleHaplotypeMap, Map GCPArrayMap) { - final List haplotypeList = getHaplotypeList(alleleHaplotypeMap); + private void performBatchAdditions(final List reads, final List haplotypes, Map gcp) { for(final GATKSAMRecord read : reads){ final byte[] readBases = read.getReadBases(); final byte[] readQuals = read.getBaseQualities(); final byte[] readInsQuals = read.getBaseInsertionQualities(); final byte[] readDelQuals = read.getBaseDeletionQualities(); - final byte[] overallGCP = GCPArrayMap.get(read); + final byte[] overallGCP = gcp.get(read); - batchAdd(haplotypeList, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); + batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java index 89c94a3e2..5be872f0e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java @@ -49,23 +49,21 @@ package org.broadinstitute.gatk.utils.pairhmm; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.gatk.utils.QualityUtils; - +import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import static org.broadinstitute.gatk.utils.pairhmm.PairHMMModel.*; -import java.util.List; -import java.util.Map; -import java.util.HashMap; +import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; -import java.io.BufferedWriter; -import java.util.Map; -import java.util.HashMap; import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.broadinstitute.gatk.utils.pairhmm.PairHMMModel.*; /** @@ -156,10 +154,10 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { * {@inheritDoc} */ @Override - public PerReadAlleleLikelihoodMap computeLikelihoods( final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap ) { + public void computeLikelihoods( final ReadLikelihoods.Matrix likelihoods, final List processedReads, final Map GCPArrayMap ) { // (re)initialize the pairHMM only if necessary - final int readMaxLength = verify ? findMaxReadLength(reads) : 0; - final int haplotypeMaxLength = verify ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0; + final int readMaxLength = verify ? findMaxReadLength(processedReads) : 0; + final int haplotypeMaxLength = verify ? 
findMaxHaplotypeLength(likelihoods.alleles()) : 0; if(verify) { if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) @@ -167,15 +165,15 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug/verify mode"); } - int readListSize = reads.size(); - int numHaplotypes = alleleHaplotypeMap.size(); + int readListSize = processedReads.size(); + int numHaplotypes = likelihoods.alleleCount(); int numTestcases = readListSize*numHaplotypes; if(debug0_1) System.out.println("Java numReads "+readListSize+" numHaplotypes "+numHaplotypes); int idx = 0; - for(GATKSAMRecord read : reads) + for(final GATKSAMRecord read : processedReads) { - byte [] overallGCP = GCPArrayMap.get(read); + final byte [] overallGCP = GCPArrayMap.get(read); if(debug0_1) System.out.println("Java read length "+read.getReadBases().length); if(debug3) @@ -195,9 +193,9 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { if(verify) { idx = 0; - for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always + for (final Haplotype h : likelihoods.alleles()) //order is important - access in same order always { - byte[] haplotypeBases = currEntry.getValue().getBases(); + byte[] haplotypeBases = h.getBases(); if(debug0_1) System.out.println("Java haplotype length "+haplotypeBases.length); if(debug3) @@ -212,10 +210,10 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { PerReadAlleleLikelihoodMap likelihoodMap = null; if(verify) { - jniPairHMM.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap); + jniPairHMM.computeLikelihoods(likelihoods, processedReads, GCPArrayMap); likelihoodArray = jniPairHMM.getLikelihoodArray(); //to compare values - likelihoodMap = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap); + super.computeLikelihoods(likelihoods, processedReads, 
GCPArrayMap); } else { @@ -234,13 +232,13 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { idx = 0; int idxInsideHaplotypeList = 0; int readIdx = 0; - for(GATKSAMRecord read : reads) + for(final GATKSAMRecord read : processedReads) { for(int j=0;j currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always + for (final Haplotype haplotype : likelihoods.alleles())//order is important - access in same order always { - idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue()); + idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(haplotype); likelihoodArray[idx] = tmpArray[idxInsideHaplotypeList]; ++idx; } @@ -271,13 +269,13 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { idx = 0; System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+numHaplotypes); boolean firstLine = true; - for(GATKSAMRecord read : reads) + for(final GATKSAMRecord read : processedReads) { byte [] overallGCP = GCPArrayMap.get(read); byte[] tmpByteArray = new byte[read.getReadBases().length]; - for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always + for (final Haplotype haplotype : likelihoods.alleles()) //order is important - access in same order always { - byte[] haplotypeBases = currEntry.getValue().getBases(); + byte[] haplotypeBases = haplotype.getBases(); debugDump("debug_dump.txt",new String(haplotypeBases)+" ",true); debugDump("debug_dump.txt",new String(read.getReadBases())+" ",true); for(int k=0;k haplotypeToHaplotypeListIdxMap = new HashMap(); + private HashMap haplotypeToHaplotypeListIdxMap = new HashMap<>(); private JNIHaplotypeDataHolderClass[] mHaplotypeDataArray; @Override public HashMap getHaplotypeToHaplotypeListIdxMap() { return haplotypeToHaplotypeListIdxMap; } @@ -204,22 +194,23 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { * {@inheritDoc} */ @Override - public PerReadAlleleLikelihoodMap 
computeLikelihoods( final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap ) { + public void computeLikelihoods( final ReadLikelihoods.Matrix likelihoods, final List processedReads, final Map gcp ) { + if (processedReads.isEmpty()) + return; if(doProfiling) startTime = System.nanoTime(); - int readListSize = reads.size(); - int numHaplotypes = alleleHaplotypeMap.size(); - int numTestcases = readListSize*numHaplotypes; + int readListSize = processedReads.size(); + int numHaplotypes = likelihoods.alleleCount(); JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize]; int idx = 0; - for(GATKSAMRecord read : reads) + for(GATKSAMRecord read : processedReads) { readDataArray[idx] = new JNIReadDataHolderClass(); readDataArray[idx].readBases = read.getReadBases(); readDataArray[idx].readQuals = read.getBaseQualities(); readDataArray[idx].insertionGOP = read.getBaseInsertionQualities(); readDataArray[idx].deletionGOP = read.getBaseDeletionQualities(); - readDataArray[idx].overallGCP = GCPArrayMap.get(read); + readDataArray[idx].overallGCP = gcp.get(read); ++idx; } @@ -230,20 +221,18 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { // for(haplotypes) // compute_full_prob() jniComputeLikelihoods(readListSize, numHaplotypes, readDataArray, mHaplotypeDataArray, mLikelihoodArray, 12); - - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - idx = 0; - int idxInsideHaplotypeList = 0; + int readIdx = 0; - for(GATKSAMRecord read : reads) + for(int r = 0; r < readListSize; r++) { - for (Map.Entry currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always - { + int hapIdx = 0; + for (final Haplotype haplotype : likelihoods.alleles()) { + //Since the order of haplotypes in the List and alleleHaplotypeMap is different, //get idx of current haplotype in the list and use this idx to get the right likelihoodValue - idxInsideHaplotypeList = 
haplotypeToHaplotypeListIdxMap.get(currEntry.getValue()); - likelihoodMap.add(read, currEntry.getKey(), mLikelihoodArray[readIdx + idxInsideHaplotypeList]); - ++idx; + final int idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(haplotype); + likelihoods.set(hapIdx,r,mLikelihoodArray[readIdx + idxInsideHaplotypeList]); + ++hapIdx; } readIdx += numHaplotypes; } @@ -256,7 +245,6 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { pairHMMSetupTime += threadLocalSetupTimeDiff; } } - return likelihoodMap; } /** diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 7ac0c86df..3d6103966 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "ae90d6be28c19e455083a47bc95c3b1b"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "a9730f883a2e01e93d56b11321c9fd52"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index 6d95098fe..98a1586f9 100644 --- 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9fc4f04105bde31bafc93548745cb67e"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","93cf7ed2645599d0ddd319c6989eadcc"); } @Test(enabled = true) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index e8971f820..286096111 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("8a4de9e1f59cffe80a4372cf02fe809e")); + Arrays.asList("24d5f65860507ec7f8d9441bfee8653a")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("2b92df91a9337b9d9f03db5699bb41f2")); + Arrays.asList("188dc4d70dc5f4de09f645270cc3f4e1")); executeTest(String.format("test 
indel calling, multiple technologies"), spec); } @@ -110,7 +110,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("591c9a7bd7d64974b4a2dd932fdef138")); + Arrays.asList("31aaad9adbf12bb7fb12d53080e5c2f5")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -120,7 +120,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("e784b6d1b3725f72a2c42b19e8d24cb1")); + Arrays.asList("fecb8ae4e8f5ea2b2bf3826d674298f7")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("d101beb3d1f71a14a31438cce9786881")); + Arrays.asList("0662ec6fe6d6e9807f758b1c82adf056")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " 
-minIndelFrac 0.0", 1, - Arrays.asList("96c2ab043f523c7676e4bc36dd3c9d0b")); + Arrays.asList("3d5140ffc02a125ee26c63ec55449192")); executeTest("test minIndelFraction 0.0", spec); } @@ -184,7 +184,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("53ce3aa9ee16302ab2ec21f742b5e157")); + Arrays.asList("7c62eb9142d9124e49c123dcc052ce57")); executeTest("test minIndelFraction 0.25", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 2c61d097e..8398b1776 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -53,18 +53,19 @@ import java.util.Arrays; public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ - private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; // -------------------------------------------------------------------------------------------------------------- // // testing normal calling // - // 
-------------------------------------------------------------------------------------------------------------- + // ---------------------------------------------------- ---------------------------------------------------------- @Test public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("058b54cff14527485c7b1ab035ebb6c4")); + Arrays.asList("c6462121e5a419546f50d53646274fd3")); executeTest("test MultiSample Pilot1", spec); } @@ -96,7 +97,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("120270ddf48dfd461f756c89ea1ab074")); + Arrays.asList("85e8ba3cfeef5e3b72418bcfaec11668")); executeTest("test Multiple SNP alleles", spec); } @@ -112,7 +113,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("bc5a143868e3ad3acc9bb7c09798cdf2")); + Arrays.asList("bd867fb341e75a9f1fc78d25bda93f63")); executeTest("test reverse trim", spec); } @@ -120,7 +121,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec 
spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("02fd6357bd1082115822c8a931cbb6a3")); + Arrays.asList("e87419cd42fb9684ee3674ee54abe567")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 2e8fc67db..573e61df7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -94,7 +94,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "7096aa0ba002757cdcb9e1868d51a85f"); + "1a65d432437670d2816a42ea270c06a1"); } private void HCTestComplexConsensusMode(String bam, String args, String md5) { @@ -106,7 +106,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleConsensusModeComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", - "45be554608485d8d3979284277d00f95"); + "a5bb8f7eb1f7db997be9fd8928a788f6"); } } diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 033fff46a..86aa7ca21 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -115,7 +115,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "07c5df4cbd91170d0061355c1a8f7fc9"); + "4047ea70f37b59b782402f678ea2dfee"); } @Test diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java index c339a2269..61a6860fd 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java @@ -388,6 +388,55 @@ public class ReadLikelihoodsUnitTest } + @Test(dataProvider = "dataSets") + public void testAddMissingAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result = original.clone(); + + // If all the alleles pass are present 
in the read-likelihoods collection there is no change. + result.addMissingAlleles(result.alleles(),Double.NEGATIVE_INFINITY); + testLikelihoodMatrixQueries(samples,result,originalLikelihoods); + + // If the allele list passed is empty there is no effect. + result.addMissingAlleles(Collections.EMPTY_LIST,Double.NEGATIVE_INFINITY); + testLikelihoodMatrixQueries(samples,result,originalLikelihoods); + + final Allele newOne; + final Allele newTwo; + final Allele newThree; + + // We add a single missing. + result.addMissingAlleles(Arrays.asList(newOne = Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-12345.6); + Assert.assertEquals(original.alleleCount() + 1, result.alleleCount()); + + // We add too more amongst exisisting alleles: + result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.allele(1), + result.allele(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321); + + Assert.assertEquals(original.alleleCount()+3,result.alleleCount()); + + final List expectedAlleles = new ArrayList<>(original.alleles()); + expectedAlleles.add(newOne); expectedAlleles.add(newTwo); expectedAlleles.add(newThree); + + Assert.assertEquals(result.alleles(),expectedAlleles); + + final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; + for (int s = 0; s < samples.length; s++) { + newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 3); + final int sampleReadCount = original.sampleReadCount(s); + final int originalAlleleCount = originalLikelihoods[s].length; + newLikelihoods[s][originalAlleleCount] = new double[sampleReadCount]; + Arrays.fill(newLikelihoods[s][originalAlleleCount],-12345.6); + newLikelihoods[s][originalAlleleCount+1] = new double[sampleReadCount]; + Arrays.fill(newLikelihoods[s][originalAlleleCount+1],-6.54321); + newLikelihoods[s][originalAlleleCount+2] = new double[sampleReadCount]; + 
Arrays.fill(newLikelihoods[s][originalAlleleCount+2],-6.54321); + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "dataSets") public void testAddNonRefAllele(final String[] samples, final Allele[] alleles, final Map> reads) { final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); @@ -480,7 +529,8 @@ public class ReadLikelihoodsUnitTest {Allele.create("A",true), Allele.create("T"), Allele.create("C")}, {Allele.create("A",true)}, {Allele.create("ATTTA"), Allele.create("A",true)}, - {Allele.create("A"), Allele.create("AT",true)} + {Allele.create("A"), Allele.create("AT",true)}, + {Allele.create("A",false), Allele.create("AT",false)}, }; @DataProvider(name="marginalizationDataSets") diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java index f03dfbe78..bf12d3a33 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java @@ -95,7 +95,7 @@ public class ReadLikelihoods implements Cloneable { /** * Index of the reference allele if any, otherwise -1. */ - private final int referenceAlleleIndex; + private int referenceAlleleIndex; /** * Caches the read-list per sample list returned by {@link #sampleReads} @@ -568,12 +568,66 @@ public class ReadLikelihoods implements Cloneable { final GATKSAMRecord replacement = readRealignments.get(read); if (replacement == null) continue; - readIndex.put(sampleReads[r] = replacement,r); readIndex.remove(read); + readIndex.put(sampleReads[r] = replacement,r); } } } + /** + * Add alleles that are missing in the read-likelihoods collection giving all reads a default + * likelihood value. 
+ * @param alleles the potentially missing alleles. + * @param defaultLikelihood the default read likelihood value for that allele. + * + * @throws IllegalArgumentException if {@code alleles} is {@code null} or there is more than + * one missing allele that is a reference or there is one but the collection already has + * a reference allele. + */ + public void addMissingAlleles(final Collection alleles, final double defaultLikelihood) { + if (alleles == null) + throw new IllegalArgumentException("the alleles list cannot be null"); + if (alleles.isEmpty()) + return; + final List allelesToAdd = new ArrayList(alleles.size()); + for (final A allele : alleles) + if (!alleleIndex.containsKey(allele)) + allelesToAdd.add(allele); + if (allelesToAdd.isEmpty()) + return; + + final int oldAlleleCount = this.alleles.length; + final int newAlleleCount = this.alleles.length + allelesToAdd.size(); + + alleleList = null; + int referenceIndex = this.referenceAlleleIndex; + this.alleles = Arrays.copyOf(this.alleles,newAlleleCount); + int nextIndex = oldAlleleCount; + for (final A allele : allelesToAdd) { + if (allele.isReference()) { + if (referenceIndex != -1) + throw new IllegalArgumentException("there cannot be more than one reference allele"); + referenceIndex = nextIndex; + } + alleleIndex.put(this.alleles[nextIndex] = allele, nextIndex++); + } + + if (referenceIndex != -1) + referenceAlleleIndex = referenceIndex; + + final int sampleCount = samples.length; + for (int s = 0; s < sampleCount; s++) { + final int sampleReadCount = readsBySampleIndex[s].length; + final double[][] newValuesBySampleIndex = Arrays.copyOf(valuesBySampleIndex[s],newAlleleCount); + for (int a = oldAlleleCount; a < newAlleleCount; a++) { + newValuesBySampleIndex[a] = new double[sampleReadCount]; + if (defaultLikelihood != 0.0) + Arrays.fill(newValuesBySampleIndex[a],defaultLikelihood); + } + valuesBySampleIndex[s] = newValuesBySampleIndex; + } + } + /** * Likelihood matrix between a set of alleles and 
reads. * @param the allele-type. @@ -740,7 +794,7 @@ public class ReadLikelihoods implements Cloneable { * or its values contain reference to non-existing alleles in this read-likelihood collection. Also no new allele * can have zero old alleles mapping nor two new alleles can make reference to the same old allele. */ - public ReadLikelihoods marginalize(final Map> newToOldAlleleMap,final GenomeLoc overlap) { + public ReadLikelihoods marginalize(final Map> newToOldAlleleMap, final GenomeLoc overlap) { if (overlap == null) return marginalize(newToOldAlleleMap); @@ -775,7 +829,7 @@ public class ReadLikelihoods implements Cloneable { final int newSampleReadCount = sampleReadsToKeep.length; if (newSampleReadCount == oldSampleReadCount) { newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(readIndexBySampleIndex[s]); - newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); + newReadsBySampleIndex[s] = oldSampleReads.clone(); } else { final Object2IntMap newSampleReadIndex = newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(newSampleReadCount); final GATKSAMRecord[] newSampleReads = newReadsBySampleIndex[s] = new GATKSAMRecord[newSampleReadCount]; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java index eb0d408e3..6c4460cb3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java @@ -26,15 +26,15 @@ package org.broadinstitute.gatk.utils.pairhmm; import com.google.java.contract.Requires; +import htsjdk.variant.variantcontext.Allele; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import 
org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.Map; /** @@ -158,10 +158,9 @@ public abstract class PairHMM { return listMaxReadLength; } - protected int findMaxHaplotypeLength(final Map haplotypeMap) { + protected int findMaxHaplotypeLength(final Collection haplotypes) { int listMaxHaplotypeLength = 0; - for( final Allele a: haplotypeMap.keySet() ) { - final Haplotype h = haplotypeMap.get(a); + for( final Haplotype h : haplotypes) { final int haplotypeLength = h.getBases().length; if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } } @@ -172,14 +171,20 @@ public abstract class PairHMM { * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from * each haplotype given base substitution, insertion, and deletion probabilities. * - * @param processedReads reads to analyze + * @param processedReads reads to analyze instead of the ones present in the destination read-likelihoods. * @param likelihoods where to store the likelihoods where position [a][r] is reserved for the likelihood of {@code reads[r]} * conditional to {@code alleles[a]}. - * @param constantGCP constant penalty for gap continuations. + * @param gcp penalty for gap continuations base array map for processed reads. + * + * @throws IllegalArgumentException * * @return never {@code null}. 
*/ - public void computeLikelihoods(final ReadLikelihoods.Matrix likelihoods, final List processedReads, final byte constantGCP) { + public void computeLikelihoods(final ReadLikelihoods.Matrix likelihoods, + final List processedReads, + final Map gcp) { + if (processedReads.isEmpty()) + return; if(doProfiling) startTime = System.nanoTime(); // (re)initialize the pairHMM only if necessary @@ -195,13 +200,11 @@ public abstract class PairHMM { int idx = 0; int readIndex = 0; for(final GATKSAMRecord read : processedReads){ - final int readLength = read.getReadLength(); final byte[] readBases = read.getReadBases(); final byte[] readQuals = read.getBaseQualities(); final byte[] readInsQuals = read.getBaseInsertionQualities(); final byte[] readDelQuals = read.getBaseDeletionQualities(); - final byte[] overallGCP = new byte[readLength]; - Arrays.fill(overallGCP,constantGCP); + final byte[] overallGCP = gcp.get(read); // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) final boolean isFirstHaplotype = true; @@ -225,78 +228,6 @@ public abstract class PairHMM { } } - /** - * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from - * each haplotype given base substitution, insertion, and deletion probabilities. - * - * @param reads the list of reads - * @param alleleHaplotypeMap the list of haplotypes - * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. 
- * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of - * said read coming from the said haplotype under the provided error model - * @deprecated - */ - public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { - if(doProfiling) - startTime = System.nanoTime(); - - // (re)initialize the pairHMM only if necessary - final int readMaxLength = findMaxReadLength(reads); - final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); - if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } - - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - mLikelihoodArray = new double[reads.size()*alleleHaplotypeMap.size()]; - int idx = 0; - for(GATKSAMRecord read : reads){ - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); - final byte[] readInsQuals = read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); - final byte[] overallGCP = GCPArrayMap.get(read); - - // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) - byte[] currentHaplotypeBases = null; - final boolean isFirstHaplotype = true; - Allele currentAllele = null; - double log10l; - //for (final Allele allele : alleleHaplotypeMap.keySet()){ - for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()){ - //final Haplotype haplotype = alleleHaplotypeMap.get(allele); - final Allele allele = currEntry.getKey(); - final Haplotype haplotype = currEntry.getValue(); - final byte[] nextHaplotypeBases = haplotype.getBases(); - if (currentHaplotypeBases != null) { - log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, 
isFirstHaplotype, nextHaplotypeBases); - mLikelihoodArray[idx++] = log10l; - likelihoodMap.add(read, currentAllele, log10l); - } - // update the current haplotype - currentHaplotypeBases = nextHaplotypeBases; - currentAllele = allele; - } - // process the final haplotype - if (currentHaplotypeBases != null) { - - // there is no next haplotype, so pass null for nextHaplotypeBases. - log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); - likelihoodMap.add(read, currentAllele, log10l); - mLikelihoodArray[idx++] = log10l; - } - } - if(doProfiling) - { - threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime); - //synchronized(doProfiling) - { - pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff; - } - } - return likelihoodMap; - } - /** * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion * probabilities.