From 9e25b209e0ef10cc63dd17f2abf9e61dfbf1b47e Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 3 Aug 2012 12:24:23 -0400 Subject: [PATCH] First pass of implementation of Reduced Reads with HaplotypeCaller. Main changes: a) Active region: scale PL's by representative count to determine whether region is active. b) Scale per-read, per-haplotype likelihoods by read representative counts. A read representative count is (temporarily) defined as the average representative count over all bases in read, TBD whether this is good enough to avoid biases in GL's. c) DeBruijn assembler inserts kmers N times in graph, where N is min representative count of read over kmer span - TBD again whether this is the best approach. d) Bug fixes in FragmentUtils: logic to merge fragments was wrong in cases where there is discrepancy of overlaps between unclipped/soft clipped bases. Didn't affect things before but RR makes prevalence of hard-clipped bases in CIGARs more prevalent so this was exposed. e) Cache read representative counts along with read likelihoods associated with a Haplotype. Code can/should be cleaned up and unified with PairHMMIndelErrorModelCode, as well as refactored to support arbitrary ploidy in HaplotypeCaller --- .../haplotypecaller/HaplotypeCaller.java | 6 +-- .../LikelihoodCalculationEngine.java | 24 ++++++++--- .../SimpleDeBruijnAssembler.java | 17 ++++++-- .../SimpleDeBruijnAssemblerUnitTest.java | 40 +++++++++++++++++++ .../broadinstitute/sting/utils/Haplotype.java | 14 ++++++- .../broadinstitute/sting/utils/MathUtils.java | 7 ++++ .../sting/utils/fragments/FragmentUtils.java | 23 ++++++++++- 7 files changed, 115 insertions(+), 16 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 1130feaea..8acab3e3c 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -356,9 +356,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } } } - genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual); - genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF ); - genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD; + genotypeLikelihoods[AA] += p.getRepresentativeCount()* QualityUtils.qualToProbLog10(qual); + genotypeLikelihoods[AB] += p.getRepresentativeCount()* MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF ); + genotypeLikelihoods[BB] += p.getRepresentativeCount()* QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD; } } genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 365459882..a3179681e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -50,7 +50,6 @@ public class LikelihoodCalculationEngine { } public void computeReadLikelihoods( final ArrayList haplotypes, final HashMap> perSampleReadList ) { - final int numHaplotypes = haplotypes.size(); int X_METRIC_LENGTH = 0; for( final String sample : perSampleReadList.keySet() ) { @@ -60,8 +59,8 @@ public class LikelihoodCalculationEngine { } } int Y_METRIC_LENGTH = 0; - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - final int haplotypeLength = haplotypes.get(jjj).getBases().length; + for( Haplotype h: haplotypes ) { + final int haplotypeLength = h.getBases().length; if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } } @@ -90,8 +89,10 @@ public class LikelihoodCalculationEngine { final int numHaplotypes = haplotypes.size(); final int numReads = reads.size(); final double[][] readLikelihoods = new double[numHaplotypes][numReads]; + final int[][] readCounts = new int[numHaplotypes][numReads]; for( int iii = 0; iii < numReads; iii++ ) { final GATKSAMRecord read = reads.get(iii); + final int readCount = getRepresentativeReadCount(read); final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? @@ -114,13 +115,23 @@ public class LikelihoodCalculationEngine { readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, matchMetricArray, XMetricArray, YMetricArray); + readCounts[jjj][iii] = readCount; } } for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj] ); + haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj], readCounts[jjj] ); } } + private static int getRepresentativeReadCount(GATKSAMRecord read) { + if (!read.isReducedRead()) + return 1; + + // compute mean representative read counts + final byte[] counts = read.getReducedReadCounts(); + return MathUtils.sum(counts)/counts.length; + } + private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) { for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){ if( b1[iii] != b2[iii] ) { @@ -154,17 +165,20 @@ public class LikelihoodCalculationEngine { } // compute the diploid haplotype likelihoods + // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) { final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); + final int[] readCounts_iii = iii_mapped.getReadCounts(sample); for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) { final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample); double haplotypeLikelihood = 0.0; for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // log10(10^(a*x1) + 10^(b*x2)) // First term is approximated by Jacobian log with table lookup. - haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF; + haplotypeLikelihood +=readCounts_iii[kkk] *( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF); } haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum? } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index e2bc7a10f..f3dd3babb 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -4,6 +4,7 @@ import com.google.java.contract.Ensures; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -68,7 +69,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); } - private void createDeBruijnGraphs( final ArrayList reads, final Haplotype refHaplotype ) { + protected void createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { graphs.clear(); // create the graph @@ -161,7 +162,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } } - private static boolean createGraphFromSequences( final DefaultDirectedGraph graph, final ArrayList reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { + private static boolean createGraphFromSequences( final DefaultDirectedGraph graph, final Collection reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { final byte[] refSequence = refHaplotype.getBases(); if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; @@ -183,6 +184,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final GATKSAMRecord read : reads ) { final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); + final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not readuced if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = sequence.length - KMER_LENGTH + 1; for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { @@ -194,6 +196,12 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { break; } } + int countNumber = 1; + if (read.isReducedRead()) { + // compute min (?) number of reduced read counts in current kmer span + countNumber = MathUtils.arrayMin(Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1)); + } + if( !badKmer ) { // get the kmers final byte[] kmer1 = new byte[KMER_LENGTH]; @@ -201,7 +209,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final byte[] kmer2 = new byte[KMER_LENGTH]; System.arraycopy(sequence, iii+1, kmer2, 0, KMER_LENGTH); - addKmersToGraph(graph, kmer1, kmer2, false); + for (int k=0; k < countNumber; k++) + addKmersToGraph(graph, kmer1, kmer2, false); } } } @@ -230,7 +239,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { return true; } - private void printGraphs() { + protected void printGraphs() { int count = 0; for( final DefaultDirectedGraph graph : graphs ) { GRAPH_WRITER.println("digraph kmer" + count++ +" {"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java index 4f42d5bc8..a83afdbab 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java @@ -7,6 +7,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; */ import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.ArtificialReadPileupTestProvider; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -18,6 +20,7 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; +import java.io.PrintStream; import java.util.*; public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { @@ -143,6 +146,43 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { Assert.assertTrue(graphEquals(graph, expectedGraph)); } + @Test(enabled=true) + public void testBasicGraphCreation() { + final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref"); + final byte refBase = refPileupTestProvider.getReferenceContext().getBase(); + final String altBase = (refBase==(byte)'A'?"C":"A"); + final int matches = 50; + final int mismatches = 50; + Map refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, altBase, new int[]{matches, mismatches}, false, 30); + PrintStream graphWriter = null; + + try{ + graphWriter = new PrintStream("du.txt"); + } catch (Exception e) {} + + + SimpleDeBruijnAssembler assembler = new SimpleDeBruijnAssembler(true,graphWriter); + final Haplotype refHaplotype = new Haplotype(refPileupTestProvider.getReferenceContext().getBases()); + refHaplotype.setIsReference(true); + assembler.createDeBruijnGraphs(refContext.get(refPileupTestProvider.getSampleNames().get(0)).getBasePileup().getReads(), refHaplotype); + +/* // clean up the graphs by pruning and merging + for( final DefaultDirectedGraph graph : graphs ) { + SimpleDeBruijnAssembler.pruneGraph( graph, PRUNE_FACTOR ); + //eliminateNonRefPaths( graph ); + SimpleDeBruijnAssembler.mergeNodes( graph ); + } + */ + if( graphWriter != null ) { + assembler.printGraphs(); + } + + int k=2; + + // find the best paths in the graphs + // return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); + + } @Test(enabled = true) public void testEliminateNonRefPaths() { DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 54442622f..eca2e454e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -40,6 +40,7 @@ public class Haplotype { protected final double[] quals; private GenomeLoc genomeLocation = null; private HashMap readLikelihoodsPerSample = null; + private HashMap readCountsPerSample = null; private HashMap eventMap = null; private boolean isRef = false; private Cigar cigar; @@ -83,18 +84,27 @@ public class Haplotype { return Arrays.hashCode(bases); } - public void addReadLikelihoods( final String sample, final double[] readLikelihoods ) { + public void addReadLikelihoods( final String sample, final double[] readLikelihoods, final int[] readCounts ) { if( readLikelihoodsPerSample == null ) { readLikelihoodsPerSample = new HashMap(); } readLikelihoodsPerSample.put(sample, readLikelihoods); + if( readCountsPerSample == null ) { + readCountsPerSample = new HashMap(); + } + readCountsPerSample.put(sample, readCounts); } @Ensures({"result != null"}) public double[] getReadLikelihoods( final String sample ) { return readLikelihoodsPerSample.get(sample); } - + + @Ensures({"result != null"}) + public int[] getReadCounts( final String sample ) { + return readCountsPerSample.get(sample); + } + public Set getSampleKeySet() { return readLikelihoodsPerSample.keySet(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 1f5eaefee..96704f0b8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -210,6 +210,13 @@ public class MathUtils { return total; } + public static int sum(byte[] x) { + int total = 0; + for (byte v : x) + total += (int)v; + return total; + } + /** * Calculates the log10 cumulative sum of an array with log10 probabilities * diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 851272673..2f31c154c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -134,17 +134,36 @@ public class FragmentUtils { GATKSAMRecord firstRead = overlappingPair.get(0); GATKSAMRecord secondRead = overlappingPair.get(1); - if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + /* + System.out.println("read 0 unclipped start:"+overlappingPair.get(0).getUnclippedStart()); + System.out.println("read 0 unclipped end:"+overlappingPair.get(0).getUnclippedEnd()); + System.out.println("read 1 unclipped start:"+overlappingPair.get(1).getUnclippedStart()); + System.out.println("read 1 unclipped end:"+overlappingPair.get(1).getUnclippedEnd()); + System.out.println("read 0 start:"+overlappingPair.get(0).getAlignmentStart()); + System.out.println("read 0 end:"+overlappingPair.get(0).getAlignmentEnd()); + System.out.println("read 1 start:"+overlappingPair.get(1).getAlignmentStart()); + System.out.println("read 1 end:"+overlappingPair.get(1).getAlignmentEnd()); + */ + if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { firstRead = overlappingPair.get(1); // swap them secondRead = overlappingPair.get(0); } - if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A } if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) { return overlappingPair; // fragments contain indels so don't merge them } +/* // check for inconsistent start positions between uncliped/soft alignment starts + if (secondRead.getAlignmentStart() >= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() < firstRead.getUnclippedStart()) + return overlappingPair; + if (secondRead.getAlignmentStart() <= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() > firstRead.getUnclippedStart()) + return overlappingPair; + + if (secondRead.getUnclippedStart() < firstRead.getAlignmentEnd() && secondRead.getAlignmentStart() >= firstRead.getAlignmentEnd()) + return overlappingPair; + */ final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() );