diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 1130feaea..8acab3e3c 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -356,9 +356,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } } } - genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual); - genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF ); - genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD; + genotypeLikelihoods[AA] += p.getRepresentativeCount()* QualityUtils.qualToProbLog10(qual); + genotypeLikelihoods[AB] += p.getRepresentativeCount()* MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF ); + genotypeLikelihoods[BB] += p.getRepresentativeCount()* QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD; } } genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 365459882..a3179681e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -50,7 +50,6 @@ public class LikelihoodCalculationEngine { } public void computeReadLikelihoods( final ArrayList haplotypes, final HashMap> perSampleReadList ) { - final int numHaplotypes = haplotypes.size(); int X_METRIC_LENGTH = 0; for( final String sample : perSampleReadList.keySet() ) { @@ -60,8 +59,8 @@ public class LikelihoodCalculationEngine { } } int Y_METRIC_LENGTH = 0; - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - final int haplotypeLength = haplotypes.get(jjj).getBases().length; + for( Haplotype h: haplotypes ) { + final int haplotypeLength = h.getBases().length; if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } } @@ -90,8 +89,10 @@ public class LikelihoodCalculationEngine { final int numHaplotypes = haplotypes.size(); final int numReads = reads.size(); final double[][] readLikelihoods = new double[numHaplotypes][numReads]; + final int[][] readCounts = new int[numHaplotypes][numReads]; for( int iii = 0; iii < numReads; iii++ ) { final GATKSAMRecord read = reads.get(iii); + final int readCount = getRepresentativeReadCount(read); final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? @@ -114,13 +115,23 @@ public class LikelihoodCalculationEngine { readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, matchMetricArray, XMetricArray, YMetricArray); + readCounts[jjj][iii] = readCount; } } for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj] ); + haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj], readCounts[jjj] ); } } + private static int getRepresentativeReadCount(GATKSAMRecord read) { + if (!read.isReducedRead()) + return 1; + + // compute mean representative read counts + final byte[] counts = read.getReducedReadCounts(); + return MathUtils.sum(counts)/counts.length; + } + private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) { for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){ if( b1[iii] != b2[iii] ) { @@ -154,17 +165,20 @@ public class LikelihoodCalculationEngine { } // compute the diploid haplotype likelihoods + // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) { final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); + final int[] readCounts_iii = iii_mapped.getReadCounts(sample); for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) { final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample); double haplotypeLikelihood = 0.0; for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // log10(10^(a*x1) + 10^(b*x2)) // First term is approximated by Jacobian log with table lookup. - haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF; + haplotypeLikelihood +=readCounts_iii[kkk] *( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF); } haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum? } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index e2bc7a10f..f3dd3babb 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -4,6 +4,7 @@ import com.google.java.contract.Ensures; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -68,7 +69,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); } - private void createDeBruijnGraphs( final ArrayList reads, final Haplotype refHaplotype ) { + protected void createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { graphs.clear(); // create the graph @@ -161,7 +162,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } } - private static boolean createGraphFromSequences( final DefaultDirectedGraph graph, final ArrayList reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { + private static boolean createGraphFromSequences( final DefaultDirectedGraph graph, final Collection reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { final byte[] refSequence = refHaplotype.getBases(); if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; @@ -183,6 +184,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final GATKSAMRecord read : reads ) { final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); + final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not readuced if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = sequence.length - KMER_LENGTH + 1; for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { @@ -194,6 +196,12 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { break; } } + int countNumber = 1; + if (read.isReducedRead()) { + // compute min (?) number of reduced read counts in current kmer span + countNumber = MathUtils.arrayMin(Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1)); + } + if( !badKmer ) { // get the kmers final byte[] kmer1 = new byte[KMER_LENGTH]; @@ -201,7 +209,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final byte[] kmer2 = new byte[KMER_LENGTH]; System.arraycopy(sequence, iii+1, kmer2, 0, KMER_LENGTH); - addKmersToGraph(graph, kmer1, kmer2, false); + for (int k=0; k < countNumber; k++) + addKmersToGraph(graph, kmer1, kmer2, false); } } } @@ -230,7 +239,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { return true; } - private void printGraphs() { + protected void printGraphs() { int count = 0; for( final DefaultDirectedGraph graph : graphs ) { GRAPH_WRITER.println("digraph kmer" + count++ +" {"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java index 4f42d5bc8..a83afdbab 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java @@ -7,6 +7,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; */ import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.ArtificialReadPileupTestProvider; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -18,6 +20,7 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; +import java.io.PrintStream; import java.util.*; public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { @@ -143,6 +146,43 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { Assert.assertTrue(graphEquals(graph, expectedGraph)); } + @Test(enabled=true) + public void testBasicGraphCreation() { + final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref"); + final byte refBase = refPileupTestProvider.getReferenceContext().getBase(); + final String altBase = (refBase==(byte)'A'?"C":"A"); + final int matches = 50; + final int mismatches = 50; + Map refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, altBase, new int[]{matches, mismatches}, false, 30); + PrintStream graphWriter = null; + + try{ + graphWriter = new PrintStream("du.txt"); + } catch (Exception e) {} + + + SimpleDeBruijnAssembler assembler = new SimpleDeBruijnAssembler(true,graphWriter); + final Haplotype refHaplotype = new Haplotype(refPileupTestProvider.getReferenceContext().getBases()); + refHaplotype.setIsReference(true); + assembler.createDeBruijnGraphs(refContext.get(refPileupTestProvider.getSampleNames().get(0)).getBasePileup().getReads(), refHaplotype); + +/* // clean up the graphs by pruning and merging + for( final DefaultDirectedGraph graph : graphs ) { + SimpleDeBruijnAssembler.pruneGraph( graph, PRUNE_FACTOR ); + //eliminateNonRefPaths( graph ); + SimpleDeBruijnAssembler.mergeNodes( graph ); + } + */ + if( graphWriter != null ) { + assembler.printGraphs(); + } + + int k=2; + + // find the best paths in the graphs + // return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); + + } @Test(enabled = true) public void testEliminateNonRefPaths() { DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 54442622f..eca2e454e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -40,6 +40,7 @@ public class Haplotype { protected final double[] quals; private GenomeLoc genomeLocation = null; private HashMap readLikelihoodsPerSample = null; + private HashMap readCountsPerSample = null; private HashMap eventMap = null; private boolean isRef = false; private Cigar cigar; @@ -83,18 +84,27 @@ public class Haplotype { return Arrays.hashCode(bases); } - public void addReadLikelihoods( final String sample, final double[] readLikelihoods ) { + public void addReadLikelihoods( final String sample, final double[] readLikelihoods, final int[] readCounts ) { if( readLikelihoodsPerSample == null ) { readLikelihoodsPerSample = new HashMap(); } readLikelihoodsPerSample.put(sample, readLikelihoods); + if( readCountsPerSample == null ) { + readCountsPerSample = new HashMap(); + } + readCountsPerSample.put(sample, readCounts); } @Ensures({"result != null"}) public double[] getReadLikelihoods( final String sample ) { return readLikelihoodsPerSample.get(sample); } - + + @Ensures({"result != null"}) + public int[] getReadCounts( final String sample ) { + return readCountsPerSample.get(sample); + } + public Set getSampleKeySet() { return readLikelihoodsPerSample.keySet(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 1f5eaefee..96704f0b8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -210,6 +210,13 @@ public class MathUtils { return total; } + public static int sum(byte[] x) { + int total = 0; + for (byte v : x) + total += (int)v; + return total; + } + /** * Calculates the log10 cumulative sum of an array with log10 probabilities * diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 851272673..2f31c154c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -134,17 +134,36 @@ public class FragmentUtils { GATKSAMRecord firstRead = overlappingPair.get(0); GATKSAMRecord secondRead = overlappingPair.get(1); - if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + /* + System.out.println("read 0 unclipped start:"+overlappingPair.get(0).getUnclippedStart()); + System.out.println("read 0 unclipped end:"+overlappingPair.get(0).getUnclippedEnd()); + System.out.println("read 1 unclipped start:"+overlappingPair.get(1).getUnclippedStart()); + System.out.println("read 1 unclipped end:"+overlappingPair.get(1).getUnclippedEnd()); + System.out.println("read 0 start:"+overlappingPair.get(0).getAlignmentStart()); + System.out.println("read 0 end:"+overlappingPair.get(0).getAlignmentEnd()); + System.out.println("read 1 start:"+overlappingPair.get(1).getAlignmentStart()); + System.out.println("read 1 end:"+overlappingPair.get(1).getAlignmentEnd()); + */ + if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { firstRead = overlappingPair.get(1); // swap them secondRead = overlappingPair.get(0); } - if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A } if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) { return overlappingPair; // fragments contain indels so don't merge them } +/* // check for inconsistent start positions between uncliped/soft alignment starts + if (secondRead.getAlignmentStart() >= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() < firstRead.getUnclippedStart()) + return overlappingPair; + if (secondRead.getAlignmentStart() <= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() > firstRead.getUnclippedStart()) + return overlappingPair; + + if (secondRead.getUnclippedStart() < firstRead.getAlignmentEnd() && secondRead.getAlignmentStart() >= firstRead.getAlignmentEnd()) + return overlappingPair; + */ final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() );