From a783f19ab12060084c9811902365d7629b1631ca Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Mar 2013 13:45:53 -0500 Subject: [PATCH 02/16] Fix for potential HaplotypeCaller bug in annotation ordering -- Annotations were being called on a VariantContext that might need to be trimmed. Simply inverted the order of operations so trimming occurs before the annotations are added. -- Minor cleanup of call to PairHMM in LikelihoodCalculationEngine --- .../walkers/haplotypecaller/GenotypingEngine.java | 13 ++++++++----- .../LikelihoodCalculationEngine.java | 9 ++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 1cfc65581..400de6485 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -273,16 +273,19 @@ public class GenotypingEngine { final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) ); final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); - VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call); + + VariantContext annotatedCall = call; + // TODO -- should be before annotated call, so that QDL works correctly + if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! 
+ annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); + } + + annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, annotatedCall); // maintain the set of all called haplotypes for ( final Allele calledAllele : call.getAlleles() ) calledHaplotypes.addAll(alleleMapper.get(calledAllele)); - if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! - annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); - } - returnCalls.add( annotatedCall ); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a7d85b969..87b488b3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -151,9 +151,12 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 
0 : PairHMM.findFirstPositionWhereHaplotypesDiffer(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), - pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), - readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0)); + final boolean isFirstHaplotype = jjj == 0; + final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), + read.getReadBases(), readQuals, readInsQuals, readDelQuals, + overallGCP, haplotypeStart, isFirstHaplotype); + + perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); } } return perReadAlleleLikelihoodMap; From 752440707d6005104410ff67f79fe410723df964 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Mar 2013 13:52:53 -0500 Subject: [PATCH 03/16] AlignmentUtils.calcNumDifferentBases computes the number of bases that differ between a reference and read sequence given a cigar between the two. 
--- .../sting/utils/sam/AlignmentUtils.java | 39 +++++++++++++++++++ .../utils/sam/AlignmentUtilsUnitTest.java | 30 +++++++++++++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index d59d0ef63..58f70d4b6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -48,6 +48,45 @@ public final class AlignmentUtils { // cannot be instantiated private AlignmentUtils() { } + /** + * Get the number of bases at which refSeq and readSeq differ, given their alignment + * + * @param cigar the alignment of readSeq to refSeq + * @param refSeq the bases of the reference sequence + * @param readSeq the bases of the read sequence + * @return the number of bases that differ between refSeq and readSeq + */ + public static int calcNumDifferentBases(final Cigar cigar, final byte[] refSeq, final byte[] readSeq) { + int refIndex = 0, readIdx = 0, delta = 0; + + for (final CigarElement ce : cigar.getCigarElements()) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case X:case EQ:case M: + for (int j = 0; j < elementLength; j++, refIndex++, readIdx++) + delta += refSeq[refIndex] != readSeq[readIdx] ? 
1 : 0; + break; + case I: + delta += elementLength; + case S: + readIdx += elementLength; + break; + case D: + delta += elementLength; + case N: + refIndex += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); + } + } + + return delta; + } + public static class MismatchCount { public int numMismatches = 0; public long mismatchQualities = 0; diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index ae01c6c63..660dadc00 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -37,7 +37,7 @@ import org.testng.annotations.Test; import java.util.*; public class AlignmentUtilsUnitTest { - private final static boolean DEBUG = false; + private final static boolean DEBUG = true; private SAMFileHeader header; /** Basic aligned and mapped read. 
*/ @@ -145,6 +145,34 @@ public class AlignmentUtilsUnitTest { } + @DataProvider(name = "CalcNumDifferentBasesData") + public Object[][] makeCalcNumDifferentBasesData() { + List tests = new ArrayList(); + + tests.add(new Object[]{"5M", "ACGTA", "ACGTA", 0}); + tests.add(new Object[]{"5M", "ACGTA", "ACGTT", 1}); + tests.add(new Object[]{"5M", "ACGTA", "TCGTT", 2}); + tests.add(new Object[]{"5M", "ACGTA", "TTGTT", 3}); + tests.add(new Object[]{"5M", "ACGTA", "TTTTT", 4}); + tests.add(new Object[]{"5M", "ACGTA", "TTTCT", 5}); + tests.add(new Object[]{"2M3I3M", "ACGTA", "ACNNNGTA", 3}); + tests.add(new Object[]{"2M3I3M", "ACGTA", "ACNNNGTT", 4}); + tests.add(new Object[]{"2M3I3M", "ACGTA", "TCNNNGTT", 5}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "ACA", 2}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "ACT", 3}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "TCT", 4}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "TGT", 5}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "CalcNumDifferentBasesData") + public void testCalcNumDifferentBases(final String cigarString, final String ref, final String read, final int expectedDifferences) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + Assert.assertEquals(AlignmentUtils.calcNumDifferentBases(cigar, ref.getBytes(), read.getBytes()), expectedDifferences); + } + + @DataProvider(name = "NumAlignedBasesCountingSoftClips") public Object[][] makeNumAlignedBasesCountingSoftClips() { List tests = new ArrayList(); From a8fb26bf0167147bae2c3896e41be5049dd0bb48 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Mar 2013 21:39:18 -0500 Subject: [PATCH 04/16] A generic downsampler that reduces coverage for a bunch of reads -- Exposed the underlying minElementsPerStack parameter for LevelingDownsampler --- .../gatk/downsampling/DownsamplingUtils.java | 107 ++++++++++++++++++ .../downsampling/LevelingDownsampler.java | 26 ++++- 
.../walkers/readutils/DownsampleReadsQC.java | 105 +++++++++++++++++ 3 files changed, 235 insertions(+), 3 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java new file mode 100644 index 000000000..877083829 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Utilities for using the downsamplers for common tasks + * + * User: depristo + * Date: 3/6/13 + * Time: 4:26 PM + */ +public class DownsamplingUtils { + private DownsamplingUtils() { } + + /** + * Level the coverage of the reads in each sample to no more than downsampleTo reads, without reducing + * coverage at any read start to less than minReadsPerAlignmentStart + * + * This algorithm can be used to handle the situation where you have lots of coverage in some interval, and + * want to reduce the coverage of the big peak down without removing the many reads at the edge of this + * interval that are in fact good + * + * This algorithm operates on the reads for each sample independently. + * + * @param reads a sorted list of reads + * @param downsampleTo the targeted number of reads we want from reads per sample + * @param minReadsPerAlignmentStart don't reduce the number of reads starting at a specific alignment start + * to below this. 
That is, if this value is 2, we'll never reduce the number + * of reads starting at a specific start site to less than 2 + * @return a sorted list of reads + */ + public static List levelCoverageByPosition(final List reads, final int downsampleTo, final int minReadsPerAlignmentStart) { + if ( reads == null ) throw new IllegalArgumentException("reads must not be null"); + + final List downsampled = new ArrayList(reads.size()); + + final Map>> readsBySampleByStart = partitionReadsBySampleAndStart(reads); + for ( final Map> readsByPosMap : readsBySampleByStart.values() ) { + final LevelingDownsampler, GATKSAMRecord> downsampler = new LevelingDownsampler, GATKSAMRecord>(downsampleTo, minReadsPerAlignmentStart); + downsampler.submit(readsByPosMap.values()); + downsampler.signalEndOfInput(); + for ( final List downsampledReads : downsampler.consumeFinalizedItems()) + downsampled.addAll(downsampledReads); + } + + return ReadUtils.sortReadsByCoordinate(downsampled); + } + + /** + * Build the data structure mapping for each sample -> (position -> reads at position) + * + * Note that the map position -> reads isn't ordered in any meaningful way + * + * @param reads a list of sorted reads + * @return a map containing the list of reads at each start location, for each sample independently + */ + private static Map>> partitionReadsBySampleAndStart(final List reads) { + final Map>> readsBySampleByStart = new LinkedHashMap>>(); + + for ( final GATKSAMRecord read : reads ) { + Map> readsByStart = readsBySampleByStart.get(read.getReadGroup().getSample()); + + if ( readsByStart == null ) { + readsByStart = new LinkedHashMap>(); + readsBySampleByStart.put(read.getReadGroup().getSample(), readsByStart); + } + + List readsAtStart = readsByStart.get(read.getAlignmentStart()); + if ( readsAtStart == null ) { + readsAtStart = new LinkedList(); + readsByStart.put(read.getAlignmentStart(), readsAtStart); + } + + readsAtStart.add(read); + } + + return readsBySampleByStart; + } +} diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java index 9b4b2adcb..a8a808333 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java @@ -47,8 +47,8 @@ import java.util.*; * @author David Roazen */ public class LevelingDownsampler, E> implements Downsampler { - - private int targetSize; + private final int minElementsPerStack; + private final int targetSize; private List groups; @@ -59,12 +59,32 @@ public class LevelingDownsampler, E> implements Downsampler /** * Construct a LevelingDownsampler * + * Uses the default minElementsPerStack of 1 + * * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed * this value -- if it does, items are removed from Lists evenly until the total size * is <= this value */ public LevelingDownsampler( int targetSize ) { + this(targetSize, 1); + } + + /** + * Construct a LevelingDownsampler + * + * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed + * this value -- if it does, items are removed from Lists evenly until the total size + * is <= this value + * @param minElementsPerStack no stack will be reduced below this size during downsampling. That is, + * if a stack has only 3 elements and minElementsPerStack is 3, no matter what + * we'll not reduce this stack below 3. 
+ */ + public LevelingDownsampler(final int targetSize, final int minElementsPerStack) { + if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize); + if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack); + this.targetSize = targetSize; + this.minElementsPerStack = minElementsPerStack; clear(); reset(); } @@ -148,7 +168,7 @@ public class LevelingDownsampler, E> implements Downsampler // remove any more items without violating the constraint that all groups must // be left with at least one item while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { - if ( groupSizes[currentGroupIndex] > 1 ) { + if ( groupSizes[currentGroupIndex] > minElementsPerStack ) { groupSizes[currentGroupIndex]--; numItemsToRemove--; numConsecutiveUmodifiableGroups = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java new file mode 100644 index 000000000..1141a9164 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.readutils; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; + +/** + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class DownsampleReadsQC extends ReadWalker> implements NanoSchedulable { + @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) + StingSAMFileWriter out; + + @Argument(fullName = "minReadsPerAlignmentStart", shortName = "minReadsPerAlignmentStart", doc ="", required = false) + private int minReadsPerAlignmentStart = 5; + + @Argument(fullName = "downsampleTo", shortName = "downsampleTo", doc ="", required = false) + private int downsampleTo = 1000; + + /** + * The initialize function. 
+ */ + public void initialize() { +// final boolean preSorted = true; +// if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { +// Utils.setupWriter(out, getToolkit(), getToolkit().getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME); +// } + } + + /** + * The reads map function. + * + * @param ref the reference bases that correspond to our read, if a reference was provided + * @param readIn the read itself, as a GATKSAMRecord + * @return the read itself + */ + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { + return readIn; + } + + /** + * reduceInit is called once before any calls to the map function. We use it here to create the + * empty collection that accumulates the reads passed to reduce + * + * @return a new, empty collection of reads + */ + public Collection reduceInit() { + return new LinkedList(); + } + + /** + * given a read and the collection of reads seen so far, reduce by adding the read to the collection + * + * @param read the read itself + * @param output the collection of reads accumulated so far + * @return the same collection, now including read, so that the next reduce can add to it + */ + public Collection reduce( GATKSAMRecord read, Collection output ) { + output.add(read); + return output; + } + + @Override + public void onTraversalDone(Collection result) { + for ( final GATKSAMRecord read : DownsamplingUtils.levelCoverageByPosition(new ArrayList(result), downsampleTo, minReadsPerAlignmentStart) ) + out.addAlignment(read); + } +} From ffea6dd95f34de0c979273c0783d6da75bbe16f0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 18 Mar 2013 17:06:32 -0400 Subject: [PATCH 05/16] HaplotypeCaller now has the ability to only consider the best N haplotypes for genotyping -- Added a -dontGenotype mode for testing assembly efficiency -- However, it looks like this has a very negative impact on the quality of the results, so the code 
should be deleted --- .../haplotypecaller/DeBruijnAssembler.java | 74 +++++++++++++------ .../haplotypecaller/HaplotypeCaller.java | 22 +++++- .../broadinstitute/sting/utils/Haplotype.java | 32 +++++++- 3 files changed, 101 insertions(+), 27 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 566605a8c..bf08d1526 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -52,6 +52,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -73,6 +74,7 @@ import java.util.*; */ public class DeBruijnAssembler extends LocalAssemblyEngine { + private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class); private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; @@ -85,18 +87,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final double SW_GAP = -22.0; //-1.0-1.0/3.0; private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0; - private final boolean DEBUG; - private final PrintStream GRAPH_WRITER; + private final boolean debug; + private final PrintStream graphWriter; private final List graphs = new ArrayList(); - private final int MIN_KMER; + private final int minKmer; + private final int maxHaplotypesToConsider; private int PRUNE_FACTOR = 2; - public DeBruijnAssembler(final boolean debug, final 
PrintStream graphWriter, final int minKmer) { + public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) { super(); - DEBUG = debug; - GRAPH_WRITER = graphWriter; - MIN_KMER = minKmer; + this.debug = debug; + this.graphWriter = graphWriter; + this.minKmer = minKmer; + this.maxHaplotypesToConsider = maxHaplotypesToConsider; } /** @@ -123,7 +127,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); // print the graphs if the appropriate debug option has been turned on - if( GRAPH_WRITER != null ) { + if( graphWriter != null ) { printGraphs(); } @@ -136,11 +140,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { graphs.clear(); final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1; - if( maxKmer < MIN_KMER ) { return; } // Reads are too small for assembly so don't try to create any assembly graphs + if( maxKmer < minKmer) { return; } // Reads are too small for assembly so don't try to create any assembly graphs // create the graph for each possible kmer - for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) { - final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG ); + for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) { + //if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); + final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready pruneGraph(graph, PRUNE_FACTOR); @@ -320,22 +325,22 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } protected void 
printGraphs() { - GRAPH_WRITER.println("digraph assemblyGraphs {"); + graphWriter.println("digraph assemblyGraphs {"); for( final DeBruijnAssemblyGraph graph : graphs ) { for( final DeBruijnEdge edge : graph.edgeSet() ) { if( edge.getMultiplicity() > PRUNE_FACTOR ) { - GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];"); + graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];"); } if( edge.isRef() ) { - GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];"); + graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];"); } if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } for( final DeBruijnVertex v : graph.vertexSet() ) { - GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); + graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); } } - GRAPH_WRITER.println("}"); + graphWriter.println("}"); } @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"}) @@ -343,6 +348,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private List findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { // add the reference haplotype separately from all the others to ensure that it is present in the list of 
haplotypes + // TODO -- the use of a list with contains() further down may be a performance problem, resulting in an O(N^2) algorithm final List returnHaplotypes = new ArrayList(); refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart()); final Cigar c = new Cigar(); @@ -383,7 +389,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } if( !returnHaplotypes.contains(h) ) { h.setAlignmentStartHapwrtRef(activeRegionStart); - h.setCigar( leftAlignedCigar ); + h.setCigar(leftAlignedCigar); + h.setScore(path.getScore()); returnHaplotypes.add(h); // for GGA mode, add the desired allele into the haplotype if it isn't already present @@ -409,18 +416,39 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - if( DEBUG ) { - if( returnHaplotypes.size() > 1 ) { - System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against."); + final List finalHaplotypes = selectHighestScoringHaplotypes(returnHaplotypes); + if ( finalHaplotypes.size() < returnHaplotypes.size() ) + logger.info("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); + + if( debug ) { + if( finalHaplotypes.size() > 1 ) { + System.out.println("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); } else { System.out.println("Found only the reference haplotype in the assembly graph."); } - for( final Haplotype h : returnHaplotypes ) { + for( final Haplotype h : finalHaplotypes ) { System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() ); + System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); } } - return returnHaplotypes; + return finalHaplotypes; + } + + /** + * Select 
the best scoring haplotypes among all present, returning no more than maxHaplotypesToConsider + * + * @param haplotypes a list of haplotypes to consider + * @return a sublist of the best haplotypes, with size() <= maxHaplotypesToConsider + */ + private List selectHighestScoringHaplotypes(final List haplotypes) { + if ( haplotypes.size() <= maxHaplotypesToConsider ) + return haplotypes; + else { + final List sorted = new ArrayList(haplotypes); + Collections.sort(sorted, new Haplotype.ScoreComparator()); + return sorted.subList(0, maxHaplotypesToConsider); + } } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 4fc075807..cff631802 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -205,6 +206,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) protected int minKmer = 11; + @Advanced + @Argument(fullName="maxHaplotypesToConsider", shortName="maxHaplotypesToConsider", doc="Maximum number of haplotypes to consider in the likelihood calculation. 
Setting this number too high can have dramatic performance implications", required = false) + protected int maxHaplotypesToConsider = 100000; + /** * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the @@ -227,6 +232,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) protected boolean justDetermineActiveRegions = false; + @Hidden + @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) + protected boolean dontGenotype = false; + /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. * dbSNP is not used in any way for the calculations themselves. 
@@ -296,6 +305,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // reference base padding size private static final int REFERENCE_PADDING = 500; + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument + private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument + // bases with quality less than or equal to this value are trimmed off the tails of the reads private static final byte MIN_TAIL_QUALITY = 20; @@ -374,7 +386,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer ); + assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer, maxHaplotypesToConsider ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); @@ -514,6 +526,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); + if (dontGenotype) + return 1; + // evaluate each sample's reads against all haplotypes final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) ); final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); @@ -575,7 +590,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // //--------------------------------------------------------------------------------------------------------------- - private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { + private void finalizeActiveRegion( final ActiveRegion activeRegion ) { if( 
DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } final List finalizedReadList = new ArrayList(); final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); @@ -599,7 +614,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } } } - activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse)); + + activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart)); } private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 415cb73ac..070ae4f5d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -41,12 +41,12 @@ import java.io.Serializable; import java.util.*; public class Haplotype extends Allele { - private GenomeLoc genomeLocation = null; private Map eventMap = null; private Cigar cigar; private int alignmentStartHapwrtRef; private Event artificialEvent = null; + private double score = 0; /** * Main constructor @@ -259,4 +259,34 @@ public class Haplotype extends Allele { this.pos = pos; } } + + /** + * Get the score (an estimate of the support) of this haplotype + * @return a double, where higher values are better + */ + public double getScore() { + return this.isReference() ? Double.MAX_VALUE : score; + } + + /** + * Set the score (an estimate of the support) of this haplotype. 
+ * + * Note that if this is the reference haplotype it is always given Double.MAX_VALUE score + * + * @param score a double, where higher values are better + */ + public void setScore(double score) { + this.score = this.isReference() ? Double.MAX_VALUE : score; + } + + /** + * A comparator that sorts haplotypes in decreasing order of score, so that the best supported + * haplotypes are at the top + */ + public static class ScoreComparator implements Comparator { + @Override + public int compare(Haplotype o1, Haplotype o2) { + return -1 * Double.valueOf(o1.getScore()).compareTo(o2.getScore()); + } + } } From 53a904bcbd8ec63420a76e98e7dda6432d2907f8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 8 Mar 2013 11:28:22 -0500 Subject: [PATCH 06/16] Bugfix for HaplotypeCaller: GSA-822 for trimming softclipped reads -- Previous version would not trim down soft clip bases that extend beyond the active region, causing the assembly graph to go haywire. The new code explicitly reverts soft clips to M bases with the ever useful ReadClipper, and then trims. Note this isn't a 100% fix for the issue, as it's possible that the newly unclipped bases might in reality extend beyond the active region, should their true alignment include a deletion in the reference. Needs to be fixed. 
JIRA added -- See https://jira.broadinstitute.org/browse/GSA-822 -- #resolve #fix GSA-822 --- .../haplotypecaller/DeBruijnAssembler.java | 18 +++++++++++-- .../DeBruijnAssemblyGraph.java | 27 ++++++++++++++++--- .../haplotypecaller/HaplotypeCaller.java | 12 +++++++++ 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index bf08d1526..33198ce8c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -271,9 +271,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) protected static DeBruijnAssemblyGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); + final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(KMER_LENGTH); // First pull kmers from the reference haplotype and add them to the graph + //logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString()); final byte[] refSequence = refHaplotype.getBases(); if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; @@ -289,6 +290,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // Next pull kmers out of every read and throw them on the graph for( final GATKSAMRecord read : reads ) { + //if ( ! 
read.getReadName().equals("H06JUADXX130110:1:1213:15422:11590")) continue; + //logger.info("Adding read " + read + " with sequence " + read.getReadString()); final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced @@ -325,8 +328,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } protected void printGraphs() { + final boolean onlyWriteOneGraph = false; // debugging flag -- if true we'll only write a graph for a single kmer size + final int writeFirstGraphWithSizeSmallerThan = 50; + graphWriter.println("digraph assemblyGraphs {"); for( final DeBruijnAssemblyGraph graph : graphs ) { + if ( onlyWriteOneGraph && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { + logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); + continue; + } + for( final DeBruijnEdge edge : graph.edgeSet() ) { if( edge.getMultiplicity() > PRUNE_FACTOR ) { graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? 
"style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];"); @@ -337,8 +348,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } for( final DeBruijnVertex v : graph.vertexSet() ) { - graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); + graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\",shape=box]"); } + + if ( onlyWriteOneGraph ) + break; } graphWriter.println("}"); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java index 6a95049d1..d28f81b55 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java @@ -47,9 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.jgrapht.graph.DefaultDirectedGraph; import java.io.PrintStream; @@ -62,9 +60,32 @@ import java.util.Arrays; */ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph { + private final int kmerSize; - public DeBruijnAssemblyGraph() { + /** + * Construct a DeBruijnAssemblyGraph with kmerSize + * @param kmerSize + */ + public DeBruijnAssemblyGraph(final int kmerSize) { super(DeBruijnEdge.class); + + if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize); + this.kmerSize = kmerSize; + } + + /** + * Test construct that makes DeBruijnAssemblyGraph assuming a 
kmerSize of 11 + */ + protected DeBruijnAssemblyGraph() { + this(11); + } + + /** + * How big of a kmer did we use to create this graph? + * @return + */ + public int getKmerSize() { + return kmerSize; } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index cff631802..affad6450 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -608,8 +608,20 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + + // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches + // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't + // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion + // TODO -- w.r.t. the reference. 
What really needs to happen is that kmers that occur before the + // TODO -- reference haplotype start must be removed + clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); + + // uncomment to remove hard clips from consideration at all + //clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); + clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { + //logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); readsToUse.add(clippedRead); } } From 0f4328f6fe0bdb08e0d82553a27bd2fd0d5668d5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 8 Mar 2013 13:10:15 -0500 Subject: [PATCH 07/16] Basic kmer error correction algorithm for the HaplotypeCaller -- Error correction algorithm for the assembler. Only error correct reads to others that are exactly 1 mismatch away -- The assembler logic is now: build initial graph, error correct*, merge nodes*, prune dead nodes, merge again, make haplotypes. The * elements are new -- Refactored the printing routines a bit so it's easy to write a single graph to disk for testing. 
-- Easier way to control the testing of the graph assembly algorithms -- Move graph printing function to DeBruijnAssemblyGraph from DeBruijnAssembler -- Simple protected parsing function for making DeBruijnAssemblyGraph -- Change the default prune factor for the graph to 1, from 2 -- debugging graph transformations are controllable from command line --- .../haplotypecaller/DeBruijnAssembler.java | 107 ++++++-- .../DeBruijnAssemblyGraph.java | 115 ++++++-- .../haplotypecaller/DeBruijnVertex.java | 12 + .../haplotypecaller/HaplotypeCaller.java | 7 +- .../haplotypecaller/KMerErrorCorrector.java | 253 ++++++++++++++++++ .../DeBruijnAssemblerUnitTest.java | 68 ++++- .../KMerErrorCorrectorUnitTest.java | 78 ++++++ 7 files changed, 594 insertions(+), 46 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 33198ce8c..0caebebee 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -64,6 +64,9 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.PrintStream; import java.util.*; @@ -88,16 +91,19 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0; private final boolean debug; + private final int 
onlyBuildKmerGraphOfThisSite = -1; // 35; + private final boolean debugGraphTransformations; private final PrintStream graphWriter; private final List graphs = new ArrayList(); private final int minKmer; private final int maxHaplotypesToConsider; private int PRUNE_FACTOR = 2; - - public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) { + + public DeBruijnAssembler(final boolean debug, final boolean debugGraphTransformations, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) { super(); this.debug = debug; + this.debugGraphTransformations = debugGraphTransformations; this.graphWriter = graphWriter; this.minKmer = minKmer; this.maxHaplotypesToConsider = maxHaplotypesToConsider; @@ -144,13 +150,23 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // create the graph for each possible kmer for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) { - //if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); - final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); + if ( onlyBuildKmerGraphOfThisSite != -1 && kmer != onlyBuildKmerGraphOfThisSite ) + continue; + + if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); + DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready - pruneGraph(graph, PRUNE_FACTOR); + if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR); + graph = graph.errorCorrect(); + if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), 
PRUNE_FACTOR); cleanNonRefPaths(graph); mergeNodes(graph); + if ( debugGraphTransformations ) graph.printGraph(new File("merged.dot"), PRUNE_FACTOR); + pruneGraph(graph, PRUNE_FACTOR); + if ( debugGraphTransformations ) graph.printGraph(new File("pruned.dot"), PRUNE_FACTOR); + mergeNodes(graph); + if ( debugGraphTransformations ) graph.printGraph(new File("merged2.dot"), PRUNE_FACTOR); if( graph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference sanityCheckReferenceGraph(graph, refHaplotype); graphs.add(graph); @@ -169,7 +185,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e); final DeBruijnVertex incomingVertex = graph.getEdgeSource(e); if( !outgoingVertex.equals(incomingVertex) && graph.outDegreeOf(incomingVertex) == 1 && graph.inDegreeOf(outgoingVertex) == 1 && - graph.inDegreeOf(incomingVertex) <= 1 && graph.outDegreeOf(outgoingVertex) <= 1 && graph.isReferenceNode(incomingVertex) == graph.isReferenceNode(outgoingVertex) ) { + graph.inDegreeOf(incomingVertex) <= 1 && graph.outDegreeOf(outgoingVertex) <= 1 && graph.isReferenceNode(incomingVertex) == graph.isReferenceNode(outgoingVertex) ) { final Set outEdges = graph.outgoingEdgesOf(outgoingVertex); final Set inEdges = graph.incomingEdgesOf(incomingVertex); if( inEdges.size() == 1 && outEdges.size() == 1 ) { @@ -199,6 +215,59 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } + // + // X -> ABC -> Y + // -> aBC -> Y + // + // becomes + // + // X -> A -> BCY + // -> a -> BCY + // +// @Requires({"graph != null"}) +// protected static void simplifyMergedGraph(final DeBruijnAssemblyGraph graph) { +// boolean foundNodesToMerge = true; +// while( foundNodesToMerge ) { +// foundNodesToMerge = false; +// +// for( final DeBruijnVertex v : graph.vertexSet() ) { +// if ( isRootOfComplexDiamond(v) ) { +// foundNodesToMerge = simplifyComplexDiamond(graph, v); +// if ( 
foundNodesToMerge ) +// break; +// } +// } +// } +// } +// +// private static boolean simplifyComplexDiamond(final DeBruijnAssemblyGraph graph, final DeBruijnVertex root) { +// final Set outEdges = graph.outgoingEdgesOf(root); +// final DeBruijnVertex diamondBottom = graph.getEdge(graph.getEdgeTarget(outEdges.iterator().next()); +// // all of the edges point to the same sink, so it's time to merge +// final byte[] commonSuffix = commonSuffixOfEdgeTargets(outEdges, targetSink); +// if ( commonSuffix != null ) { +// final DeBruijnVertex suffixVertex = new DeBruijnVertex(commonSuffix, graph.getKmerSize()); +// graph.addVertex(suffixVertex); +// graph.addEdge(suffixVertex, targetSink); +// +// for( final DeBruijnEdge edge : outEdges ) { +// final DeBruijnVertex target = graph.getEdgeTarget(edge); +// final DeBruijnVertex prefix = target.withoutSuffix(commonSuffix); +// graph.addEdge(prefix, suffixVertex, new DeBruijnEdge(edge.isRef(), edge.getMultiplicity())); +// graph.removeVertex(graph.getEdgeTarget(edge)); +// graph.removeAllEdges(root, target); +// graph.removeAllEdges(target, targetSink); +// } +// +// graph.removeAllEdges(outEdges); +// graph.removeVertex(targetSink); +// +// return true; +// } else { +// return false; +// } +// } + protected static void cleanNonRefPaths( final DeBruijnAssemblyGraph graph ) { if( graph.getReferenceSourceVertex() == null || graph.getReferenceSinkVertex() == null ) { return; @@ -279,7 +348,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true) ) { + if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + 
KMER_LENGTH), true, 1) ) { if( DEBUG ) { System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping"); } @@ -297,7 +366,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = sequence.length - KMER_LENGTH + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { // if the qualities of all the bases in the kmers are high enough boolean badKmer = false; for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) { @@ -318,42 +387,32 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final byte[] kmer2 = Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH); for( int kkk=0; kkk < countNumber; kkk++ ) { - graph.addKmersToGraph(kmer1, kmer2, false); + graph.addKmersToGraph(kmer1, kmer2, false, 1); } } } } } + return graph; } protected void printGraphs() { - final boolean onlyWriteOneGraph = false; // debugging flag -- if true we'll only write a graph for a single kmer size final int writeFirstGraphWithSizeSmallerThan = 50; graphWriter.println("digraph assemblyGraphs {"); for( final DeBruijnAssemblyGraph graph : graphs ) { - if ( onlyWriteOneGraph && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { + if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); continue; } - for( final DeBruijnEdge edge : graph.edgeSet() ) { - if( edge.getMultiplicity() > PRUNE_FACTOR ) { - graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? 
"style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];"); - } - if( edge.isRef() ) { - graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];"); - } - if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } - } - for( final DeBruijnVertex v : graph.vertexSet() ) { - graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\",shape=box]"); - } + graph.printGraph(graphWriter, false, PRUNE_FACTOR); - if ( onlyWriteOneGraph ) + if ( debugGraphTransformations ) break; } + graphWriter.println("}"); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java index d28f81b55..a78a5c627 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java @@ -48,8 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; import org.jgrapht.graph.DefaultDirectedGraph; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.PrintStream; import java.util.Arrays; @@ -60,6 +64,7 @@ import java.util.Arrays; */ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph { + private final static Logger logger = Logger.getLogger(DeBruijnAssemblyGraph.class); private final int kmerSize; /** @@ -73,6 +78,24 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph " + getEdgeTarget(edge).toString() + " [" + "label=\""+ edge.getMultiplicity() +"\"" + "];"); +// if( edge.getMultiplicity() > 
PRUNE_FACTOR ) { + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); +// } if( edge.isRef() ) { - GRAPH_WRITER.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); } + //if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } + for( final DeBruijnVertex v : vertexSet() ) { - final String label = ( inDegreeOf(v) == 0 ? v.toString() : v.getSuffixString() ); - GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + label + "\"]"); + graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + "\",shape=box]"); } - GRAPH_WRITER.println("}"); + + if ( writeHeader ) + graphWriter.println("}"); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java index 1390b0ee9..aa8e24576 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -68,6 +68,18 @@ public class DeBruijnVertex { this.kmer = kmer; } + protected DeBruijnVertex( final String sequence, final int kmer ) { + this(sequence.getBytes(), kmer); + } + + protected DeBruijnVertex( final String sequence ) { + this(sequence.getBytes(), sequence.length()); + } + + public int getKmer() { + return kmer; + } + @Override public boolean equals( Object v ) { return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence); diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index affad6450..d5f283475 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -192,7 +192,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected String keepRG = null; @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) - protected int MIN_PRUNE_FACTOR = 2; + protected int MIN_PRUNE_FACTOR = 1; @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) @@ -284,6 +284,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; + @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler", required = false) + protected boolean debugGraphTransformations = false; + // the UG engines private UnifiedGenotyperEngine UG_engine = null; private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; @@ -386,7 +389,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer, maxHaplotypesToConsider ); + assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, maxHaplotypesToConsider 
); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java new file mode 100644 index 000000000..66ea8a078 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java @@ -0,0 +1,253 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import java.util.*; + +/** + * generic utility function that error corrects kmers based on counts + * + * This class provides a generic facility for remapping kmers (byte[] of constant size) + * that occur infrequently to those that occur frequently, based on their simple edit distance + * as measured by mismatches. + * + * The overall workflow of using this class is simple. 
First, you create the class with + * parameters determining how the error correction should proceed. Next, you provide all + * of the kmers you see in your data. Once all kmers have been added, you call computeErrorCorrectionMap + * to tell this class that all kmers have been added and it's time to determine the error correcting + * mapping from observed kmers to corrected kmers. This correction looks for low-count (as determined + * by maxCountToCorrect) kmers and chooses the best kmer (minimizing mismatches) among those + * with at least minCountOfKmerToBeCorrection occurrences to error correct the kmer to. If + * there is no kmer within maxMismatchesToCorrect mismatches then the kmer will be mapped to + * null, indicating the kmer should not be used. + * + * TODO -- for ease of implementation this class uses strings instead of byte[] as those cannot + * TODO -- be added to hashmaps (more specifically, those don't implement .equals). A more efficient + * TODO -- version would use the byte[] directly + * + * User: depristo + * Date: 3/8/13 + * Time: 1:16 PM + */ +public class KMerErrorCorrector { + /** + * A map from each kmer to its number of occurrences, as accumulated by addKmer + */ + Map countsByKMer = new HashMap(); + + /** + * A map from raw kmer -> error corrected kmer + */ + Map rawToErrorCorrectedMap = null; + + final int kmerLength; + final int maxCountToCorrect; + final int maxMismatchesToCorrect; + final int minCountOfKmerToBeCorrection; + + /** + * Create a new kmer corrector + * + * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1 + * @param maxCountToCorrect kmers with <= maxCountToCorrect occurrences will try to be error corrected to another kmer, must be >= 0 + * @param maxMismatchesToCorrect the maximum number of mismatches between a to-be-corrected kmer and its + * best match that we attempt to error correct. If no sufficiently similar + * kmer exists, it will be remapped to null.
Must be >= 1 + * @param minCountOfKmerToBeCorrection the minimum count of a kmer to be considered a target for correction. + * That is, kmers that need correction will only be matched with kmers + * with at least minCountOfKmerToBeCorrection occurrences. Must be >= 1 + */ + public KMerErrorCorrector(final int kmerLength, + final int maxCountToCorrect, + final int maxMismatchesToCorrect, + final int minCountOfKmerToBeCorrection) { + if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength); + if ( maxCountToCorrect < 0 ) throw new IllegalArgumentException("maxCountToCorrect must be >= 0 but got " + maxCountToCorrect); + if ( maxMismatchesToCorrect < 1 ) throw new IllegalArgumentException("maxMismatchesToCorrect must be >= 1 but got " + maxMismatchesToCorrect); + if ( minCountOfKmerToBeCorrection < 1 ) throw new IllegalArgumentException("minCountOfKmerToBeCorrection must be >= 1 but got " + minCountOfKmerToBeCorrection); + + this.kmerLength = kmerLength; + this.maxCountToCorrect = maxCountToCorrect; + this.maxMismatchesToCorrect = maxMismatchesToCorrect; + this.minCountOfKmerToBeCorrection = minCountOfKmerToBeCorrection; + } + + /** + * For testing purposes + * + * @param kmers the kmers to add, each with a count of 1, before the error correction map is computed + */ + protected void addKmers(final String ... kmers) { + for ( final String kmer : kmers ) + addKmer(kmer, 1); + computeErrorCorrectionMap(); + } + + /** + * Add a kmer that occurred kmerCount times + * + * @param rawKmer a kmer + * @param kmerCount the number of occurrences + */ + public void addKmer(final byte[] rawKmer, final int kmerCount) { + addKmer(new String(rawKmer), kmerCount); + } + + + /** + * Get the error corrected kmer for rawKmer + * + * @param rawKmer a kmer that was already added that we want to get an error corrected version for + * @return an error corrected kmer to use instead of rawKmer. May be == rawKmer if error correction + * is not necessary.
May be null, indicating the rawKmer shouldn't be used at all + */ + public byte[] getErrorCorrectedKmer(final byte[] rawKmer) { + final String result = getErrorCorrectedKmer(new String(rawKmer)); + return result == null ? null : result.getBytes(); + } + + /** + * Indicate that no more kmers will be added to the kmer error corrector, so that the + * error correction data structure should be computed from the added kmers. Enables calls + * to getErrorCorrectedKmer, and disables calls to addKmer. + */ + public void computeErrorCorrectionMap() { + if ( countsByKMer == null ) + throw new IllegalStateException("computeErrorCorrectionMap can only be called once"); + + final LinkedList needsCorrection = new LinkedList(); + final LinkedList goodKmers = new LinkedList(); + + rawToErrorCorrectedMap = new HashMap(); + for ( Map.Entry kmerCounts: countsByKMer.entrySet() ) { + if ( kmerCounts.getValue() <= maxCountToCorrect ) + needsCorrection.add(kmerCounts.getKey()); + else { + // todo -- optimization could make not in map mean == + rawToErrorCorrectedMap.put(kmerCounts.getKey(), kmerCounts.getKey()); + + // only allow corrections to kmers with at least this count + if ( kmerCounts.getValue() >= minCountOfKmerToBeCorrection ) + goodKmers.add(kmerCounts.getKey()); + } + } + + for ( final String toCorrect : needsCorrection ) { + final String corrected = findClosestKMer(toCorrect, goodKmers); + rawToErrorCorrectedMap.put(toCorrect, corrected); + } + + // cleanup memory -- we don't need the counts for each kmer any longer + countsByKMer = null; + } + + protected void addKmer(final String rawKmer, final int kmerCount) { + if ( rawKmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + rawKmer + " expected size " + kmerLength); + if ( kmerCount < 0 ) throw new IllegalArgumentException("bad kmerCount " + kmerCount); + if ( countsByKMer == null ) throw new IllegalStateException("Cannot add kmers to an already finalized error corrector"); + + final Integer
countFromMap = countsByKMer.get(rawKmer); + final int count = countFromMap == null ? 0 : countFromMap; + countsByKMer.put(rawKmer, count + kmerCount); + } + + protected String findClosestKMer(final String kmer, final Collection goodKmers) { + String bestMatch = null; + int minMismatches = Integer.MAX_VALUE; + + for ( final String goodKmer : goodKmers ) { + final int mismatches = countMismatches(kmer, goodKmer); + if ( mismatches < minMismatches ) { + minMismatches = mismatches; + bestMatch = goodKmer; + } + } + + return minMismatches > maxMismatchesToCorrect ? null : bestMatch; + } + + protected int countMismatches(final String one, final String two) { + int mismatches = 0; + for ( int i = 0; i < one.length(); i++ ) + mismatches += one.charAt(i) == two.charAt(i) ? 0 : 1; + return mismatches; + } + + protected String getErrorCorrectedKmer(final String rawKmer) { + if ( rawToErrorCorrectedMap == null ) throw new IllegalStateException("Cannot get error corrected kmers until after computeErrorCorrectionMap has been called"); + if ( rawKmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + rawKmer + " expected size " + kmerLength); + return rawToErrorCorrectedMap.get(rawKmer); + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder("KMerErrorCorrector{"); + for ( Map.Entry toCorrect : rawToErrorCorrectedMap.entrySet() ) { + final boolean correcting = ! toCorrect.getKey().equals(toCorrect.getValue()); + if ( correcting ) + b.append(String.format("%n\t%s / %d -> %s / %d [correcting? 
%b]", + toCorrect.getKey(), getCounts(toCorrect.getKey()), + toCorrect.getValue(), getCounts(toCorrect.getValue()), + correcting)); + } + b.append("\n}"); + return b.toString(); + } + + /** + * Get a simple count estimate for printing for kmer + * @param kmer the kmer + * @return an integer count for kmer + */ + private int getCounts(final String kmer) { + if ( kmer == null ) return 0; + final Integer count = countsByKMer == null ? -1 : countsByKMer.get(kmer); + if ( count == null ) + throw new IllegalArgumentException("kmer not found in counts -- bug " + kmer); + return count; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index f4a6d5494..2096b487e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -67,6 +67,7 @@ import org.testng.annotations.Test; import java.util.*; public class DeBruijnAssemblerUnitTest extends BaseTest { + private final static boolean DEBUG = true; private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { @@ -97,7 +98,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { final byte[] kmer2 = new byte[KMER_LENGTH]; System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH); - graph.addKmersToGraph(kmer1, kmer2, false); + graph.addKmersToGraph(kmer1, kmer2, false, 1); } DeBruijnAssembler.mergeNodes(graph); return graph; @@ -118,13 +119,70 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class); } - @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = true) + @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = !DEBUG) 
public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) { logger.warn(String.format("Test: %s", cfg.toString())); Assert.assertTrue(graphEquals(cfg.calcGraph(), cfg.expectedGraph())); } - @Test(enabled = true) +// @DataProvider(name = "SimpleMergeOperationsData") +// public Object[][] makeSimpleMergeOperationsData() { +// List tests = new ArrayList(); +// +// { +// DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); +// DeBruijnVertex v1 = new DeBruijnVertex("AT"); +// DeBruijnVertex v2 = new DeBruijnVertex("TC"); +// DeBruijnVertex v3 = new DeBruijnVertex("CT"); +// DeBruijnVertex v4 = new DeBruijnVertex("TG"); +// DeBruijnVertex v5 = new DeBruijnVertex("AG"); +// DeBruijnVertex v6 = new DeBruijnVertex("GG"); +// DeBruijnVertex v7 = new DeBruijnVertex("GA"); +// DeBruijnVertex v8 = new DeBruijnVertex("AA"); +// +// graph.addVertices(v1, v2, v3, v4, v5, v6, v7, v8); +// graph.addEdge(v1, v2, new DeBruijnEdge(false, 2)); +// graph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); +// graph.addEdge(v2, v4, new DeBruijnEdge(false, 5)); +// graph.addEdge(v3, v5, new DeBruijnEdge(false, 3)); +// graph.addEdge(v4, v6, new DeBruijnEdge(false, 3)); +// graph.addEdge(v5, v7, new DeBruijnEdge(false, 2)); +// graph.addEdge(v6, v7, new DeBruijnEdge(false, 6)); +// graph.addEdge(v7, v8, new DeBruijnEdge(false, 2)); +// +// graph.printGraph(new File("unittest.dot"), 1); +// +// DeBruijnAssemblyGraph expected = new DeBruijnAssemblyGraph(); +// DeBruijnVertex e1 = new DeBruijnVertex("ATC"); +// DeBruijnVertex e2 = new DeBruijnVertex("T"); +// DeBruijnVertex e3 = new DeBruijnVertex("G"); +// DeBruijnVertex e4 = new DeBruijnVertex("GAA"); +// +// expected.addVertices(e1,e2,e3,e4); +// expected.addEdge(e1, e2, new DeBruijnEdge(false, 3)); +// expected.addEdge(e1, e3, new DeBruijnEdge(false, 5)); +// expected.addEdge(e2, e4, new DeBruijnEdge(false, 2)); +// expected.addEdge(e3, e4, new DeBruijnEdge(false, 6)); +// +// expected.printGraph(new 
File("expected.dot"), 1); +// +// tests.add(new Object[]{graph.clone(), expected}); +// } +// +// return tests.toArray(new Object[][]{}); +// } +// +// @Test(dataProvider = "SimpleMergeOperationsData", enabled = true) +// public void testSimpleMergeOperations(final DeBruijnAssemblyGraph unmergedGraph, final DeBruijnAssemblyGraph expectedGraph) throws Exception { +// final DeBruijnAssemblyGraph mergedGraph = (DeBruijnAssemblyGraph)unmergedGraph.clone(); +// DeBruijnAssembler.mergeNodes(mergedGraph); +// mergedGraph.printGraph(new File("merged.dot"), 1); +// DeBruijnAssembler.simplifyMergedGraph(mergedGraph); +// mergedGraph.printGraph(new File("reduced.dot"), 1); +// Assert.assertTrue(graphEquals(mergedGraph, expectedGraph)); +// } + + @Test(enabled = !DEBUG) public void testPruneGraph() { DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); DeBruijnAssemblyGraph expectedGraph = new DeBruijnAssemblyGraph(); @@ -210,7 +268,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { return true; } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testReferenceCycleGraph() { String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC"; String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC"; @@ -221,7 +279,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation."); } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testLeftAlignCigarSequentially() { String preRefString = "GATCGATCGATC"; String postRefString = "TTT"; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java new file mode 100644 index 000000000..f88d7ee7f --- /dev/null +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java @@ -0,0 +1,78 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class KMerErrorCorrectorUnitTest extends BaseTest { + @Test + public void testMyData() { + final KMerErrorCorrector corrector = new KMerErrorCorrector(3, 1, 2, 2); + + corrector.addKmers( + "ATG", "ATG", "ATG", "ATG", + "ACC", "ACC", "ACC", + "AAA", "AAA", + "CTG", // -> ATG + "NNA", // -> AAA + "CCC", // => ACC + "NNN", // => null + "NNC" // => ACC [because of min count won't go to NNA] + ); + + Assert.assertEquals(corrector.getErrorCorrectedKmer("ATG"), "ATG"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("ACC"), "ACC"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("AAA"), "AAA"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("CTG"), "ATG"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("NNA"), "AAA"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("CCC"), "ACC"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("NNN"), null); + 
Assert.assertEquals(corrector.getErrorCorrectedKmer("NNC"), "ACC"); + } +} From 98c4cd060d098323655e9b0899a8253ef1be4b25 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 14 Mar 2013 10:03:04 -0400 Subject: [PATCH 08/16] HaplotypeCaller now uses SeqGraph instead of kmer graph to build haplotypes. -- DeBruijnAssembler functions are no longer static. This isn't the right way to unit test your code -- Add a HaplotypeCaller command line option to use low-quality bases in the assembly -- Refactored DeBruijnGraph and associated libraries into base class -- Refactored out BaseEdge, BaseGraph, and BaseVertex from DeBruijn equivalents. These DeBruijn versions now inherit from these base classes. Added some reasonable unit tests for the base and Debruijn edges and vertex classes. -- SeqVertex: allows multiple vertices in the sequence graph to have the same sequence and yet be distinct -- Further refactoring of DeBruijnAssembler in preparation for the full SeqGraph <-> DeBruijnGraph split -- Moved generic methods in DeBruijnAssembler into BaseGraph -- Created a simple SeqGraph that contains SeqVertex objects -- Simple chain zipper for SeqGraph that reproduces the results for the mergeNode function on DeBruijnGraphs -- A working version of the diamond remodeling algorithm in SeqGraph that converts graphs that look like A -> Xa, A -> Ya, Xa -> Z, Ya -> Z into A -> X -> a, A -> Y -> a, a -> Z -- Allow SeqGraph zip merging of vertices where the in vertex has multiple incoming edges or the out vertex has multiple outgoing edges -- Fix all unit tests so they work with the new SeqGraph system. All tests passed without modification.
-- Debugging makes it easier to tell which kmer graph contributes to a haplotype -- Better docs and unit tests for BaseVertex, SeqVertex, BaseEdge, and KMerErrorCorrector -- Remove unnecessary printing of cleaning info in BaseGraph -- Turn off kmer graph creation in DeBruijnAssembler.java -- Only print SeqGraphs when debugGraphTransformations is set to true -- Rename DeBruijnGraphUnitTest to SeqGraphUnitTest. Now builds DeBruijnGraph, converts to SeqGraph, uses SeqGraph.mergenodes and tests for equality. -- Update KBestPathsUnitTest to use SeqGraphs not DebruijnGraphs -- DebruijnVertex no longer takes kmer argument -- it's implicit that the kmer length is the sequence.length now --- .../{DeBruijnEdge.java => BaseEdge.java} | 70 ++-- ...ruijnAssemblyGraph.java => BaseGraph.java} | 318 ++++++++++-------- .../walkers/haplotypecaller/BaseVertex.java | 148 ++++++++ .../haplotypecaller/DeBruijnAssembler.java | 249 ++++---------- .../haplotypecaller/DeBruijnGraph.java | 179 ++++++++++ .../haplotypecaller/DeBruijnVertex.java | 63 ++-- .../haplotypecaller/HaplotypeCaller.java | 12 +- .../walkers/haplotypecaller/KBestPaths.java | 96 +++--- .../haplotypecaller/KMerErrorCorrector.java | 28 +- .../walkers/haplotypecaller/SeqGraph.java | 280 +++++++++++++++ .../walkers/haplotypecaller/SeqVertex.java | 153 +++++++++ .../haplotypecaller/BaseEdgeUnitTest.java | 105 ++++++ .../haplotypecaller/BaseGraphUnitTest.java | 192 +++++++++++ .../haplotypecaller/BaseVertexUnitTest.java | 91 +++++ .../DeBruijnAssemblerUnitTest.java | 205 +---------- .../DeBruijnAssemblyGraphUnitTest.java | 2 +- .../DeBruijnVertexUnitTest.java | 69 ++++ .../haplotypecaller/KBestPathsUnitTest.java | 183 ++++++---- .../KMerErrorCorrectorUnitTest.java | 25 +- .../haplotypecaller/SeqGraphUnitTest.java | 106 ++++++ .../haplotypecaller/SeqVertexUnitTest.java | 109 ++++++ .../org/broadinstitute/sting/utils/Utils.java | 13 + 22 files changed, 1964 insertions(+), 732 deletions(-) rename
protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{DeBruijnEdge.java => BaseEdge.java} (83%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{DeBruijnAssemblyGraph.java => BaseGraph.java} (70%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java similarity index 83% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java index 28c735b5c..053f0e1a1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java @@ -46,68 +46,94 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import org.jgrapht.graph.DefaultDirectedGraph; - import java.io.Serializable; import java.util.Comparator; /** - * Created by IntelliJ IDEA. + * simple edge class for connecting nodes in the graph + * + * Works equally well for all graph types (kmer or sequence) + * * User: ebanks * Date: Mar 23, 2011 */ - -// simple edge class for connecting nodes in the graph -public class DeBruijnEdge { - +public class BaseEdge { private int multiplicity; private boolean isRef; - public DeBruijnEdge() { - multiplicity = 1; - isRef = false; - } + /** + * Create a new BaseEdge with weight multiplicity and, if isRef == true, indicates a path through the reference + * + * @param isRef indicates whether this edge is a path through the reference + * @param multiplicity the number of observations of this edge + */ + public BaseEdge(final boolean isRef, final int multiplicity) { + if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0"); - public DeBruijnEdge( final boolean isRef ) { - multiplicity = 1; - this.isRef = isRef; - } - - public DeBruijnEdge( final boolean isRef, final int multiplicity ) { this.multiplicity = multiplicity; this.isRef = isRef; } + /** + * Copy constructor + * + * @param toCopy + */ + public BaseEdge(final BaseEdge toCopy) { + this(toCopy.isRef(), toCopy.getMultiplicity()); + } + + /** + * Get the number of observations of paths connecting two vertices + * @return a positive integer >= 0 + */ public int getMultiplicity() { return multiplicity; } + /** + * Set the multiplicity of this edge to value + * @param value an integer >= 0 + */ public void setMultiplicity( final int value ) { + if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0"); multiplicity = value; } + /** + * Does this edge indicate a path through the reference 
graph? + * @return true if so + */ public boolean isRef() { return isRef; } + /** + * Indicate that this edge follows the reference sequence, or not + * @param isRef true if this is a reference edge + */ public void setIsRef( final boolean isRef ) { this.isRef = isRef; } // For use when comparing edges pulled from the same graph - public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge ) { + public boolean equals( final BaseGraph graph, final BaseEdge edge ) { return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge))); } // For use when comparing edges across graphs! - public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge, final DeBruijnAssemblyGraph graph2 ) { + public boolean equals( final BaseGraph graph, final BaseEdge edge, final BaseGraph graph2 ) { return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge))); } - public static class EdgeWeightComparator implements Comparator, Serializable { + /** + * Sorts a collection of BaseEdges in decreasing order of weight, so that the most + * heavily weighted is at the start of the list + */ + public static class EdgeWeightComparator implements Comparator, Serializable { @Override - public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) { - return edge1.multiplicity - edge2.multiplicity; + public int compare(final BaseEdge edge1, final BaseEdge edge2) { + return edge2.multiplicity - edge1.multiplicity; } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java similarity index 70% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java rename to 
protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index a78a5c627..6aa687312 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -49,13 +49,15 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; +import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.traverse.DepthFirstIterator; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.PrintStream; -import java.util.Arrays; +import java.util.*; /** * Created with IntelliJ IDEA. @@ -63,44 +65,37 @@ import java.util.Arrays; * Date: 2/6/13 */ -public class DeBruijnAssemblyGraph extends DefaultDirectedGraph { - private final static Logger logger = Logger.getLogger(DeBruijnAssemblyGraph.class); +public class BaseGraph extends DefaultDirectedGraph { + protected final static Logger logger = Logger.getLogger(BaseGraph.class); private final int kmerSize; /** - * Construct a DeBruijnAssemblyGraph with kmerSize - * @param kmerSize + * Construct an empty BaseGraph */ - public DeBruijnAssemblyGraph(final int kmerSize) { - super(DeBruijnEdge.class); - - if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize); - this.kmerSize = kmerSize; - } - - public static DeBruijnAssemblyGraph parse(final int kmerSize, final int multiplicity, final String ... 
reads) { - final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(kmerSize); - - for ( final String read : reads ) { - final int kmersInSequence = read.length() - kmerSize + 1; - for (int i = 0; i < kmersInSequence - 1; i++) { - // get the kmers - final byte[] kmer1 = new byte[kmerSize]; - System.arraycopy(read.getBytes(), i, kmer1, 0, kmerSize); - final byte[] kmer2 = new byte[kmerSize]; - System.arraycopy(read.getBytes(), i+1, kmer2, 0, kmerSize); - graph.addKmersToGraph(kmer1, kmer2, false, multiplicity); - } - } - - return graph; + public BaseGraph() { + this(11); } /** - * Test construct that makes DeBruijnAssemblyGraph assuming a kmerSize of 11 + * Edge factory that creates non-reference multiplicity 1 edges + * @param the new of our vertices */ - protected DeBruijnAssemblyGraph() { - this(11); + private static class MyEdgeFactory implements EdgeFactory { + @Override + public BaseEdge createEdge(T sourceVertex, T targetVertex) { + return new BaseEdge(false, 1); + } + } + + /** + * Construct a DeBruijnGraph with kmerSize + * @param kmerSize + */ + public BaseGraph(final int kmerSize) { + super(new MyEdgeFactory()); + + if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize); + this.kmerSize = kmerSize; } /** @@ -115,9 +110,9 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph outgoingVerticesOf(final T v) { + final Set s = new HashSet(); + for ( final BaseEdge e : outgoingEdgesOf(v) ) { + s.add(getEdgeTarget(e)); + } + return s; + } + + /** + * Get the set of vertices connected to v by incoming edges + * @param v a non-null vertex + * @return a set of vertices {X} connected X -> v + */ + public Set incomingVerticesOf(final T v) { + final Set s = new HashSet(); + for ( final BaseEdge e : incomingEdgesOf(v) ) { + s.add(getEdgeSource(e)); + } + return s; + } + /** * Print out the graph in the dot language for visualization * @param destination File to write to @@ -403,11 +353,12 @@ public class 
DeBruijnAssemblyGraph extends DefaultDirectedGraph PRUNE_FACTOR ) { graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); // } @@ -417,11 +368,114 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph edgesToCheck = new HashSet(); + edgesToCheck.addAll(incomingEdgesOf(getReferenceSourceVertex())); + while( !edgesToCheck.isEmpty() ) { + final BaseEdge e = edgesToCheck.iterator().next(); + if( !e.isRef() ) { + edgesToCheck.addAll( incomingEdgesOf(getEdgeSource(e)) ); + removeEdge(e); + } + edgesToCheck.remove(e); + } + + edgesToCheck.addAll(outgoingEdgesOf(getReferenceSinkVertex())); + while( !edgesToCheck.isEmpty() ) { + final BaseEdge e = edgesToCheck.iterator().next(); + if( !e.isRef() ) { + edgesToCheck.addAll( outgoingEdgesOf(getEdgeTarget(e)) ); + removeEdge(e); + } + edgesToCheck.remove(e); + } + + // Run through the graph and clean up singular orphaned nodes + final List verticesToRemove = new LinkedList(); + for( final T v : vertexSet() ) { + if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { + verticesToRemove.add(v); + } + } + removeAllVertices(verticesToRemove); + } + + protected void pruneGraph( final int pruneFactor ) { + final List edgesToRemove = new ArrayList(); + for( final BaseEdge e : edgeSet() ) { + if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor + edgesToRemove.add(e); + } + } + removeAllEdges(edgesToRemove); + + // Run through the graph and clean up singular orphaned nodes + final List verticesToRemove = new ArrayList(); + for( final T v : vertexSet() ) { + if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { + verticesToRemove.add(v); + } + } + + removeAllVertices(verticesToRemove); + } + + public void removeVerticesNotConnectedToRef() { + final HashSet toRemove = new 
HashSet(vertexSet()); + final HashSet visited = new HashSet(); + + final LinkedList toVisit = new LinkedList(); + final T refV = getReferenceSourceVertex(); + if ( refV != null ) { + toVisit.add(refV); + while ( ! toVisit.isEmpty() ) { + final T v = toVisit.pop(); + if ( ! visited.contains(v) ) { + toRemove.remove(v); + visited.add(v); + for ( final T prev : incomingVerticesOf(v) ) toVisit.add(prev); + for ( final T next : outgoingVerticesOf(v) ) toVisit.add(next); + } + } + +// for ( final T remove : toRemove ) +// logger.info("Cleaning up nodes not attached to any reference node: " + remove.toString()); + + removeAllVertices(toRemove); + } + } + + public static boolean graphEquals(final BaseGraph g1, BaseGraph g2) { + if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) { + return false; + } + for( BaseEdge e1 : g1.edgeSet() ) { + boolean found = false; + for( BaseEdge e2 : g2.edgeSet() ) { + if( e1.equals(g1, e2, g2) ) { found = true; break; } + } + if( !found ) { return false; } + } + for( BaseEdge e2 : g2.edgeSet() ) { + boolean found = false; + for( BaseEdge e1 : g1.edgeSet() ) { + if( e2.equals(g2, e1, g1) ) { found = true; break; } + } + if( !found ) { return false; } + } + return true; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java new file mode 100644 index 000000000..fad7a51d1 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java @@ -0,0 +1,148 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; + +import java.util.Arrays; + +/** + * A graph vertex that holds some sequence information + * + * @author: depristo + * @since 03/2013 + */ +public class BaseVertex { + final byte[] sequence; + + /** + * Create a new sequence vertex with sequence + * @param sequence a non-null, non-empty sequence of bases contained in this vertex + */ + public BaseVertex(final byte[] sequence) { + if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); + if ( sequence.length == 0 ) throw new IllegalArgumentException("Sequence cannot be empty"); + + // TODO -- should we really be cloning here? 
+ this.sequence = sequence.clone(); + } + + /** + * Get the length of this sequence + * @return a positive integer >= 1 + */ + public int length() { + return sequence.length; + } + + /** + * For testing purposes only -- low performance + * @param sequence + */ + protected BaseVertex(final String sequence) { + this(sequence.getBytes()); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + BaseVertex that = (BaseVertex) o; + + if (!Arrays.equals(sequence, that.sequence)) return false; + + return true; + } + + @Override + public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect + return Arrays.hashCode(sequence); + } + + @Override + public String toString() { + return getSequenceString(); + } + + /** + * Get the sequence of bases contained in this vertex + * + * Do not modify these bytes in any way! + * + * @return a non-null pointer to the bases contained in this vertex + */ + @Ensures("result != null") + public byte[] getSequence() { + // TODO -- why is this cloning? It's likely extremely expensive + return sequence.clone(); + } + + /** + * Get a string representation of the bases in this vertex + * @return a non-null String + */ + @Ensures("result != null") + public String getSequenceString() { + return new String(sequence); + } + + /** + * Get the sequence unique to this vertex + * + * This function may not return the entire sequence stored in the vertex, as kmer graphs + * really only provide 1 base of additional sequence (the last base of the kmer). + * + * The base implementation simply returns the sequence. 
+ * + * @param source is this vertex a source vertex (i.e., no in nodes) in the graph + * @return a byte[] of the sequence added by this vertex to the overall sequence + */ + public byte[] getAdditionalSequence(final boolean source) { + return getSequence(); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 0caebebee..9d84d611f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -65,8 +65,6 @@ import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; import java.io.PrintStream; import java.util.*; @@ -81,7 +79,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; - private static final byte MIN_QUALITY = (byte) 16; + public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; private static final int GRAPH_KMER_STEP = 6; // Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode @@ -91,22 +89,34 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0; private final boolean debug; - private final int onlyBuildKmerGraphOfThisSite = -1; // 35; private final boolean debugGraphTransformations; private final PrintStream graphWriter; - private final List graphs = new ArrayList(); private final int minKmer; private final int maxHaplotypesToConsider; + private final byte 
minBaseQualityToUseInAssembly; + + private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; private int PRUNE_FACTOR = 2; - public DeBruijnAssembler(final boolean debug, final boolean debugGraphTransformations, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) { + protected DeBruijnAssembler() { + this(false, -1, null, 11, 1000, DEFAULT_MIN_BASE_QUALITY_TO_USE); + } + + public DeBruijnAssembler(final boolean debug, + final int debugGraphTransformations, + final PrintStream graphWriter, + final int minKmer, + final int maxHaplotypesToConsider, + final byte minBaseQualityToUseInAssembly) { super(); this.debug = debug; - this.debugGraphTransformations = debugGraphTransformations; + this.debugGraphTransformations = debugGraphTransformations > 0; + this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations; this.graphWriter = graphWriter; this.minKmer = minKmer; this.maxHaplotypesToConsider = maxHaplotypesToConsider; + this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; } /** @@ -130,199 +140,73 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { this.PRUNE_FACTOR = PRUNE_FACTOR; // create the graphs - createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); + final List graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); // print the graphs if the appropriate debug option has been turned on if( graphWriter != null ) { - printGraphs(); + printGraphs(graphs); } // find the best paths in the graphs and return them as haplotypes - return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); + return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); } @Requires({"reads != null", "refHaplotype != null"}) - protected void createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { - 
graphs.clear(); + protected List createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { + final List graphs = new LinkedList(); final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1; - if( maxKmer < minKmer) { return; } // Reads are too small for assembly so don't try to create any assembly graphs - + if( maxKmer < minKmer) { + // Reads are too small for assembly so don't try to create any assembly graphs + return Collections.emptyList(); + } // create the graph for each possible kmer for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) { - if ( onlyBuildKmerGraphOfThisSite != -1 && kmer != onlyBuildKmerGraphOfThisSite ) + if ( debugGraphTransformations && kmer > onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) continue; if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); - DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); + DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR); graph = graph.errorCorrect(); if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR); - cleanNonRefPaths(graph); - mergeNodes(graph); - if ( debugGraphTransformations ) graph.printGraph(new File("merged.dot"), PRUNE_FACTOR); - pruneGraph(graph, PRUNE_FACTOR); - if ( debugGraphTransformations ) graph.printGraph(new File("pruned.dot"), PRUNE_FACTOR); - mergeNodes(graph); - if ( debugGraphTransformations ) graph.printGraph(new File("merged2.dot"), PRUNE_FACTOR); - if( graph.getReferenceSourceVertex() != null ) { // if the graph contains 
interesting variation from the reference - sanityCheckReferenceGraph(graph, refHaplotype); - graphs.add(graph); + graph.cleanNonRefPaths(); + + final SeqGraph seqGraph = toSeqGraph(graph); + + if( seqGraph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference + sanityCheckReferenceGraph(seqGraph, refHaplotype); + graphs.add(seqGraph); + + if ( debugGraphTransformations ) // we only want to use one graph size + break; } } + } + + return graphs; } - @Requires({"graph != null"}) - protected static void mergeNodes( final DeBruijnAssemblyGraph graph ) { - boolean foundNodesToMerge = true; - while( foundNodesToMerge ) { - foundNodesToMerge = false; - - for( final DeBruijnEdge e : graph.edgeSet() ) { - final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e); - final DeBruijnVertex incomingVertex = graph.getEdgeSource(e); - if( !outgoingVertex.equals(incomingVertex) && graph.outDegreeOf(incomingVertex) == 1 && graph.inDegreeOf(outgoingVertex) == 1 && - graph.inDegreeOf(incomingVertex) <= 1 && graph.outDegreeOf(outgoingVertex) <= 1 && graph.isReferenceNode(incomingVertex) == graph.isReferenceNode(outgoingVertex) ) { - final Set outEdges = graph.outgoingEdgesOf(outgoingVertex); - final Set inEdges = graph.incomingEdgesOf(incomingVertex); - if( inEdges.size() == 1 && outEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - } else if( inEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); - } else if( outEdges.size() == 1 ) { - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); - } - - final DeBruijnVertex addedVertex = new DeBruijnVertex( 
ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSuffix()), outgoingVertex.kmer ); - graph.addVertex(addedVertex); - for( final DeBruijnEdge edge : outEdges ) { - graph.addEdge(addedVertex, graph.getEdgeTarget(edge), new DeBruijnEdge(edge.isRef(), edge.getMultiplicity())); - } - for( final DeBruijnEdge edge : inEdges ) { - graph.addEdge(graph.getEdgeSource(edge), addedVertex, new DeBruijnEdge(edge.isRef(), edge.getMultiplicity())); - } - - graph.removeVertex( incomingVertex ); - graph.removeVertex( outgoingVertex ); - foundNodesToMerge = true; - break; - } - } - } + private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) { + final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR); + seqGraph.pruneGraph(PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); + seqGraph.mergeNodes(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.preclean.dot"), PRUNE_FACTOR); + seqGraph.removeVerticesNotConnectedToRef(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), PRUNE_FACTOR); + seqGraph.mergeBranchingNodes(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.simplified.dot"), PRUNE_FACTOR); + seqGraph.mergeNodes(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.6.simplified.merged.dot"), PRUNE_FACTOR); + return seqGraph; } - // - // X -> ABC -> Y - // -> aBC -> Y - // - // becomes - // - // X -> A -> BCY - // -> a -> BCY - // -// @Requires({"graph != null"}) -// protected static void simplifyMergedGraph(final DeBruijnAssemblyGraph graph) { -// boolean foundNodesToMerge = true; -// while( foundNodesToMerge ) { -// foundNodesToMerge = false; -// -// for( final DeBruijnVertex v : graph.vertexSet() ) { -// if ( 
isRootOfComplexDiamond(v) ) { -// foundNodesToMerge = simplifyComplexDiamond(graph, v); -// if ( foundNodesToMerge ) -// break; -// } -// } -// } -// } -// -// private static boolean simplifyComplexDiamond(final DeBruijnAssemblyGraph graph, final DeBruijnVertex root) { -// final Set outEdges = graph.outgoingEdgesOf(root); -// final DeBruijnVertex diamondBottom = graph.getEdge(graph.getEdgeTarget(outEdges.iterator().next()); -// // all of the edges point to the same sink, so it's time to merge -// final byte[] commonSuffix = commonSuffixOfEdgeTargets(outEdges, targetSink); -// if ( commonSuffix != null ) { -// final DeBruijnVertex suffixVertex = new DeBruijnVertex(commonSuffix, graph.getKmerSize()); -// graph.addVertex(suffixVertex); -// graph.addEdge(suffixVertex, targetSink); -// -// for( final DeBruijnEdge edge : outEdges ) { -// final DeBruijnVertex target = graph.getEdgeTarget(edge); -// final DeBruijnVertex prefix = target.withoutSuffix(commonSuffix); -// graph.addEdge(prefix, suffixVertex, new DeBruijnEdge(edge.isRef(), edge.getMultiplicity())); -// graph.removeVertex(graph.getEdgeTarget(edge)); -// graph.removeAllEdges(root, target); -// graph.removeAllEdges(target, targetSink); -// } -// -// graph.removeAllEdges(outEdges); -// graph.removeVertex(targetSink); -// -// return true; -// } else { -// return false; -// } -// } - - protected static void cleanNonRefPaths( final DeBruijnAssemblyGraph graph ) { - if( graph.getReferenceSourceVertex() == null || graph.getReferenceSinkVertex() == null ) { - return; - } - // Remove non-ref edges connected before and after the reference path - final Set edgesToCheck = new HashSet(); - edgesToCheck.addAll(graph.incomingEdgesOf(graph.getReferenceSourceVertex())); - while( !edgesToCheck.isEmpty() ) { - final DeBruijnEdge e = edgesToCheck.iterator().next(); - if( !e.isRef() ) { - edgesToCheck.addAll( graph.incomingEdgesOf(graph.getEdgeSource(e)) ); - graph.removeEdge(e); - } - edgesToCheck.remove(e); - } - 
edgesToCheck.addAll(graph.outgoingEdgesOf(graph.getReferenceSinkVertex())); - while( !edgesToCheck.isEmpty() ) { - final DeBruijnEdge e = edgesToCheck.iterator().next(); - if( !e.isRef() ) { - edgesToCheck.addAll( graph.outgoingEdgesOf(graph.getEdgeTarget(e)) ); - graph.removeEdge(e); - } - edgesToCheck.remove(e); - } - - // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new ArrayList(); - for( final DeBruijnVertex v : graph.vertexSet() ) { - if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) { - verticesToRemove.add(v); - } - } - graph.removeAllVertices(verticesToRemove); - } - - protected static void pruneGraph( final DeBruijnAssemblyGraph graph, final int pruneFactor ) { - final List edgesToRemove = new ArrayList(); - for( final DeBruijnEdge e : graph.edgeSet() ) { - if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor - edgesToRemove.add(e); - } - } - graph.removeAllEdges(edgesToRemove); - - // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new ArrayList(); - for( final DeBruijnVertex v : graph.vertexSet() ) { - if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) { - verticesToRemove.add(v); - } - } - graph.removeAllVertices(verticesToRemove); - } - - protected static void sanityCheckReferenceGraph(final DeBruijnAssemblyGraph graph, final Haplotype refHaplotype) { + protected void sanityCheckReferenceGraph(final BaseGraph graph, final Haplotype refHaplotype) { if( graph.getReferenceSourceVertex() == null ) { throw new IllegalStateException("All reference graphs must have a reference source vertex."); } @@ -338,9 +222,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) - protected static DeBruijnAssemblyGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final 
Haplotype refHaplotype, final boolean DEBUG ) { + protected DeBruijnGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(KMER_LENGTH); + final DeBruijnGraph graph = new DeBruijnGraph(KMER_LENGTH); // First pull kmers from the reference haplotype and add them to the graph //logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString()); @@ -370,7 +254,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // if the qualities of all the bases in the kmers are high enough boolean badKmer = false; for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) { - if( qualities[jjj] < MIN_QUALITY ) { + if( qualities[jjj] < minBaseQualityToUseInAssembly ) { badKmer = true; break; } @@ -397,11 +281,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return graph; } - protected void printGraphs() { + protected void printGraphs(final List graphs) { final int writeFirstGraphWithSizeSmallerThan = 50; graphWriter.println("digraph assemblyGraphs {"); - for( final DeBruijnAssemblyGraph graph : graphs ) { + for( final SeqGraph graph : graphs ) { if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); continue; @@ -418,7 +302,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"}) @Ensures({"result.contains(refHaplotype)"}) - private List findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { + private List findBestPaths( final List graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, 
final GenomeLoc activeRegionWindow ) { // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes // TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm @@ -440,8 +324,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - for( final DeBruijnAssemblyGraph graph : graphs ) { - for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + for( final SeqGraph graph : graphs ) { + for ( final KBestPaths.Path path : new KBestPaths().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { Haplotype h = new Haplotype( path.getBases() ); if( !returnHaplotypes.contains(h) ) { final Cigar cigar = path.calculateCigar(); @@ -466,6 +350,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { h.setScore(path.getScore()); returnHaplotypes.add(h); + if ( debug ) + logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize()); + // for GGA mode, add the desired allele into the haplotype if it isn't already present if( !activeAllelesToGenotype.isEmpty() ) { final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), refWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place @@ -599,7 +486,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { * @return the left-aligned cigar */ @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"}) - protected static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { + protected Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { final Cigar cigarToReturn = new Cigar(); Cigar cigarToAlign = new Cigar(); for (int i = 0; i 
< cigar.numCigarElements(); i++) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java new file mode 100644 index 000000000..d9df03539 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java @@ -0,0 +1,179 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +/** + * A DeBruijn kmer graph + * + * User: rpoplin + * Date: 2/6/13 + */ +public class DeBruijnGraph extends BaseGraph { + /** + * Create an empty DeBruijnGraph with default kmer size + */ + public DeBruijnGraph() { + super(); + } + + /** + * Create an empty DeBruijnGraph with kmer size + * @param kmerSize kmer size, must be >= 1 + */ + public DeBruijnGraph(int kmerSize) { + super(kmerSize); + } + + /** + * Pull kmers out of the given long sequence and add them to the graph + * @param sequence byte array holding the sequence with which to build the assembly graph + * @param KMER_LENGTH the desired kmer length to use + * @param isRef if true the kmers added to the graph will have reference edges linking them + */ + public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) { + if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); } + final int kmersInSequence = sequence.length - KMER_LENGTH + 1; + for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef, 1); + } + } + + /** + * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers + * @return a freshly allocated graph + */ + protected DeBruijnGraph errorCorrect() { + final KMerErrorCorrector corrector = new KMerErrorCorrector(getKmerSize(), 1, 1, 5); // TODO -- should be static variables + + for( final BaseEdge e : edgeSet() ) { + for ( final byte[] kmer : Arrays.asList(getEdgeSource(e).getSequence(), getEdgeTarget(e).getSequence())) { + // TODO -- need a cleaner way to deal with the ref weight + 
corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity()); + } + } + corrector.computeErrorCorrectionMap(); + + final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize()); + + for( final BaseEdge e : edgeSet() ) { + final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence()); + final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence()); + if ( source != null && target != null ) { + correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); + } + } + + return correctedGraph; + } + + /** + * Add edge to assembly graph connecting the two kmers + * @param kmer1 the source kmer for the edge + * @param kmer2 the target kmer for the edge + * @param isRef true if the added edge is a reference edge + * @return will return false if trying to add a reference edge which creates a cycle in the assembly graph + */ + public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) { + if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } + if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } + if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); } + + final int numVertexBefore = vertexSet().size(); + final DeBruijnVertex v1 = new DeBruijnVertex( kmer1 ); + addVertex(v1); + final DeBruijnVertex v2 = new DeBruijnVertex( kmer2 ); + addVertex(v2); + if( isRef && vertexSet().size() == numVertexBefore ) { return false; } + + final BaseEdge targetEdge = getEdge(v1, v2); + if ( targetEdge == null ) { + addEdge(v1, v2, new BaseEdge( isRef, multiplicity )); + } else { + if( isRef ) { + targetEdge.setIsRef( true ); + } + targetEdge.setMultiplicity(targetEdge.getMultiplicity() + multiplicity); + } + return true; + } + + /** + * Convert this kmer graph to a 
simple sequence graph. + * + * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer + * graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence + * + * @return a newly allocated SequenceGraph + */ + @Ensures({"result != null"}) + protected SeqGraph convertToSequenceGraph() { + final SeqGraph seqGraph = new SeqGraph(getKmerSize()); + final Map vertexMap = new HashMap(); + + // create all of the equivalent seq graph vertices + for ( final DeBruijnVertex dv : vertexSet() ) { + final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv))); + vertexMap.put(dv, sv); + seqGraph.addVertex(sv); + } + + // walk through the nodes and connect them to their equivalent seq vertices + for( final BaseEdge e : edgeSet() ) { + final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e)); + final SeqVertex seqInV = vertexMap.get(getEdgeSource(e)); + seqGraph.addEdge(seqInV, seqOutV, e); + } + + return seqGraph; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java index aa8e24576..47716b7c5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -52,59 +52,50 @@ import com.google.java.contract.Invariant; import java.util.Arrays; /** - * Created by IntelliJ IDEA. 
+ * simple node class for storing kmer sequences + * * User: ebanks * Date: Mar 23, 2011 */ -// simple node class for storing kmer sequences -@Invariant("kmer > 0") -public class DeBruijnVertex { - - protected final byte[] sequence; - public final int kmer; - - public DeBruijnVertex( final byte[] sequence, final int kmer ) { - this.sequence = sequence.clone(); - this.kmer = kmer; - } - - protected DeBruijnVertex( final String sequence, final int kmer ) { - this(sequence.getBytes(), kmer); +public class DeBruijnVertex extends BaseVertex { + public DeBruijnVertex( final byte[] sequence ) { + super(sequence); } + /** + * For testing purposes only + * @param sequence + */ protected DeBruijnVertex( final String sequence ) { - this(sequence.getBytes(), sequence.length()); + this(sequence.getBytes()); } + /** + * Get the kmer size for this DeBruijnVertex + * @return integer >= 1 + */ + @Ensures("result >= 1") public int getKmer() { - return kmer; + return sequence.length; } - @Override - public boolean equals( Object v ) { - return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence); - } - - @Override - public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect - return Arrays.hashCode(sequence); - } - - public String toString() { - return new String(sequence); - } - + /** + * Get the string representation of the suffix of this DeBruijnVertex + * @return a non-null non-empty string + */ + @Ensures({"result != null", "result.length() >= 1"}) public String getSuffixString() { return new String(getSuffix()); } @Ensures("result != null") - public byte[] getSequence() { - return sequence.clone(); + // TODO this could be replaced with byte as the suffix is guaranteed to be exactly 1 base + public byte[] getSuffix() { + return Arrays.copyOfRange( sequence, getKmer() - 1, sequence.length ); } - @Ensures("result != null") - public byte[] getSuffix() { - return 
Arrays.copyOfRange( sequence, kmer - 1, sequence.length ); + @Override + public byte[] getAdditionalSequence(boolean source) { + return source ? super.getAdditionalSequence(source) : getSuffix(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index d5f283475..7bec4bee5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -284,8 +284,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; - @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler", required = false) - protected boolean debugGraphTransformations = false; + @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) + protected int debugGraphTransformations = -1; + + @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) + protected boolean useLowQualityBasesForAssembly = false; // the UG engines private UnifiedGenotyperEngine UG_engine = null; @@ -389,7 +392,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, 
graphWriter, minKmer, maxHaplotypesToConsider ); + final byte minBaseQualityToUseInAssembly = useLowQualityBasesForAssembly ? (byte)1 : DeBruijnAssembler.DEFAULT_MIN_BASE_QUALITY_TO_USE; + assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, maxHaplotypesToConsider, minBaseQualityToUseInAssembly ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); @@ -610,7 +614,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for( final GATKSAMRecord myRead : finalizedReadList ) { final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { - GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + GATKSAMRecord clippedRead = useLowQualityBasesForAssembly ? 
postAdapterRead : ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java index e97fdb3cb..8c29cfa98 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java @@ -52,13 +52,8 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.Serializable; import java.util.*; @@ -70,28 +65,27 @@ import java.util.*; */ // Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. // This is different from most graph traversals because we want to test paths from any source node to any sink node. 
-public class KBestPaths { - +public class KBestPaths { // static access only - protected KBestPaths() { } + public KBestPaths() { } + private static int MAX_PATHS_TO_HOLD = 100; protected static class MyInt { public int val = 0; } // class to keep track of paths - protected static class Path { - + protected static class Path { // the last vertex seen in the path - private final DeBruijnVertex lastVertex; + private final T lastVertex; // the list of edges comprising the path - private final List edges; + private final List edges; // the scores for the path private final int totalScore; // the graph from which this path originated - private final DeBruijnAssemblyGraph graph; + private final BaseGraph graph; // used in the bubble state machine to apply Smith-Waterman to the bubble sequence // these values were chosen via optimization against the NA12878 knowledge base @@ -101,19 +95,19 @@ public class KBestPaths { private static final double SW_GAP_EXTEND = -1.1; private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); - public Path( final DeBruijnVertex initialVertex, final DeBruijnAssemblyGraph graph ) { + public Path( final T initialVertex, final BaseGraph graph ) { lastVertex = initialVertex; - edges = new ArrayList(0); + edges = new ArrayList(0); totalScore = 0; this.graph = graph; } - public Path( final Path p, final DeBruijnEdge edge ) { + public Path( final Path p, final BaseEdge edge ) { if( !p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); } graph = p.graph; lastVertex = p.graph.getEdgeTarget(edge); - edges = new ArrayList(p.edges); + edges = new ArrayList(p.edges); edges.add(edge); totalScore = p.totalScore + edge.getMultiplicity(); } @@ -123,10 +117,10 @@ public class KBestPaths { * @param edge the given edge to test * @return true if the edge is found in this path */ - public boolean containsEdge( final DeBruijnEdge edge ) { + public boolean 
containsEdge( final BaseEdge edge ) { if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } - for( final DeBruijnEdge e : edges ) { + for( final BaseEdge e : edges ) { if( e.equals(graph, edge) ) { return true; } @@ -140,11 +134,11 @@ public class KBestPaths { * @param edge the given edge to test * @return number of times this edge appears in the path */ - public int numInPath( final DeBruijnEdge edge ) { + public int numInPath( final BaseEdge edge ) { if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } int numInPath = 0; - for( final DeBruijnEdge e : edges ) { + for( final BaseEdge e : edges ) { if( e.equals(graph, edge) ) { numInPath++; } @@ -153,22 +147,11 @@ public class KBestPaths { return numInPath; } - /** - * Does this path contain a reference edge? - * @return true if the path contains a reference edge - */ - public boolean containsRefEdge() { - for( final DeBruijnEdge e : edges ) { - if( e.isRef() ) { return true; } - } - return false; - } - - public List getEdges() { return edges; } + public List getEdges() { return edges; } public int getScore() { return totalScore; } - public DeBruijnVertex getLastVertexInPath() { return lastVertex; } + public T getLastVertexInPath() { return lastVertex; } /** * The base sequence for this path. 
Pull the full sequence for source nodes and then the suffix for all subsequent nodes @@ -179,7 +162,7 @@ public class KBestPaths { if( edges.size() == 0 ) { return graph.getAdditionalSequence(lastVertex); } byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edges.get(0))); - for( final DeBruijnEdge e : edges ) { + for( final BaseEdge e : edges ) { bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); } return bases; @@ -201,9 +184,9 @@ public class KBestPaths { } // reset the bubble state machine - final BubbleStateMachine bsm = new BubbleStateMachine(cigar); + final BubbleStateMachine bsm = new BubbleStateMachine(cigar); - for( final DeBruijnEdge e : edges ) { + for( final BaseEdge e : edges ) { if( e.equals(graph, edges.get(0)) ) { advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null ); } @@ -231,7 +214,7 @@ public class KBestPaths { * @param e the edge which generated this node in the path */ @Requires({"bsm != null", "graph != null", "node != null"}) - private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final DeBruijnVertex node, final DeBruijnEdge e ) { + private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final T node, final BaseEdge e ) { if( graph.isReferenceNode( node ) ) { if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else if( e !=null && !e.isRef() ) { @@ -283,7 +266,7 @@ public class KBestPaths { */ @Requires({"graph != null"}) @Ensures({"result != null"}) - private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex ) { + private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) { final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? 
graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null); final Cigar returnCigar = new Cigar(); @@ -328,10 +311,10 @@ public class KBestPaths { } // class to keep track of the bubble state machine - protected static class BubbleStateMachine { + protected static class BubbleStateMachine { public boolean inBubble = false; public byte[] bubbleBytes = null; - public DeBruijnVertex lastSeenReferenceNode = null; + public T lastSeenReferenceNode = null; public Cigar cigar = null; public BubbleStateMachine( final Cigar initialCigar ) { @@ -358,14 +341,14 @@ public class KBestPaths { * @return a list with at most k top-scoring paths from the graph */ @Ensures({"result != null", "result.size() <= k"}) - public static List getKBestPaths( final DeBruijnAssemblyGraph graph, final int k ) { + public List getKBestPaths( final BaseGraph graph, final int k ) { if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); } if( k > MAX_PATHS_TO_HOLD/2 ) { throw new IllegalArgumentException("Asked for more paths than internal parameters allow for."); } final ArrayList bestPaths = new ArrayList(); // run a DFS for best paths - for( final DeBruijnVertex v : graph.vertexSet() ) { + for( final T v : graph.vertexSet() ) { if( graph.inDegreeOf(v) == 0 ) { findBestPaths(new Path(v, graph), bestPaths); } @@ -376,31 +359,28 @@ public class KBestPaths { return bestPaths.subList(0, Math.min(k, bestPaths.size())); } - private static void findBestPaths( final Path path, final List bestPaths ) { + private void findBestPaths( final Path path, final List bestPaths ) { findBestPaths(path, bestPaths, new MyInt()); } - private static void findBestPaths( final Path path, final List bestPaths, final MyInt n ) { + private void findBestPaths( final Path path, final List bestPaths, final MyInt n ) { // did we hit the end of a path? 
if ( allOutgoingEdgesHaveBeenVisited(path) ) { - if( path.containsRefEdge() ) { - if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) { - // clean out some low scoring paths - Collections.sort(bestPaths, new PathComparatorTotalScore() ); - for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20 - } - bestPaths.add(path); + if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) { + // clean out some low scoring paths + Collections.sort(bestPaths, new PathComparatorTotalScore() ); + for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20 } + bestPaths.add(path); } else if( n.val > 10000) { // do nothing, just return } else { // recursively run DFS - final ArrayList edgeArrayList = new ArrayList(); + final ArrayList edgeArrayList = new ArrayList(); edgeArrayList.addAll(path.graph.outgoingEdgesOf(path.lastVertex)); - Collections.sort(edgeArrayList, new DeBruijnEdge.EdgeWeightComparator()); - Collections.reverse(edgeArrayList); - for ( final DeBruijnEdge edge : edgeArrayList ) { + Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator()); + for ( final BaseEdge edge : edgeArrayList ) { // make sure the edge is not already in the path if ( path.containsEdge(edge) ) continue; @@ -416,8 +396,8 @@ public class KBestPaths { * @param path the path to test * @return true if all the outgoing edges at the end of this path have already been visited */ - private static boolean allOutgoingEdgesHaveBeenVisited( final Path path ) { - for( final DeBruijnEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) { + private boolean allOutgoingEdgesHaveBeenVisited( final Path path ) { + for( final BaseEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) { if( !path.containsEdge(edge) ) { // TODO -- investigate allowing numInPath < 2 to allow cycles return false; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java index 66ea8a078..05bd1b881 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java @@ -226,28 +226,16 @@ public class KMerErrorCorrector { @Override public String toString() { final StringBuilder b = new StringBuilder("KMerErrorCorrector{"); - for ( Map.Entry toCorrect : rawToErrorCorrectedMap.entrySet() ) { - final boolean correcting = ! toCorrect.getKey().equals(toCorrect.getValue()); - if ( correcting ) - b.append(String.format("%n\t%s / %d -> %s / %d [correcting? %b]", - toCorrect.getKey(), getCounts(toCorrect.getKey()), - toCorrect.getValue(), getCounts(toCorrect.getValue()), - correcting)); + if ( rawToErrorCorrectedMap == null ) { + b.append("counting ").append(countsByKMer.size()).append(" distinct kmers"); + } else { + for ( Map.Entry toCorrect : rawToErrorCorrectedMap.entrySet() ) { + final boolean correcting = ! toCorrect.getKey().equals(toCorrect.getValue()); + if ( correcting ) + b.append(String.format("%n\tCorrecting %s -> %s", toCorrect.getKey(), toCorrect.getValue())); + } } b.append("\n}"); return b.toString(); } - - /** - * Get a simple count estimate for printing for kmer - * @param kmer the kmer - * @return an integer count for kmer - */ - private int getCounts(final String kmer) { - if ( kmer == null ) return 0; - final Integer count = countsByKMer == null ? 
-1 : countsByKMer.get(kmer); - if ( count == null ) - throw new IllegalArgumentException("kmer not found in counts -- bug " + kmer); - return count; - } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java new file mode 100644 index 000000000..960f2cdd7 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java @@ -0,0 +1,280 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; + +import java.util.*; + +/** + * A graph that contains base sequence at each node + * + * @author: depristo + * @since 03/2013 + */ +public class SeqGraph extends BaseGraph { + /** + * Construct an empty SeqGraph + */ + public SeqGraph() { + super(); + } + + /** + * Construct an empty SeqGraph where we'll add nodes based on a kmer size of kmer + * + * The kmer size is purely informational. It is useful when converting a Debruijn graph -> SeqGraph + * for us to track the kmer used to make the transformation. + * + * @param kmer kmer + */ + public SeqGraph(final int kmer) { + super(kmer); + } + + protected void mergeNodes() { + zipLinearChains(); + } + + protected void zipLinearChains() { + boolean foundNodesToMerge = true; + while( foundNodesToMerge ) { + foundNodesToMerge = false; + + for( final BaseEdge e : edgeSet() ) { + final SeqVertex outgoingVertex = getEdgeTarget(e); + final SeqVertex incomingVertex = getEdgeSource(e); + if( !outgoingVertex.equals(incomingVertex) + && outDegreeOf(incomingVertex) == 1 && inDegreeOf(outgoingVertex) == 1 + && isReferenceNode(incomingVertex) == isReferenceNode(outgoingVertex) ) { + + final Set outEdges = outgoingEdgesOf(outgoingVertex); + final Set inEdges = incomingEdgesOf(incomingVertex); + if( inEdges.size() == 1 && outEdges.size() == 1 ) { + inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + } else if( inEdges.size() == 1 ) { + inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + } else if( outEdges.size() == 1 ) { + outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + (
e.getMultiplicity() - 1 ) ); + } + + final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) ); + addVertex(addedVertex); + for( final BaseEdge edge : outEdges ) { + addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge.isRef(), edge.getMultiplicity())); + } + for( final BaseEdge edge : inEdges ) { + addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge.isRef(), edge.getMultiplicity())); + } + + removeVertex(incomingVertex); + removeVertex(outgoingVertex); + foundNodesToMerge = true; + break; + } + } + } + } + + // + // X -> ABC -> Y + // -> aBC -> Y + // + // becomes + // + // X -> A -> BCY + // -> a -> BCY + // + public void mergeBranchingNodes() { + boolean foundNodesToMerge = true; + while( foundNodesToMerge ) { + foundNodesToMerge = false; + + for( final SeqVertex v : vertexSet() ) { + foundNodesToMerge = simplifyDiamond(v); + if ( foundNodesToMerge ) + break; + } + } + } + + /** + * A simple structure that looks like: + * + * v + * / | \ \ + * m1 m2 m3 ... mn + * \ | / / + * b + * + * @param v + * @return + */ + protected boolean isRootOfDiamond(final SeqVertex v) { + final Set ve = outgoingEdgesOf(v); + if ( ve.size() <= 1 ) + return false; + + SeqVertex bottom = null; + for ( final BaseEdge e : ve ) { + final SeqVertex mi = getEdgeTarget(e); + + // all nodes must have at least 1 connection + if ( outDegreeOf(mi) < 1 ) + return false; + + // can only have 1 incoming node, the root vertex + if ( inDegreeOf(mi) != 1 ) + return false; + + for ( final SeqVertex mt : outgoingVerticesOf(mi) ) { + if ( bottom == null ) + bottom = mt; + else if ( ! 
bottom.equals(mt) ) + return false; + } + } + + return true; + } + + private byte[] commonSuffixOfEdgeTargets(final Set middleVertices) { + final String[] kmers = new String[middleVertices.size()]; + + int i = 0; + for ( final SeqVertex v : middleVertices ) { + kmers[i++] = (StringUtils.reverse(v.getSequenceString())); + } + + final String commonPrefix = StringUtils.getCommonPrefix(kmers); + return commonPrefix.equals("") ? null : StringUtils.reverse(commonPrefix).getBytes(); + } + + private SeqVertex getDiamondBottom(final SeqVertex top) { + final BaseEdge topEdge = outgoingEdgesOf(top).iterator().next(); + final SeqVertex middle = getEdgeTarget(topEdge); + final BaseEdge middleEdge = outgoingEdgesOf(middle).iterator().next(); + return getEdgeTarget(middleEdge); + } + + final Set getMiddleVertices(final SeqVertex top) { + final Set middles = new HashSet(); + for ( final BaseEdge topToMiddle : outgoingEdgesOf(top) ) { + middles.add(getEdgeTarget(topToMiddle)); + } + return middles; + } + + private boolean simplifyDiamond(final SeqVertex top) { + if ( ! 
isRootOfDiamond(top) ) + return false; + + final SeqVertex diamondBottom = getDiamondBottom(top); + final Set middleVertices = getMiddleVertices(top); + + final List verticesToRemove = new LinkedList(); + final List edgesToRemove = new LinkedList(); + + // all of the edges point to the same sink, so it's time to merge + final byte[] commonSuffix = commonSuffixOfEdgeTargets(middleVertices); + if ( commonSuffix != null ) { + boolean newBottomEdgeIsRef = false; + int newBottomEdgeMultiplicity = 0; + + final SeqVertex newBottomV = new SeqVertex(commonSuffix); + addVertex(newBottomV); + + for ( final SeqVertex middle : middleVertices ) { + boolean missingNodeEdgeIsRef = false; + int missingNodeMultiplicity = 0; + final SeqVertex withoutSuffix = middle.withoutSuffix(commonSuffix); + + if ( withoutSuffix != null ) // this node is a deletion + addVertex(withoutSuffix); + + // update all edges from top -> middle to be top -> without suffix + for( final BaseEdge topToMiddleEdge : getAllEdges(top, middle) ) { + edgesToRemove.add(topToMiddleEdge); + missingNodeMultiplicity += topToMiddleEdge.getMultiplicity(); + missingNodeEdgeIsRef = missingNodeEdgeIsRef || topToMiddleEdge.isRef(); + if ( withoutSuffix != null ) // this node is a deletion + addEdge(top, withoutSuffix, new BaseEdge(topToMiddleEdge.isRef(), topToMiddleEdge.getMultiplicity())); + } + + // reattached prefix to the new bottom V by updating all edges from middleV -> bottom + for ( final BaseEdge middleToBottomE : getAllEdges(middle, diamondBottom) ) { + missingNodeMultiplicity += middleToBottomE.getMultiplicity(); + missingNodeEdgeIsRef = missingNodeEdgeIsRef || middleToBottomE.isRef(); + + if ( withoutSuffix != null ) // this node is a deletion + addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE.isRef(), middleToBottomE.getMultiplicity())); + edgesToRemove.add(middleToBottomE); + + // update the info for the new bottom edge + newBottomEdgeIsRef = newBottomEdgeIsRef || middleToBottomE.isRef(); + 
newBottomEdgeMultiplicity += middleToBottomE.getMultiplicity(); + } + + if ( withoutSuffix == null ) // add an edge from top to new bottom + addEdge(top, newBottomV, new BaseEdge(missingNodeEdgeIsRef, missingNodeMultiplicity)); + + verticesToRemove.add(middle); + } + + addEdge(newBottomV, diamondBottom, new BaseEdge(newBottomEdgeIsRef, newBottomEdgeMultiplicity)); + + removeAllEdges(edgesToRemove); + removeAllVertices(verticesToRemove); + + return true; + } else { + return false; + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java new file mode 100644 index 000000000..b45ac0c34 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java @@ -0,0 +1,153 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.Utils; + +import java.util.Arrays; + +/** + * A graph vertex containing a sequence of bases and a unique ID that + * allows multiple distinct nodes in the graph to have the same sequence. + * + * This is essential when thinking about representing the actual sequence of a haplotype + * in a graph. There can be many parts of the sequence that have the same sequence, but + * are distinct elements in the graph because they have a different position in the graph. For example: + * + * A -> C -> G -> A -> T + * + * The two As are not the same, because they occur with different connections. In a kmer graph equals() + * is based on the sequence itself, as each distinct kmer can only be represented once. But the transformation + * of the kmer graph into a graph of base sequences, without their kmer prefixes, means that nodes that + * were once unique including their prefix can become equal after shedding the prefix. So we need to + * use some mechanism -- here a unique ID per node -- to separate nodes that have the same sequence + * but are distinct elements of the graph.
+ * + * @author: depristo + * @since 03/2013 + */ +public class SeqVertex extends BaseVertex { + private static int idCounter = 0; + public final int id; + + /** + * Create a new SeqVertex with sequence and the next available id (taken from the static idCounter, which is incremented without synchronization) + * @param sequence our base sequence + */ + public SeqVertex(final byte[] sequence) { + super(sequence); + this.id = idCounter++; + } + + /** + * Create a new SeqVertex having bases of sequence.getBytes() and the next available id + * @param sequence the string representation of our bases + */ + public SeqVertex(final String sequence) { + super(sequence); + this.id = idCounter++; + } + + /** + * Create a copy of toCopy that keeps toCopy's id, so the copy equals() the original + * @param toCopy a SeqVertex to copy into this newly allocated one + */ + public SeqVertex(final SeqVertex toCopy) { + super(toCopy.sequence); + this.id = toCopy.id; + } + + /** + * Get the unique ID for this SeqVertex + * @return the id assigned at construction; ids are allocated from 0 upward + */ + public int getId() { + return id; + } + + @Override + public String toString() { + return "SeqVertex_id_" + id + "_seq_" + getSequenceString(); + } + + /** + * Two SeqVertex are equal only if their ids are equal + * @param o the object to compare with this vertex + * @return true if o is this vertex, or a non-null object of the same class with the same id + */ + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + SeqVertex seqVertex = (SeqVertex) o; + if (id != seqVertex.id) return false; + + // note that we don't test for super equality here because the ids are unique + //if (!super.equals(o)) return false; + + return true; + } + + @Override + public int hashCode() { + return id; + } + + /** + * Return a new SeqVertex derived from this one but not including the suffix bases + * + * @param suffix the suffix bases to remove from this vertex + * @return a newly allocated SeqVertex (with its own new id) holding the remaining prefix, or null if suffix removes all bases from this node + */ + @Requires("Utils.endsWith(sequence, suffix)") + public SeqVertex withoutSuffix(final byte[] suffix) { + final int prefixSize = sequence.length - suffix.length; +
return prefixSize > 0 ? new SeqVertex(Arrays.copyOf(sequence, prefixSize)) : null; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java new file mode 100644 index 000000000..3cc44c7de --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java @@ -0,0 +1,105 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class BaseEdgeUnitTest extends BaseTest { + @DataProvider(name = "EdgeCreationData") + public Object[][] makeMyDataProvider() { + List tests = new ArrayList(); + + // one test case per combination of multiplicity (1, 2, 3) and isRef (true, false); each case is {isRef, multiplicity} + for ( final int multiplicity : Arrays.asList(1, 2, 3) ) { + for ( final boolean isRef : Arrays.asList(true, false) ) { + tests.add(new Object[]{isRef, multiplicity}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "EdgeCreationData") + public void testBasic(final boolean isRef, final int mult) { + final BaseEdge e = new BaseEdge(isRef, mult); + Assert.assertEquals(e.isRef(), isRef); + Assert.assertEquals(e.getMultiplicity(), mult); + + e.setIsRef(!isRef); + Assert.assertEquals(e.isRef(), !isRef); + + e.setMultiplicity(mult + 1); + Assert.assertEquals(e.getMultiplicity(), mult + 1); + + final BaseEdge copy = new BaseEdge(e); + Assert.assertEquals(copy.isRef(), e.isRef()); + Assert.assertEquals(copy.getMultiplicity(), e.getMultiplicity()); + } + + @Test + public void testEdgeWeightComparator() { + final BaseEdge e10 = new BaseEdge(false, 10); + final BaseEdge e5 = new BaseEdge(true, 5); + final BaseEdge e2 = new BaseEdge(false, 2); + final BaseEdge e1 = new BaseEdge(false, 1); + + final List edges = new ArrayList(Arrays.asList(e1, e2, e5, e10)); + Collections.sort(edges, new BaseEdge.EdgeWeightComparator()); + Assert.assertEquals(edges.get(0), e10); + Assert.assertEquals(edges.get(1), e5); + Assert.assertEquals(edges.get(2), e2); + Assert.assertEquals(edges.get(3), e1); + } +} diff --git
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java new file mode 100644 index 000000000..463e861b1 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java @@ -0,0 +1,192 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; +import scala.actors.threadpool.Arrays; + +import java.io.File; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +/** + * Created with IntelliJ IDEA. 
+ * User: depristo + * Date: 3/15/13 + * Time: 3:36 PM + * Unit tests covering incoming/outgoing vertex queries, printGraph and pruneGraph on SeqGraph and DeBruijnGraph. + */ +public class BaseGraphUnitTest extends BaseTest { + SeqGraph graph; + SeqVertex v1, v2, v3, v4, v5; + + @BeforeMethod + public void setUp() throws Exception { + graph = new SeqGraph(); + + v1 = new SeqVertex("A"); + v2 = new SeqVertex("C"); + v3 = new SeqVertex("C"); + v4 = new SeqVertex("C"); + v5 = new SeqVertex("C"); + + graph.addVertices(v1, v2, v3, v4, v5); + graph.addEdge(v1, v2); + graph.addEdge(v2, v4); + graph.addEdge(v3, v2); + graph.addEdge(v2, v3); + graph.addEdge(v4, v5); + } + + @Test + public void testIncomingAndOutgoingVertices() throws Exception { + assertVertexSetEquals(graph.outgoingVerticesOf(v1), v2); + assertVertexSetEquals(graph.incomingVerticesOf(v1)); + + assertVertexSetEquals(graph.outgoingVerticesOf(v2), v3, v4); + assertVertexSetEquals(graph.incomingVerticesOf(v2), v1, v3); + + assertVertexSetEquals(graph.outgoingVerticesOf(v3), v2); + assertVertexSetEquals(graph.incomingVerticesOf(v3), v2); + + assertVertexSetEquals(graph.outgoingVerticesOf(v4), v5); + assertVertexSetEquals(graph.incomingVerticesOf(v4), v2); + + assertVertexSetEquals(graph.outgoingVerticesOf(v5)); + assertVertexSetEquals(graph.incomingVerticesOf(v5), v4); + } + + @Test + public void testPrintEmptyGraph() throws Exception { + final File tmp = File.createTempFile("tmp", "dot"); + tmp.deleteOnExit(); + new SeqGraph().printGraph(tmp, 10); + new DeBruijnGraph().printGraph(tmp, 10); + } + + @Test + public void testComplexGraph() throws Exception { + final File tmp = File.createTempFile("tmp", "dot"); + tmp.deleteOnExit(); + graph.printGraph(tmp, 10); + } + + private void assertVertexSetEquals(final Set actual, final SeqVertex ... expected) { + final Set expectedSet = expected == null ?
Collections.emptySet() : new HashSet(Arrays.asList(expected)); + Assert.assertEquals(actual, expectedSet); + } + + @Test(enabled = true) + public void testPruneGraph() { + DeBruijnGraph graph = new DeBruijnGraph(); + DeBruijnGraph expectedGraph = new DeBruijnGraph(); + + DeBruijnVertex v = new DeBruijnVertex("ATGG"); + DeBruijnVertex v2 = new DeBruijnVertex("ATGGA"); + DeBruijnVertex v3 = new DeBruijnVertex("ATGGT"); + DeBruijnVertex v4 = new DeBruijnVertex("ATGGG"); + DeBruijnVertex v5 = new DeBruijnVertex("ATGGC"); + DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC"); + + graph.addVertex(v); + graph.addVertex(v2); + graph.addVertex(v3); + graph.addVertex(v4); + graph.addVertex(v5); + graph.addVertex(v6); + graph.addEdge(v, v2, new BaseEdge(false, 1)); + graph.addEdge(v2, v3, new BaseEdge(false, 3)); + graph.addEdge(v3, v4, new BaseEdge(false, 5)); + graph.addEdge(v4, v5, new BaseEdge(false, 3)); + graph.addEdge(v5, v6, new BaseEdge(false, 2)); + + expectedGraph.addVertex(v2); + expectedGraph.addVertex(v3); + expectedGraph.addVertex(v4); + expectedGraph.addVertex(v5); + expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3)); + expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5)); + expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3)); + + graph.pruneGraph(2); + + Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph)); + + graph = new DeBruijnGraph(); + expectedGraph = new DeBruijnGraph(); + + graph.addVertex(v); + graph.addVertex(v2); + graph.addVertex(v3); + graph.addVertex(v4); + graph.addVertex(v5); + graph.addVertex(v6); + graph.addEdge(v, v2, new BaseEdge(true, 1)); + graph.addEdge(v2, v3, new BaseEdge(false, 3)); + graph.addEdge(v3, v4, new BaseEdge(false, 5)); + graph.addEdge(v4, v5, new BaseEdge(false, 3)); + + expectedGraph.addVertex(v); + expectedGraph.addVertex(v2); + expectedGraph.addVertex(v3); + expectedGraph.addVertex(v4); + expectedGraph.addVertex(v5); + expectedGraph.addEdge(v, v2, new BaseEdge(true, 1)); + expectedGraph.addEdge(v2, v3,
new BaseEdge(false, 3)); + expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5)); + expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3)); + + graph.pruneGraph(2); + + Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph)); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java new file mode 100644 index 000000000..cd27c7183 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java @@ -0,0 +1,91 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant.
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class BaseVertexUnitTest extends BaseTest { + @Test + public void testBasic() { + final byte[] bases = "ACT".getBytes(); + final BaseVertex v = new BaseVertex(bases); + Assert.assertEquals(v.getSequence(), bases); + Assert.assertEquals(v.getAdditionalSequence(false), bases); + Assert.assertEquals(v.getAdditionalSequence(true), bases); + Assert.assertEquals(v.getSequenceString(), new String(bases)); + Assert.assertEquals(v.toString(), v.getSequenceString()); + Assert.assertEquals(v.length(), bases.length); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testCreationNull() { + new BaseVertex((byte[])null); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testCreationEmptySeq() { + new BaseVertex(new byte[0]); + } + + @Test + public void testEqualsAndHashCode() { + final BaseVertex v1 = new BaseVertex("ACT".getBytes()); + final BaseVertex v1_eq = new BaseVertex("ACT".getBytes()); + final BaseVertex v2 = new BaseVertex("ACG".getBytes()); + + Assert.assertEquals(v1, v1); + Assert.assertEquals(v1.hashCode(), v1.hashCode()); + Assert.assertEquals(v1, v1_eq); + Assert.assertEquals(v1.hashCode(), v1_eq.hashCode()); + Assert.assertFalse(v1.equals(v2)); + Assert.assertFalse(v2.equals(v1)); + Assert.assertFalse(v2.hashCode() == v1.hashCode()); + Assert.assertFalse(v2.equals(v1)); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index 2096b487e..fa581f7fd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -69,211 +69,12 @@ import java.util.*; public class DeBruijnAssemblerUnitTest extends BaseTest { private final static boolean DEBUG = true; - - private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { - public byte[] sequence; - public int KMER_LENGTH; - - public MergeNodesWithNoVariationTestProvider(String seq, int kmer) { - super(MergeNodesWithNoVariationTestProvider.class, String.format("Merge nodes with no variation test. kmer = %d, seq = %s", kmer, seq)); - sequence = seq.getBytes(); - KMER_LENGTH = kmer; - } - - public DeBruijnAssemblyGraph expectedGraph() { - DeBruijnVertex v = new DeBruijnVertex(sequence, KMER_LENGTH); - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); - graph.addVertex(v); - return graph; - } - - public DeBruijnAssemblyGraph calcGraph() { - - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); - final int kmersInSequence = sequence.length - KMER_LENGTH + 1; - for (int i = 0; i < kmersInSequence - 1; i++) { - // get the kmers - final byte[] kmer1 = new byte[KMER_LENGTH]; - System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH); - final byte[] kmer2 = new byte[KMER_LENGTH]; - System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH); - - graph.addKmersToGraph(kmer1, kmer2, false, 1); - } - DeBruijnAssembler.mergeNodes(graph); - return graph; - } - } - - @DataProvider(name = "MergeNodesWithNoVariationTestProvider") - public Object[][] makeMergeNodesWithNoVariationTests() { - new MergeNodesWithNoVariationTestProvider("GGTTAACC", 3); - new MergeNodesWithNoVariationTestProvider("GGTTAACC", 4); - new MergeNodesWithNoVariationTestProvider("GGTTAACC", 5); - new MergeNodesWithNoVariationTestProvider("GGTTAACC", 6); - new MergeNodesWithNoVariationTestProvider("GGTTAACC", 7); - new MergeNodesWithNoVariationTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6); - new 
MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66); - new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76); - - return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class); - } - - @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = !DEBUG) - public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) { - logger.warn(String.format("Test: %s", cfg.toString())); - Assert.assertTrue(graphEquals(cfg.calcGraph(), cfg.expectedGraph())); - } - -// @DataProvider(name = "SimpleMergeOperationsData") -// public Object[][] makeSimpleMergeOperationsData() { -// List tests = new ArrayList(); -// -// { -// DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); -// DeBruijnVertex v1 = new DeBruijnVertex("AT"); -// DeBruijnVertex v2 = new DeBruijnVertex("TC"); -// DeBruijnVertex v3 = new DeBruijnVertex("CT"); -// DeBruijnVertex v4 = new DeBruijnVertex("TG"); -// DeBruijnVertex v5 = new DeBruijnVertex("AG"); -// DeBruijnVertex v6 = new DeBruijnVertex("GG"); -// DeBruijnVertex v7 = new 
DeBruijnVertex("GA"); -// DeBruijnVertex v8 = new DeBruijnVertex("AA"); -// -// graph.addVertices(v1, v2, v3, v4, v5, v6, v7, v8); -// graph.addEdge(v1, v2, new DeBruijnEdge(false, 2)); -// graph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); -// graph.addEdge(v2, v4, new DeBruijnEdge(false, 5)); -// graph.addEdge(v3, v5, new DeBruijnEdge(false, 3)); -// graph.addEdge(v4, v6, new DeBruijnEdge(false, 3)); -// graph.addEdge(v5, v7, new DeBruijnEdge(false, 2)); -// graph.addEdge(v6, v7, new DeBruijnEdge(false, 6)); -// graph.addEdge(v7, v8, new DeBruijnEdge(false, 2)); -// -// graph.printGraph(new File("unittest.dot"), 1); -// -// DeBruijnAssemblyGraph expected = new DeBruijnAssemblyGraph(); -// DeBruijnVertex e1 = new DeBruijnVertex("ATC"); -// DeBruijnVertex e2 = new DeBruijnVertex("T"); -// DeBruijnVertex e3 = new DeBruijnVertex("G"); -// DeBruijnVertex e4 = new DeBruijnVertex("GAA"); -// -// expected.addVertices(e1,e2,e3,e4); -// expected.addEdge(e1, e2, new DeBruijnEdge(false, 3)); -// expected.addEdge(e1, e3, new DeBruijnEdge(false, 5)); -// expected.addEdge(e2, e4, new DeBruijnEdge(false, 2)); -// expected.addEdge(e3, e4, new DeBruijnEdge(false, 6)); -// -// expected.printGraph(new File("expected.dot"), 1); -// -// tests.add(new Object[]{graph.clone(), expected}); -// } -// -// return tests.toArray(new Object[][]{}); -// } -// -// @Test(dataProvider = "SimpleMergeOperationsData", enabled = true) -// public void testSimpleMergeOperations(final DeBruijnAssemblyGraph unmergedGraph, final DeBruijnAssemblyGraph expectedGraph) throws Exception { -// final DeBruijnAssemblyGraph mergedGraph = (DeBruijnAssemblyGraph)unmergedGraph.clone(); -// DeBruijnAssembler.mergeNodes(mergedGraph); -// mergedGraph.printGraph(new File("merged.dot"), 1); -// DeBruijnAssembler.simplifyMergedGraph(mergedGraph); -// mergedGraph.printGraph(new File("reduced.dot"), 1); -// Assert.assertTrue(graphEquals(mergedGraph, expectedGraph)); -// } - - @Test(enabled = !DEBUG) - public void 
testPruneGraph() { - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); - DeBruijnAssemblyGraph expectedGraph = new DeBruijnAssemblyGraph(); - - DeBruijnVertex v = new DeBruijnVertex("ATGG".getBytes(), 1); - DeBruijnVertex v2 = new DeBruijnVertex("ATGGA".getBytes(), 1); - DeBruijnVertex v3 = new DeBruijnVertex("ATGGT".getBytes(), 1); - DeBruijnVertex v4 = new DeBruijnVertex("ATGGG".getBytes(), 1); - DeBruijnVertex v5 = new DeBruijnVertex("ATGGC".getBytes(), 1); - DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC".getBytes(), 1); - - graph.addVertex(v); - graph.addVertex(v2); - graph.addVertex(v3); - graph.addVertex(v4); - graph.addVertex(v5); - graph.addVertex(v6); - graph.addEdge(v, v2, new DeBruijnEdge(false, 1)); - graph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); - graph.addEdge(v3, v4, new DeBruijnEdge(false, 5)); - graph.addEdge(v4, v5, new DeBruijnEdge(false, 3)); - graph.addEdge(v5, v6, new DeBruijnEdge(false, 2)); - - expectedGraph.addVertex(v2); - expectedGraph.addVertex(v3); - expectedGraph.addVertex(v4); - expectedGraph.addVertex(v5); - expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); - expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5)); - expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3)); - - DeBruijnAssembler.pruneGraph(graph, 2); - - Assert.assertTrue(graphEquals(graph, expectedGraph)); - - graph = new DeBruijnAssemblyGraph(); - expectedGraph = new DeBruijnAssemblyGraph(); - - graph.addVertex(v); - graph.addVertex(v2); - graph.addVertex(v3); - graph.addVertex(v4); - graph.addVertex(v5); - graph.addVertex(v6); - graph.addEdge(v, v2, new DeBruijnEdge(true, 1)); - graph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); - graph.addEdge(v3, v4, new DeBruijnEdge(false, 5)); - graph.addEdge(v4, v5, new DeBruijnEdge(false, 3)); - - expectedGraph.addVertex(v); - expectedGraph.addVertex(v2); - expectedGraph.addVertex(v3); - expectedGraph.addVertex(v4); - expectedGraph.addVertex(v5); - expectedGraph.addEdge(v, v2, new 
DeBruijnEdge(true, 1)); - expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); - expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5)); - expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3)); - - DeBruijnAssembler.pruneGraph(graph, 2); - - Assert.assertTrue(graphEquals(graph, expectedGraph)); - } - - private boolean graphEquals(DeBruijnAssemblyGraph g1, DeBruijnAssemblyGraph g2) { - if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) { - return false; - } - for( DeBruijnEdge e1 : g1.edgeSet() ) { - boolean found = false; - for( DeBruijnEdge e2 : g2.edgeSet() ) { - if( e1.equals(g1, e2, g2) ) { found = true; break; } - } - if( !found ) { return false; } - } - for( DeBruijnEdge e2 : g2.edgeSet() ) { - boolean found = false; - for( DeBruijnEdge e1 : g1.edgeSet() ) { - if( e2.equals(g2, e1, g1) ) { found = true; break; } - } - if( !found ) { return false; } - } - return true; - } - @Test(enabled = !DEBUG) public void testReferenceCycleGraph() { String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC"; String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC"; - final DeBruijnAssemblyGraph g1 = DeBruijnAssembler.createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), false); - final DeBruijnAssemblyGraph g2 = DeBruijnAssembler.createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), false); + final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), false); + final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), false); Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation."); Assert.assertTrue(g2 != null, "Reference non-cycle 
graph should not return null during creation."); @@ -313,7 +114,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString; String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString; - Cigar calculatedCigar = DeBruijnAssembler.leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); + Cigar calculatedCigar = new DeBruijnAssembler().leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java index 5a1497236..2b87cf61d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java @@ -75,7 +75,7 @@ public class DeBruijnAssemblyGraphUnitTest { } public byte[] calculatedReferenceBytes() { - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); + DeBruijnGraph graph = new DeBruijnGraph(); graph.addSequenceToGraph(refSequence, KMER_LENGTH, true); if( altSequence.length > 0 ) { graph.addSequenceToGraph(altSequence, KMER_LENGTH, false); diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java new file mode 100644 index 000000000..2db35e173 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java @@ -0,0 +1,69 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.annotations.Test; +import org.testng.Assert; + +public class DeBruijnVertexUnitTest extends BaseTest { + @Test + public void testBasic() { + final byte[] bases = "ACT".getBytes(); + final DeBruijnVertex v = new DeBruijnVertex(bases); + Assert.assertEquals(v.getSequence(), bases); + Assert.assertEquals(v.getSequenceString(), new String(bases)); + Assert.assertEquals(v.length(), bases.length); + Assert.assertEquals(v.getSuffix().length, 1); + Assert.assertEquals(v.getSuffix()[0], (byte)'T'); + Assert.assertEquals(v.getSuffixString(), "T"); + + Assert.assertEquals(v.getAdditionalSequence(true), bases); + Assert.assertEquals(v.getAdditionalSequence(false).length, 1); + Assert.assertEquals(v.getAdditionalSequence(false)[0], (byte)'T'); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java index 53400b790..10863cef9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java @@ -49,14 +49,13 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.jgrapht.graph.DefaultDirectedGraph; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -79,58 +78,105 @@ public class KBestPathsUnitTest { return 
tests.toArray(new Object[][]{}); } - @Test(dataProvider = "BasicBubbleDataProvider") + @Test(dataProvider = "BasicBubbleDataProvider", enabled = true) public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { // Construct the assembly graph - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); - final int KMER_LENGTH = 3; + SeqGraph graph = new SeqGraph(3); final String preRef = "ATGG"; - final String postRef = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "GGGGC"; + final String postRef = "GGGGC"; - DeBruijnVertex v = new DeBruijnVertex(preRef.getBytes(), KMER_LENGTH); - DeBruijnVertex v2Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'A', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH); - DeBruijnVertex v2Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'A', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH); - DeBruijnVertex v3 = new DeBruijnVertex(postRef.getBytes(), KMER_LENGTH); + SeqVertex v = new SeqVertex(preRef); + SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); + SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T"); + SeqVertex v3 = new SeqVertex(postRef); graph.addVertex(v); graph.addVertex(v2Ref); graph.addVertex(v2Alt); graph.addVertex(v3); - graph.addEdge(v, v2Ref, new DeBruijnEdge(true, 10)); - graph.addEdge(v2Ref, v3, new DeBruijnEdge(true, 10)); - graph.addEdge(v, v2Alt, new DeBruijnEdge(false, 5)); - graph.addEdge(v2Alt, v3, new DeBruijnEdge(false, 5)); + graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); + graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); + graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); + graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); + + graph.printGraph(new File("test.dot"), 10); // Construct the test path - KBestPaths.Path path = new KBestPaths.Path(v, graph); - path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); + 
KBestPaths.Path path = new KBestPaths.Path(v, graph); + path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); + path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); // Construct the actual cigar string implied by the test path Cigar expectedCigar = new Cigar(); expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); if( refBubbleLength > altBubbleLength ) { expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); - expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M)); + expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); } else if ( refBubbleLength < altBubbleLength ) { - expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M)); + expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); } else { expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); } - expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); + expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); } + // TODO -- test block substitution because it doesn't look like it's correct now +// @Test(dataProvider = "BasicBubbleDataProvider") +// public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { +// // Construct the assembly graph +// final int KMER_LENGTH = 3; +// SeqGraph graph = new SeqGraph(KMER_LENGTH); +// final String preRef = "ATGG"; +// final String postRef = "GGGGC"; +// +// SeqVertex v = new SeqVertex(preRef); +// SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); +// SeqVertex v2Alt = new SeqVertex(Utils.dupString('T', altBubbleLength)); +// SeqVertex v3 = new SeqVertex(postRef); +// +// graph.addVertex(v); +// 
graph.addVertex(v2Ref); +// graph.addVertex(v2Alt); +// graph.addVertex(v3); +// graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); +// graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); +// graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); +// graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); +// +// graph.printGraph(new File("test.dot"), 10); +// +// // Construct the test path +// KBestPaths.Path path = new KBestPaths.Path(v, graph); +// path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); +// path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); +// +// // Construct the actual cigar string implied by the test path +// Cigar expectedCigar = new Cigar(); +// expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); +// if( refBubbleLength > altBubbleLength ) { +// expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); +// expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); +// } else if ( refBubbleLength < altBubbleLength ) { +// expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); +// expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); +// } else { +// expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); +// } +// expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); +// +// Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); +// } @DataProvider(name = "TripleBubbleDataProvider") public Object[][] makeTripleBubbleDataProvider() { List tests = new ArrayList(); for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) { for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) { - for ( final boolean offRefBeginning : Arrays.asList(false) ) { - for ( final boolean offRefEnding : Arrays.asList(true, false) ) { + for ( final boolean offRefEnding : Arrays.asList(true, false) ) { + for ( final 
boolean offRefBeginning : Arrays.asList(false) ) { tests.add(new Object[]{refBubbleLength, altBubbleLength, offRefBeginning, offRefEnding}); } } @@ -139,30 +185,29 @@ public class KBestPathsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "TripleBubbleDataProvider") + @Test(dataProvider = "TripleBubbleDataProvider", enabled = true) public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) { // Construct the assembly graph - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); - final int KMER_LENGTH = 3; + SeqGraph graph = new SeqGraph(); final String preAltOption = "ATCGATCGATCGATCGATCG"; final String postAltOption = "CCCC"; final String preRef = "ATGG"; - final String postRef = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "GGCCG"; - final String midRef1 = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "TTCCT"; - final String midRef2 = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "CCCAAAAAAAAAAAA"; + final String postRef = "GGCCG"; + final String midRef1 = "TTCCT"; + final String midRef2 = "CCCAAAAAAAAAAAA"; - DeBruijnVertex preV = new DeBruijnVertex(preAltOption.getBytes(), KMER_LENGTH); - DeBruijnVertex v = new DeBruijnVertex(preRef.getBytes(), KMER_LENGTH); - DeBruijnVertex v2Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'A', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH); - DeBruijnVertex v2Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'A', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH); - DeBruijnVertex v4Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'C', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH); - DeBruijnVertex v4Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'C', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH); - DeBruijnVertex v6Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'G', 
refBubbleLength+KMER_LENGTH-1), KMER_LENGTH); - DeBruijnVertex v6Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'G', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH); - DeBruijnVertex v3 = new DeBruijnVertex(midRef1.getBytes(), KMER_LENGTH); - DeBruijnVertex v5 = new DeBruijnVertex(midRef2.getBytes(), KMER_LENGTH); - DeBruijnVertex v7 = new DeBruijnVertex(postRef.getBytes(), KMER_LENGTH); - DeBruijnVertex postV = new DeBruijnVertex(postAltOption.getBytes(), KMER_LENGTH); + SeqVertex preV = new SeqVertex(preAltOption); + SeqVertex v = new SeqVertex(preRef); + SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); + SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T"); + SeqVertex v4Ref = new SeqVertex(Utils.dupString('C', refBubbleLength)); + SeqVertex v4Alt = new SeqVertex(Utils.dupString('C', altBubbleLength-1) + "T"); + SeqVertex v6Ref = new SeqVertex(Utils.dupString('G', refBubbleLength)); + SeqVertex v6Alt = new SeqVertex(Utils.dupString('G', altBubbleLength-1) + "T"); + SeqVertex v3 = new SeqVertex(midRef1); + SeqVertex v5 = new SeqVertex(midRef2); + SeqVertex v7 = new SeqVertex(postRef); + SeqVertex postV = new SeqVertex(postAltOption); graph.addVertex(preV); graph.addVertex(v); @@ -176,34 +221,36 @@ public class KBestPathsUnitTest { graph.addVertex(v6Alt); graph.addVertex(v7); graph.addVertex(postV); - graph.addEdge(preV, v, new DeBruijnEdge(false, 1)); - graph.addEdge(v, v2Ref, new DeBruijnEdge(true, 10)); - graph.addEdge(v2Ref, v3, new DeBruijnEdge(true, 10)); - graph.addEdge(v, v2Alt, new DeBruijnEdge(false, 5)); - graph.addEdge(v2Alt, v3, new DeBruijnEdge(false, 5)); - graph.addEdge(v3, v4Ref, new DeBruijnEdge(true, 10)); - graph.addEdge(v4Ref, v5, new DeBruijnEdge(true, 10)); - graph.addEdge(v3, v4Alt, new DeBruijnEdge(false, 5)); - graph.addEdge(v4Alt, v5, new DeBruijnEdge(false, 5)); - graph.addEdge(v5, v6Ref, new DeBruijnEdge(true, 11)); - 
graph.addEdge(v6Ref, v7, new DeBruijnEdge(true, 11)); - graph.addEdge(v5, v6Alt, new DeBruijnEdge(false, 55)); - graph.addEdge(v6Alt, v7, new DeBruijnEdge(false, 55)); - graph.addEdge(v7, postV, new DeBruijnEdge(false, 1)); + graph.addEdge(preV, v, new BaseEdge(false, 1)); + graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); + graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); + graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); + graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); + graph.addEdge(v3, v4Ref, new BaseEdge(true, 10)); + graph.addEdge(v4Ref, v5, new BaseEdge(true, 10)); + graph.addEdge(v3, v4Alt, new BaseEdge(false, 5)); + graph.addEdge(v4Alt, v5, new BaseEdge(false, 5)); + graph.addEdge(v5, v6Ref, new BaseEdge(true, 11)); + graph.addEdge(v6Ref, v7, new BaseEdge(true, 11)); + graph.addEdge(v5, v6Alt, new BaseEdge(false, 55)); + graph.addEdge(v6Alt, v7, new BaseEdge(false, 55)); + graph.addEdge(v7, postV, new BaseEdge(false, 1)); + + graph.printGraph(new File("test.debruijn.dot"), 10); // Construct the test path - KBestPaths.Path path = new KBestPaths.Path( (offRefBeginning ? preV : v), graph); + KBestPaths.Path path = new KBestPaths.Path( (offRefBeginning ? 
preV : v), graph); if( offRefBeginning ) { - path = new KBestPaths.Path(path, graph.getEdge(preV, v)); + path = new KBestPaths.Path(path, graph.getEdge(preV, v)); } - path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); - path = new KBestPaths.Path(path, graph.getEdge(v3, v4Ref)); - path = new KBestPaths.Path(path, graph.getEdge(v4Ref, v5)); - path = new KBestPaths.Path(path, graph.getEdge(v5, v6Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v6Alt, v7)); + path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); + path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); + path = new KBestPaths.Path(path, graph.getEdge(v3, v4Ref)); + path = new KBestPaths.Path(path, graph.getEdge(v4Ref, v5)); + path = new KBestPaths.Path(path, graph.getEdge(v5, v6Alt)); + path = new KBestPaths.Path(path, graph.getEdge(v6Alt, v7)); if( offRefEnding ) { - path = new KBestPaths.Path(path, graph.getEdge(v7,postV)); + path = new KBestPaths.Path(path, graph.getEdge(v7,postV)); } // Construct the actual cigar string implied by the test path @@ -211,7 +258,7 @@ public class KBestPathsUnitTest { if( offRefBeginning ) { expectedCigar.add(new CigarElement(preAltOption.length(), CigarOperator.I)); } - expectedCigar.add(new CigarElement(preRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); + expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); // first bubble if( refBubbleLength > altBubbleLength ) { expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); @@ -222,10 +269,10 @@ public class KBestPathsUnitTest { } else { expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); } - expectedCigar.add(new CigarElement(midRef1.length() - (KMER_LENGTH - 1), CigarOperator.M)); + expectedCigar.add(new CigarElement(midRef1.length(), CigarOperator.M)); // second bubble is ref path expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); - 
expectedCigar.add(new CigarElement(midRef2.length() - (KMER_LENGTH - 1), CigarOperator.M)); + expectedCigar.add(new CigarElement(midRef2.length(), CigarOperator.M)); // third bubble if( refBubbleLength > altBubbleLength ) { expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); @@ -236,9 +283,9 @@ public class KBestPathsUnitTest { } else { expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); } - expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); + expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); if( offRefEnding ) { - expectedCigar.add(new CigarElement(postAltOption.length() - (KMER_LENGTH - 1), CigarOperator.I)); + expectedCigar.add(new CigarElement(postAltOption.length(), CigarOperator.I)); } Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java index f88d7ee7f..a4edfcacc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java @@ -55,6 +55,8 @@ public class KMerErrorCorrectorUnitTest extends BaseTest { public void testMyData() { final KMerErrorCorrector corrector = new KMerErrorCorrector(3, 1, 2, 2); + Assert.assertNotNull(corrector.toString()); + corrector.addKmers( "ATG", "ATG", "ATG", "ATG", "ACC", "ACC", "ACC", @@ -66,13 +68,20 @@ public class KMerErrorCorrectorUnitTest extends BaseTest { "NNC" // => ACC [because of min count won't go to NNA] ); - Assert.assertEquals(corrector.getErrorCorrectedKmer("ATG"), "ATG"); - 
Assert.assertEquals(corrector.getErrorCorrectedKmer("ACC"), "ACC"); - Assert.assertEquals(corrector.getErrorCorrectedKmer("AAA"), "AAA"); - Assert.assertEquals(corrector.getErrorCorrectedKmer("CTG"), "ATG"); - Assert.assertEquals(corrector.getErrorCorrectedKmer("NNA"), "AAA"); - Assert.assertEquals(corrector.getErrorCorrectedKmer("CCC"), "ACC"); - Assert.assertEquals(corrector.getErrorCorrectedKmer("NNN"), null); - Assert.assertEquals(corrector.getErrorCorrectedKmer("NNC"), "ACC"); + testCorrection(corrector, "ATG", "ATG"); + testCorrection(corrector, "ACC", "ACC"); + testCorrection(corrector, "AAA", "AAA"); + testCorrection(corrector, "CTG", "ATG"); + testCorrection(corrector, "NNA", "AAA"); + testCorrection(corrector, "CCC", "ACC"); + testCorrection(corrector, "NNN", null); + testCorrection(corrector, "NNC", "ACC"); + + Assert.assertNotNull(corrector.toString()); + } + + private void testCorrection(final KMerErrorCorrector corrector, final String in, final String out) { + Assert.assertEquals(corrector.getErrorCorrectedKmer(in), out); + Assert.assertEquals(corrector.getErrorCorrectedKmer(in.getBytes()), out == null ? null : out.getBytes()); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java new file mode 100644 index 000000000..b5089e878 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java @@ -0,0 +1,106 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class SeqGraphUnitTest extends BaseTest { + private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { + public byte[] sequence; + public int KMER_LENGTH; + + public MergeNodesWithNoVariationTestProvider(String seq, int kmer) { + super(MergeNodesWithNoVariationTestProvider.class, String.format("Merge nodes with no variation test. 
kmer = %d, seq = %s", kmer, seq)); + sequence = seq.getBytes(); + KMER_LENGTH = kmer; + } + + public SeqGraph calcGraph() { + final DeBruijnGraph deBruijnGraph = new DeBruijnGraph(); + final int kmersInSequence = sequence.length - KMER_LENGTH + 1; + for (int i = 0; i < kmersInSequence - 1; i++) { + // get the kmers + final byte[] kmer1 = new byte[KMER_LENGTH]; + System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH); + final byte[] kmer2 = new byte[KMER_LENGTH]; + System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH); + + deBruijnGraph.addKmersToGraph(kmer1, kmer2, false, 1); + } + final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); + seqGraph.mergeNodes(); + return seqGraph; + } + } + + @DataProvider(name = "MergeNodesWithNoVariationTestProvider") + public Object[][] makeMergeNodesWithNoVariationTests() { + new MergeNodesWithNoVariationTestProvider("GGTTAACC", 3); + new MergeNodesWithNoVariationTestProvider("GGTTAACC", 4); + new MergeNodesWithNoVariationTestProvider("GGTTAACC", 5); + new MergeNodesWithNoVariationTestProvider("GGTTAACC", 6); + new MergeNodesWithNoVariationTestProvider("GGTTAACC", 7); + new MergeNodesWithNoVariationTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6); + new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66); + new 
MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76); + + return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class); + } + + @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = true) + public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) { + logger.warn(String.format("Test: %s", cfg.toString())); + + final SeqGraph actual = cfg.calcGraph(); + Assert.assertEquals(actual.vertexSet().size(), 1); + final SeqVertex actualV = actual.vertexSet().iterator().next(); + Assert.assertEquals(actualV.getSequence(), cfg.sequence); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java new file mode 100644 index 000000000..ca38351cc --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java @@ -0,0 +1,109 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class SeqVertexUnitTest extends BaseTest { + @Test + public void testBasic() { + final byte[] bases = "ACT".getBytes(); + final SeqVertex v1 = new SeqVertex(bases); + final SeqVertex v2 = new SeqVertex(bases); + Assert.assertTrue(v1.getId() >= 0); + Assert.assertTrue(v2.getId() >= 0); + Assert.assertTrue(v2.getId() > v1.getId()); + } + + @Test + public void testEqualsAndHashCode() { + final byte[] bases = "ACT".getBytes(); + final SeqVertex v1 = new SeqVertex(bases); + final SeqVertex v1_neq = new SeqVertex(bases); + final SeqVertex v1_eq = new SeqVertex(v1); + + Assert.assertEquals(v1, v1); + Assert.assertEquals(v1.hashCode(), v1.hashCode()); + Assert.assertEquals(v1, v1_eq); + Assert.assertEquals(v1.hashCode(), v1_eq.hashCode()); + Assert.assertFalse(v1.equals(v1_neq)); + Assert.assertFalse(v1_neq.equals(v1)); + Assert.assertFalse(v1_neq.hashCode() == v1.hashCode()); + } + + @DataProvider(name = "WithoutSuffixData") + public Object[][] makeWithoutSuffixData() { + List 
tests = new ArrayList(); + + final String bases = "ACGTACGTACGT"; + final int l = bases.length(); + for ( int suffixLength = 0; suffixLength <= l; suffixLength++ ) { + final int suffixStart = l - suffixLength; + final String prefix = suffixLength == l ? null : bases.substring(0, suffixStart); + final String suffix = suffixStart == l ? "" : bases.substring(suffixStart, l); + tests.add(new Object[]{bases, suffix, prefix}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "WithoutSuffixData") + public void testWithoutSuffix(final String bases, final String suffix, final String expected) { + final SeqVertex basesSV = new SeqVertex(bases); + if ( expected == null ) + Assert.assertNull(basesSV.withoutSuffix(suffix.getBytes()), "Failed for bases " + bases + " with suffix " + suffix + " != " + expected); + else + Assert.assertEquals(basesSV.withoutSuffix(suffix.getBytes()).getSequenceString(), expected, "Failed for bases " + bases + " with suffix " + suffix + " != " + expected); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index ff64133a7..e50025ea1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -795,4 +795,17 @@ public class Utils { while (md5String.length() < 32) md5String = "0" + md5String; // pad to length 32 return md5String; } + + /** + * Does big end with the exact sequence of bytes in suffix? 
+ * + * @param big a non-null byte[] to test if it's a prefix + suffix + * @param suffix a non-null byte[] to test if it's a suffix of big + * @return true if big is proper byte[] composed of some prefix + suffix + */ + public static boolean endsWith(final byte[] big, final byte[] suffix) { + if ( big == null ) throw new IllegalArgumentException("big cannot be null"); + if ( suffix == null ) throw new IllegalArgumentException("suffix cannot be null"); + return new String(big).endsWith(new String(suffix)); + } } From 1fa5050faf232bd9cff8edc5f521ec5cbd66ec22 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 19 Mar 2013 14:33:09 -0400 Subject: [PATCH 09/16] Cleanup, unit test, and optimize KBestPaths and Path -- Split Path from inner class of KBestPaths -- Use google MinMaxPriorityQueue to track best k paths, a more efficient implementation -- Path now properly typed throughout the code -- Path maintains an on-demand hashset of BaseEdges so that path.containsEdge is fast --- .../haplotypecaller/DeBruijnAssembler.java | 2 +- .../walkers/haplotypecaller/KBestPaths.java | 337 ++------------- .../gatk/walkers/haplotypecaller/Path.java | 394 ++++++++++++++++++ .../haplotypecaller/KBestPathsUnitTest.java | 176 +++++--- 4 files changed, 549 insertions(+), 360 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 9d84d611f..688d5336e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -325,7 +325,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } for( final SeqGraph graph : graphs ) { - for ( final KBestPaths.Path path : new 
KBestPaths().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + for ( final Path path : new KBestPaths().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { Haplotype h = new Haplotype( path.getBases() ); if( !returnHaplotypes.contains(h) ) { final Cigar cigar = path.calculateCigar(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java index 8c29cfa98..0724729a8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java @@ -46,293 +46,44 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +import com.google.common.collect.MinMaxPriorityQueue; import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; import java.io.Serializable; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; /** - * Created by IntelliJ IDEA. - * User: ebanks, rpoplin + * Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. + * This is different from most graph traversals because we want to test paths from any source node to any sink node. + * + * User: ebanks, rpoplin, mdepristo * Date: Mar 23, 2011 */ -// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. -// This is different from most graph traversals because we want to test paths from any source node to any sink node. 
public class KBestPaths { - // static access only public KBestPaths() { } - private static int MAX_PATHS_TO_HOLD = 100; - protected static class MyInt { public int val = 0; } - // class to keep track of paths - protected static class Path { - // the last vertex seen in the path - private final T lastVertex; - - // the list of edges comprising the path - private final List edges; - - // the scores for the path - private final int totalScore; - - // the graph from which this path originated - private final BaseGraph graph; - - // used in the bubble state machine to apply Smith-Waterman to the bubble sequence - // these values were chosen via optimization against the NA12878 knowledge base - private static final double SW_MATCH = 20.0; - private static final double SW_MISMATCH = -15.0; - private static final double SW_GAP = -26.0; - private static final double SW_GAP_EXTEND = -1.1; - private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); - - public Path( final T initialVertex, final BaseGraph graph ) { - lastVertex = initialVertex; - edges = new ArrayList(0); - totalScore = 0; - this.graph = graph; - } - - public Path( final Path p, final BaseEdge edge ) { - if( !p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); } - - graph = p.graph; - lastVertex = p.graph.getEdgeTarget(edge); - edges = new ArrayList(p.edges); - edges.add(edge); - totalScore = p.totalScore + edge.getMultiplicity(); - } - - /** - * Does this path contain the given edge - * @param edge the given edge to test - * @return true if the edge is found in this path - */ - public boolean containsEdge( final BaseEdge edge ) { - if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } - - for( final BaseEdge e : edges ) { - if( e.equals(graph, edge) ) { - return true; - } - } - - return false; - } - - /** - * Calculate the number of times this edge appears in the path - * 
@param edge the given edge to test - * @return number of times this edge appears in the path - */ - public int numInPath( final BaseEdge edge ) { - if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } - - int numInPath = 0; - for( final BaseEdge e : edges ) { - if( e.equals(graph, edge) ) { - numInPath++; - } - } - - return numInPath; - } - - public List getEdges() { return edges; } - - public int getScore() { return totalScore; } - - public T getLastVertexInPath() { return lastVertex; } - - /** - * The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes - * @return non-null sequence of bases corresponding to this path - */ - @Ensures({"result != null"}) - public byte[] getBases() { - if( edges.size() == 0 ) { return graph.getAdditionalSequence(lastVertex); } - - byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edges.get(0))); - for( final BaseEdge e : edges ) { - bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); - } - return bases; - } - - /** - * Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble - * @return non-null Cigar string with reference length equal to the refHaplotype's reference length - */ - @Ensures("result != null") - public Cigar calculateCigar() { - - final Cigar cigar = new Cigar(); - // special case for paths that start on reference but not at the reference source node - if( edges.get(0).isRef() && !graph.isRefSource(edges.get(0)) ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edges.get(0))).getCigarElements() ) { - cigar.add(ce); - } - } - - // reset the bubble state machine - final BubbleStateMachine bsm = new BubbleStateMachine(cigar); - - for( final BaseEdge e : edges ) { - if( e.equals(graph, edges.get(0)) ) { - advanceBubbleStateMachine( bsm, 
graph.getEdgeSource(e), null ); - } - advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e ); - } - - // special case for paths that don't end on reference - if( bsm.inBubble ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) { - bsm.cigar.add(ce); - } - } else if( edges.get(edges.size()-1).isRef() && !graph.isRefSink(edges.get(edges.size()-1)) ) { // special case for paths that end of the reference but haven't completed the entire reference circuit - for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edges.get(edges.size()-1)), null).getCigarElements() ) { - bsm.cigar.add(ce); - } - } - - return AlignmentUtils.consolidateCigar(bsm.cigar); - } - - /** - * Advance the bubble state machine by incorporating the next node in the path. - * @param bsm the current bubble state machine - * @param node the node to be incorporated - * @param e the edge which generated this node in the path - */ - @Requires({"bsm != null", "graph != null", "node != null"}) - private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final T node, final BaseEdge e ) { - if( graph.isReferenceNode( node ) ) { - if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else - if( e !=null && !e.isRef() ) { - if( graph.referencePathExists( graph.getEdgeSource(e), node) ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) { - bsm.cigar.add(ce); - } - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); - } else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) ); - } else { - bsm.inBubble = true; - bsm.bubbleBytes = null; - bsm.lastSeenReferenceNode = 
graph.getEdgeSource(e); - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } - } else { - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); - } - } else if( bsm.lastSeenReferenceNode != null && !graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } else { // close the bubble and use a local SW to determine the Cigar string - for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) { - bsm.cigar.add(ce); - } - bsm.inBubble = false; - bsm.bubbleBytes = null; - bsm.lastSeenReferenceNode = null; - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); - } - } else { // non-ref vertex - if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } else { // open up a bubble - bsm.inBubble = true; - bsm.bubbleBytes = null; - bsm.lastSeenReferenceNode = (e != null ? 
graph.getEdgeSource(e) : null ); - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } - } - } - - /** - * Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble - * @param bubbleBytes the bytes that comprise the alternate allele path in this bubble - * @param fromVertex the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex) - * @param toVertex the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex) - * @return the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble - */ - @Requires({"graph != null"}) - @Ensures({"result != null"}) - private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) { - final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? 
graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null); - - final Cigar returnCigar = new Cigar(); - - // add padding to anchor ref/alt bases in the SW matrix - byte[] padding = STARTING_SW_ANCHOR_BYTES; - boolean goodAlignment = false; - SWPairwiseAlignment swConsensus = null; - while( !goodAlignment && padding.length < 1000 ) { - padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time - final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding ); - final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding ); - swConsensus = new SWPairwiseAlignment( reference, alternate, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); - if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) { - goodAlignment = true; - } - } - if( !goodAlignment ) { - returnCigar.add(new CigarElement(1, CigarOperator.N)); - return returnCigar; - } - - final Cigar swCigar = swConsensus.getCigar(); - if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference - returnCigar.add(new CigarElement(1, CigarOperator.N)); - } else { - for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { - // now we need to remove the padding from the cigar string - int length = swCigar.getCigarElement(iii).getLength(); - if( iii == 0 ) { length -= padding.length; } - if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; } - if( length > 0 ) { - returnCigar.add(new CigarElement(length, swCigar.getCigarElement(iii).getOperator())); - } - } - if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) { - throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? 
"-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar()); - } - } - - return returnCigar; - } - - // class to keep track of the bubble state machine - protected static class BubbleStateMachine { - public boolean inBubble = false; - public byte[] bubbleBytes = null; - public T lastSeenReferenceNode = null; - public Cigar cigar = null; - - public BubbleStateMachine( final Cigar initialCigar ) { - inBubble = false; - bubbleBytes = null; - lastSeenReferenceNode = null; - cigar = initialCigar; - } - } - } - + /** + * Compare paths such that paths with greater weight are earlier in a list + */ protected static class PathComparatorTotalScore implements Comparator, Serializable { @Override public int compare(final Path path1, final Path path2) { - return path1.totalScore - path2.totalScore; + return path2.getScore() - path1.getScore(); } } + /** + * @see #getKBestPaths(BaseGraph, int) retriving the first 1000 paths + */ + public List> getKBestPaths( final BaseGraph graph ) { + return getKBestPaths(graph, 1000); + } + /** * Traverse the graph and pull out the best k paths. * Paths are scored via their comparator function. 
The default being PathComparatorTotalScore() @@ -341,51 +92,41 @@ public class KBestPaths { * @return a list with at most k top-scoring paths from the graph */ @Ensures({"result != null", "result.size() <= k"}) - public List getKBestPaths( final BaseGraph graph, final int k ) { + public List> getKBestPaths( final BaseGraph graph, final int k ) { if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); } - if( k > MAX_PATHS_TO_HOLD/2 ) { throw new IllegalArgumentException("Asked for more paths than internal parameters allow for."); } - final ArrayList bestPaths = new ArrayList(); - + // a min max queue that will collect the best k paths + final MinMaxPriorityQueue> bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create(); + // run a DFS for best paths - for( final T v : graph.vertexSet() ) { - if( graph.inDegreeOf(v) == 0 ) { - findBestPaths(new Path(v, graph), bestPaths); + for ( final T v : graph.vertexSet() ) { + if ( graph.inDegreeOf(v) == 0 ) { + findBestPaths(new Path(v, graph), bestPaths, new MyInt()); } } - Collections.sort(bestPaths, new PathComparatorTotalScore() ); - Collections.reverse(bestPaths); - return bestPaths.subList(0, Math.min(k, bestPaths.size())); + // the MinMaxPriorityQueue iterator returns items in an arbitrary order, so we need to sort the final result + final List> toReturn = new ArrayList>(bestPaths); + Collections.sort(toReturn, new PathComparatorTotalScore()); + return toReturn; } - private void findBestPaths( final Path path, final List bestPaths ) { - findBestPaths(path, bestPaths, new MyInt()); - } - - private void findBestPaths( final Path path, final List bestPaths, final MyInt n ) { - + private void findBestPaths( final Path path, final MinMaxPriorityQueue> bestPaths, final MyInt n ) { // did we hit the end of a path? 
if ( allOutgoingEdgesHaveBeenVisited(path) ) { - if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) { - // clean out some low scoring paths - Collections.sort(bestPaths, new PathComparatorTotalScore() ); - for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20 - } bestPaths.add(path); - } else if( n.val > 10000) { - // do nothing, just return + } else if( n.val > 10000 ) { + // do nothing, just return, as we've done too much work already } else { // recursively run DFS - final ArrayList edgeArrayList = new ArrayList(); - edgeArrayList.addAll(path.graph.outgoingEdgesOf(path.lastVertex)); + final ArrayList edgeArrayList = new ArrayList(path.getOutgoingEdgesOfLastVertex()); Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator()); for ( final BaseEdge edge : edgeArrayList ) { // make sure the edge is not already in the path if ( path.containsEdge(edge) ) continue; - final Path newPath = new Path(path, edge); + final Path newPath = new Path(path, edge); n.val++; findBestPaths(newPath, bestPaths, n); } @@ -393,11 +134,15 @@ public class KBestPaths { } /** + * Have all of the outgoing edges of the final vertex been visited? + * + * I.e., are all outgoing vertices of the current path in the list of edges of the graph? 
+ * * @param path the path to test * @return true if all the outgoing edges at the end of this path have already been visited */ private boolean allOutgoingEdgesHaveBeenVisited( final Path path ) { - for( final BaseEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) { + for( final BaseEdge edge : path.getOutgoingEdgesOfLastVertex() ) { if( !path.containsEdge(edge) ) { // TODO -- investigate allowing numInPath < 2 to allow cycles return false; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java new file mode 100644 index 000000000..895cffcca --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java @@ -0,0 +1,394 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; + +import java.util.*; + +/** + * A path through a BaseGraph + * + * class to keep track of paths + * + * User: depristo + * Date: 3/19/13 + * Time: 2:34 PM + * + */ +class Path { + // the last vertex seen in the path + private final T lastVertex; + + // the list of edges comprising the path + private Set edgesAsSet = null; + private final LinkedList edgesInOrder; + + // the scores for the path + private final int totalScore; + + // the graph from which this path originated + private final BaseGraph graph; + + // used in the bubble state machine to apply Smith-Waterman to the bubble sequence + // these values were chosen via optimization against the NA12878 knowledge base + private static final double SW_MATCH = 20.0; + private static final double SW_MISMATCH = -15.0; + private static final double SW_GAP = -26.0; + private static final double SW_GAP_EXTEND = -1.1; + private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); + + /** + * Create a new Path containing no edges and starting at initialVertex + * @param initialVertex the starting vertex of the path + * @param graph the graph this path will follow through + */ + public Path(final T initialVertex, final BaseGraph graph) { + if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null"); + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( ! 
graph.containsVertex(initialVertex) ) throw new IllegalArgumentException("Vertex " + initialVertex + " must be part of graph " + graph); + + lastVertex = initialVertex; + edgesInOrder = new LinkedList(); + totalScore = 0; + this.graph = graph; + } + + /** + * Create a new Path extending p with edge + * + * @param p the path to extend + * @param edge the edge to extend path by + */ + public Path(final Path p, final BaseEdge edge) { + if ( p == null ) throw new IllegalArgumentException("Path cannot be null"); + if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null"); + if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't"); + if ( ! p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); } + + graph = p.graph; + lastVertex = p.graph.getEdgeTarget(edge); + edgesInOrder = new LinkedList(p.getEdges()); + edgesInOrder.add(edge); + totalScore = p.totalScore + edge.getMultiplicity(); + } + + /** + * Get the collection of edges leaving the last vertex of this path + * @return a non-null collection + */ + public Collection getOutgoingEdgesOfLastVertex() { + return getGraph().outgoingEdgesOf(getLastVertex()); + } + + /** + * Does this path contain the given edge + * @param edge the given edge to test + * @return true if the edge is found in this path + */ + public boolean containsEdge( final BaseEdge edge ) { + if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } + if ( edgesInOrder.isEmpty() ) return false; + + // initialize contains cache if necessary + if ( edgesAsSet == null ) edgesAsSet = new HashSet(edgesInOrder); + return edgesAsSet.contains(edge); + } + + /** + * Check that two paths have the same edges and total score + * @param path the other path we might be the same as + * @return true if this and path are the same + */ + protected boolean pathsAreTheSame(Path 
path) { + return totalScore == path.totalScore && edgesInOrder.equals(path.edgesInOrder); + } + + /** + * Render this path as its score followed by the sequence strings of its vertices, in path order, separated by " -> " + * @return a non-null string describing this path + */ + @Override + public String toString() { + final StringBuilder b = new StringBuilder("Path{score=" + totalScore + ", path="); + boolean first = true; + for ( final T v : getVertices() ) { + // the separator goes between vertices, so skip it only before the first one + if ( first ) { + first = false; + } else { + b.append(" -> "); + } + b.append(v.getSequenceString()); + } + return b.append('}').toString(); + } + + /** + * Get the graph of this path + * @return a non-null graph + */ + @Ensures("result != null") + public BaseGraph getGraph() { + return graph; + } + + /** + * Get the edges of this path in order + * @return a non-null list of edges + */ + @Ensures("result != null") + public List getEdges() { return edgesInOrder; } + + /** + * Get the list of vertices in this path in order defined by the edges of the path + * @return a non-null, non-empty list of vertices + */ + @Ensures({"result != null", "!result.isEmpty()"}) + public List getVertices() { + if ( getEdges().isEmpty() ) + return Collections.singletonList(lastVertex); + else { + final LinkedList vertices = new LinkedList(); + boolean first = true; + for ( final BaseEdge e : getEdges() ) { + if ( first ) { + vertices.add(graph.getEdgeSource(e)); + first = false; + } + vertices.add(graph.getEdgeTarget(e)); + } + return vertices; + } + } + + /** + * Get the total score of this path (bigger is better) + * @return a positive integer + */ + @Ensures("result >= 0") + public int getScore() { return totalScore; } + + /** + * Get the final vertex of the path + * @return a non-null vertex + */ + @Ensures("result != null") + public T getLastVertex() { return lastVertex; } + + /** + * The base sequence for this path. 
Pull the full sequence for source nodes and then the suffix for all subsequent nodes + * @return non-null sequence of bases corresponding to this path + */ + @Ensures({"result != null"}) + public byte[] getBases() { + if( getEdges().isEmpty() ) { return graph.getAdditionalSequence(lastVertex); } + + byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.getFirst())); + for( final BaseEdge e : edgesInOrder ) { + bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); + } + return bases; + } + + /** + * Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble + * @return non-null Cigar string with reference length equal to the refHaplotype's reference length + */ + @Ensures("result != null") + public Cigar calculateCigar() { + final Cigar cigar = new Cigar(); + // special case for paths that start on reference but not at the reference source node + if( edgesInOrder.getFirst().isRef() && !graph.isRefSource(edgesInOrder.getFirst()) ) { + for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edgesInOrder.getFirst())).getCigarElements() ) { + cigar.add(ce); + } + } + + // reset the bubble state machine + final BubbleStateMachine bsm = new BubbleStateMachine(cigar); + + for( final BaseEdge e : getEdges() ) { + if( e.equals(graph, edgesInOrder.getFirst()) ) { + advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null ); + } + advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e ); + } + + // special case for paths that don't end on reference + if( bsm.inBubble ) { + for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) { + bsm.cigar.add(ce); + } + } else if( edgesInOrder.getLast().isRef() && !graph.isRefSink(edgesInOrder.getLast()) ) { // special case for paths that end of the reference but haven't completed the 
entire reference circuit + for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edgesInOrder.getLast()), null).getCigarElements() ) { + bsm.cigar.add(ce); + } + } + + return AlignmentUtils.consolidateCigar(bsm.cigar); + } + + /** + * Advance the bubble state machine by incorporating the next node in the path. + * @param bsm the current bubble state machine + * @param node the node to be incorporated + * @param e the edge which generated this node in the path + */ + @Requires({"bsm != null", "graph != null", "node != null"}) + private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final T node, final BaseEdge e ) { + if( graph.isReferenceNode( node ) ) { + if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else + if( e !=null && !e.isRef() ) { + if( graph.referencePathExists( graph.getEdgeSource(e), node) ) { + for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) { + bsm.cigar.add(ce); + } + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); + } else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) ); + } else { + bsm.inBubble = true; + bsm.bubbleBytes = null; + bsm.lastSeenReferenceNode = graph.getEdgeSource(e); + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); + } + } else { + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); + } + } else if( bsm.lastSeenReferenceNode != null && !graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); + } else { 
// close the bubble and use a local SW to determine the Cigar string + for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) { + bsm.cigar.add(ce); + } + bsm.inBubble = false; + bsm.bubbleBytes = null; + bsm.lastSeenReferenceNode = null; + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); + } + } else { // non-ref vertex + if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); + } else { // open up a bubble + bsm.inBubble = true; + bsm.bubbleBytes = null; + bsm.lastSeenReferenceNode = (e != null ? graph.getEdgeSource(e) : null ); + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); + } + } + } + + /** + * Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble + * @param bubbleBytes the bytes that comprise the alternate allele path in this bubble + * @param fromVertex the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex) + * @param toVertex the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex) + * @return the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble + */ + @Requires({"graph != null"}) + @Ensures({"result != null"}) + private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) { + final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? 
graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null); + + final Cigar returnCigar = new Cigar(); + + // add padding to anchor ref/alt bases in the SW matrix + byte[] padding = STARTING_SW_ANCHOR_BYTES; + boolean goodAlignment = false; + SWPairwiseAlignment swConsensus = null; + while( !goodAlignment && padding.length < 1000 ) { + padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time + final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding ); + final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding ); + swConsensus = new SWPairwiseAlignment( reference, alternate, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) { + goodAlignment = true; + } + } + if( !goodAlignment ) { + returnCigar.add(new CigarElement(1, CigarOperator.N)); + return returnCigar; + } + + final Cigar swCigar = swConsensus.getCigar(); + if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference + returnCigar.add(new CigarElement(1, CigarOperator.N)); + } else { + for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { + // now we need to remove the padding from the cigar string + int length = swCigar.getCigarElement(iii).getLength(); + if( iii == 0 ) { length -= padding.length; } + if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; } + if( length > 0 ) { + returnCigar.add(new CigarElement(length, swCigar.getCigarElement(iii).getOperator())); + } + } + if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) { + // callers in this class pass a null bubbleBytes, so guard it like refBytes to avoid an NPE masking this exception + throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? 
"-" : new String(refBytes)) + " against " + (bubbleBytes == null ? "-" : new String(bubbleBytes)) + " = " + swConsensus.getCigar()); + } + } + + return returnCigar; + } + + // class to keep track of the bubble state machine + private static class BubbleStateMachine { + public boolean inBubble = false; + public byte[] bubbleBytes = null; + public T lastSeenReferenceNode = null; + public Cigar cigar = null; + + public BubbleStateMachine( final Cigar initialCigar ) { + inBubble = false; + bubbleBytes = null; + lastSeenReferenceNode = null; + cigar = initialCigar; + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java index 10863cef9..34b4ba912 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java @@ -55,9 +55,9 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; import java.util.ArrayList; import java.util.Arrays; +import java.util.LinkedList; import java.util.List; /** @@ -67,6 +67,72 @@ import java.util.List; */ public class KBestPathsUnitTest { + @DataProvider(name = "BasicPathFindingData") + public Object[][] makeBasicPathFindingData() { + List tests = new ArrayList(); +// for ( final int nStartNodes : Arrays.asList(1) ) { +// for ( final int nBranchesPerBubble : Arrays.asList(2) ) { +// for ( final int nEndNodes : Arrays.asList(1) ) { +// for ( final boolean addCycle : Arrays.asList(true) ) { + for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) { + for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) { + for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) { + for ( final boolean addCycle : Arrays.asList(true, false) ) { + tests.add(new Object[]{nStartNodes, nBranchesPerBubble, 
nEndNodes, addCycle}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private static int weight = 1; + final List createVertices(final SeqGraph graph, final int n, final SeqVertex source, final SeqVertex target) { + final List seqs = Arrays.asList("A", "C", "G", "T"); + final List vertices = new LinkedList(); + for ( int i = 0; i < n; i++ ) { + final SeqVertex v = new SeqVertex(seqs.get(i)); + graph.addVertex(v); + vertices.add(v); + if ( source != null ) graph.addEdge(source, v, new BaseEdge(false, weight++)); + if ( target != null ) graph.addEdge(v, target, new BaseEdge(false, weight++)); + } + return vertices; + } + + @Test(dataProvider = "BasicPathFindingData", enabled = true) + public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes, final boolean addCycle) { + SeqGraph graph = new SeqGraph(); + + final SeqVertex middleTop = new SeqVertex("GTAC"); + final SeqVertex middleBottom = new SeqVertex("ACTG"); + graph.addVertices(middleTop, middleBottom); + final List starts = createVertices(graph, nStartNodes, null, middleTop); + final List bubbles = createVertices(graph, nBranchesPerBubble, middleTop, middleBottom); + final List ends = createVertices(graph, nEndNodes, middleBottom, null); + + if ( addCycle ) graph.addEdge(middleBottom, middleBottom); + + // enumerate all possible paths + final List> paths = new KBestPaths().getKBestPaths(graph); + + final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * (addCycle ? 2 : 1) * nEndNodes; + Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths"); + + int lastScore = Integer.MAX_VALUE; + for ( final Path path : paths ) { + Assert.assertTrue(path.getScore() <= lastScore, "Paths out of order. 
Path " + path + " has score above previous " + lastScore); + lastScore = path.getScore(); + } + + // get the best path, and make sure it's the same as our optimal path overall + final Path best = paths.get(0); + final List> justOne = new KBestPaths().getKBestPaths(graph, 1); + Assert.assertEquals(justOne.size(), 1); + Assert.assertTrue(justOne.get(0).pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0)); + } + @DataProvider(name = "BasicBubbleDataProvider") public Object[][] makeBasicBubbleDataProvider() { List tests = new ArrayList(); @@ -99,12 +165,10 @@ public class KBestPathsUnitTest { graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); - graph.printGraph(new File("test.dot"), 10); - // Construct the test path - KBestPaths.Path path = new KBestPaths.Path(v, graph); - path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); + Path path = new Path(v, graph); + path = new Path(path, graph.getEdge(v, v2Alt)); + path = new Path(path, graph.getEdge(v2Alt, v3)); // Construct the actual cigar string implied by the test path Cigar expectedCigar = new Cigar(); @@ -123,52 +187,40 @@ public class KBestPathsUnitTest { Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); } - // TODO -- test block substitution because it doesn't look like it's correct now -// @Test(dataProvider = "BasicBubbleDataProvider") -// public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { -// // Construct the assembly graph -// final int KMER_LENGTH = 3; -// SeqGraph graph = new SeqGraph(KMER_LENGTH); -// final String preRef = "ATGG"; -// final String postRef = "GGGGC"; -// -// SeqVertex v = new SeqVertex(preRef); -// SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); -// SeqVertex 
v2Alt = new SeqVertex(Utils.dupString('T', altBubbleLength)); -// SeqVertex v3 = new SeqVertex(postRef); -// -// graph.addVertex(v); -// graph.addVertex(v2Ref); -// graph.addVertex(v2Alt); -// graph.addVertex(v3); -// graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); -// graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); -// graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); -// graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); -// -// graph.printGraph(new File("test.dot"), 10); -// -// // Construct the test path -// KBestPaths.Path path = new KBestPaths.Path(v, graph); -// path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); -// path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); -// -// // Construct the actual cigar string implied by the test path -// Cigar expectedCigar = new Cigar(); -// expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); -// if( refBubbleLength > altBubbleLength ) { -// expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); -// expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); -// } else if ( refBubbleLength < altBubbleLength ) { -// expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); -// expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); -// } else { -// expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); -// } -// expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); -// -// Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); -// } + @DataProvider(name = "GetBasesData") + public Object[][] makeGetBasesData() { + List tests = new ArrayList(); + + final List frags = Arrays.asList("ACT", "GAC", "CAT"); + + for ( int n = 1; n <= frags.size(); n++ ) { + for ( final List comb : Utils.makePermutations(frags, n, false) ) { + tests.add(new Object[]{comb}); + } + } + 
return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "GetBasesData", enabled = true) + public void testGetBases(final List frags) { + // Construct the assembly graph + SeqGraph graph = new SeqGraph(3); + + SeqVertex prev = null; + for ( int i = 0; i < frags.size(); i++ ) { + SeqVertex v = new SeqVertex(frags.get(i)); + graph.addVertex(v); + if ( prev != null ) + graph.addEdge(prev, v); + prev = v; + } + + // enumerate all possible paths + final List> paths = new KBestPaths().getKBestPaths(graph); + Assert.assertEquals(paths.size(), 1); + final Path path = paths.get(0); + Assert.assertEquals(new String(path.getBases()), Utils.join("", frags), "Path doesn't have the expected sequence"); + } @DataProvider(name = "TripleBubbleDataProvider") public Object[][] makeTripleBubbleDataProvider() { @@ -236,21 +288,19 @@ public class KBestPathsUnitTest { graph.addEdge(v6Alt, v7, new BaseEdge(false, 55)); graph.addEdge(v7, postV, new BaseEdge(false, 1)); - graph.printGraph(new File("test.debruijn.dot"), 10); - // Construct the test path - KBestPaths.Path path = new KBestPaths.Path( (offRefBeginning ? preV : v), graph); + Path path = new Path( (offRefBeginning ? 
preV : v), graph); if( offRefBeginning ) { - path = new KBestPaths.Path(path, graph.getEdge(preV, v)); + path = new Path(path, graph.getEdge(preV, v)); } - path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); - path = new KBestPaths.Path(path, graph.getEdge(v3, v4Ref)); - path = new KBestPaths.Path(path, graph.getEdge(v4Ref, v5)); - path = new KBestPaths.Path(path, graph.getEdge(v5, v6Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v6Alt, v7)); + path = new Path(path, graph.getEdge(v, v2Alt)); + path = new Path(path, graph.getEdge(v2Alt, v3)); + path = new Path(path, graph.getEdge(v3, v4Ref)); + path = new Path(path, graph.getEdge(v4Ref, v5)); + path = new Path(path, graph.getEdge(v5, v6Alt)); + path = new Path(path, graph.getEdge(v6Alt, v7)); if( offRefEnding ) { - path = new KBestPaths.Path(path, graph.getEdge(v7,postV)); + path = new Path(path, graph.getEdge(v7,postV)); } // Construct the actual cigar string implied by the test path From 2e36f15861fc16ae095faf517af16e01f6db7450 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 19 Mar 2013 16:22:06 -0400 Subject: [PATCH 10/16] Update md5s to reflect new downsampling and assembly algorithm output -- Only minor differences, with improvement in allele discovery where the sites differ. 
The test of an insertion at the start of the MT no longer calls a 1 bp indel at position 0 in the genome --- ...plexAndSymbolicVariantsIntegrationTest.java | 8 ++++---- .../HaplotypeCallerIntegrationTest.java | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 72e06ddc6..fd16ed856 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "b83b53741edb07218045d6f25f20a18b"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "2b9355ab532314bce157c918c7606409"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -75,7 +75,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa // TODO -- need a better symbolic allele test @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "298c1af47a515ea7c8c1ea704d7755ce"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "8225fb59b9fcbe767a473c9eb8b21537"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -87,12 +87,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 
20:119673-119823 -L 20:121408-121538", - "fd3412030628fccf77effdb1ec03dce7"); + "f2add041ba1692db576ae9763a14b8a6"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "633e8930a263e34def5e097889dd9805"); + "383320e81a1a3bee880fcc6cd0564452"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index fb267297f..c93e54f87 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "694d6ea7f0f305854d4108379d68de75"); + HCTest(CEUTRIO_BAM, "", "75dbef605b28f02616b13bb5d8bf2fbd"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "995501d8af646af3b6eaa4109e2fb4a0"); + HCTest(NA12878_BAM, "", "fa8705a5d3ada66470019fa7ddcb9b2c"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "627124af27dc4556d83df1a04e4b9f97"); + "9f9062a6eb93f984658492400102b0c7"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -96,12 +96,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - 
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "205fc8647b908c0dab7b5c6d6b78c0c2"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "3a38f6fade253577d205a00db3e67828"); } @Test public void testHaplotypeCallerInsertionOnEdgeOfContig() { - HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "e6f7bbab7cf96cbb25837b7a94bf0f82"); + HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -111,14 +111,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ccd30e226f097a40cdeebaa035a290a7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("1e7b1bda6be5d3835ae318f2977cfbdd")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("f1250a8ecd404443dcca20741a74ec4f")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6d63f558259883262ea84f339acb767")); executeTest("HCTestStructuralIndels: ", spec); } @@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam 
-o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("fd1b51b17f8f9c88abdf66a9372bce5a")); + Arrays.asList("5280f1a50ca27d8e435da0bd5b26ae93")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -148,7 +148,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("d3eb900eecdafafda3170f67adff42ae")); + Arrays.asList("addceb63f5bfa9f11e15335d5bf641e9")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } From 5226b24a119a99c8139996fec65a64ae711ad234 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 19 Mar 2013 18:09:23 -0400 Subject: [PATCH 11/16] HaplotypeCaller instructure cleanup and unit testing -- UnitTest for isRootOfDiamond along with key bugfix detected while testing -- Fix up the equals methods in BaseEdge. Now called hasSameSourceAndTarget and seqEquals. A much more meaningful naming -- Generalize graphEquals to use seqEquals, so it works equally well with Debruijn and SeqGraphs -- Add BaseVertex method called seqEquals that returns true if two BaseVertex objects have the same sequence -- Reorganize SeqGraph mergeNodes into a single master function that does zipping, branch merging, and zipping again, rather than having this done in the DeBruijnAssembler itself -- Massive expansion of the SeqGraph unit tests. We now really test out the zipping and branch merging code. -- Near final cleanup of the current codebase -- DeBruijnVertex cleanup and optimizations. Since kmer graphs don't allow sequences longer than the kmer size, the suffix is always a byte, not a byte[]. 
Optimize the code to make use of this constraint --- .../walkers/haplotypecaller/BaseEdge.java | 15 +- .../walkers/haplotypecaller/BaseGraph.java | 51 ++++- .../walkers/haplotypecaller/BaseVertex.java | 10 + .../haplotypecaller/DeBruijnAssembler.java | 11 +- .../haplotypecaller/DeBruijnVertex.java | 44 +++- .../gatk/walkers/haplotypecaller/Path.java | 2 +- .../walkers/haplotypecaller/SeqGraph.java | 175 ++++++++++----- .../DeBruijnVertexUnitTest.java | 3 +- .../haplotypecaller/SeqGraphUnitTest.java | 210 +++++++++++++++++- 9 files changed, 435 insertions(+), 86 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java index 053f0e1a1..7b5fd2bbd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java @@ -116,14 +116,21 @@ public class BaseEdge { this.isRef = isRef; } - // For use when comparing edges pulled from the same graph - public boolean equals( final BaseGraph graph, final BaseEdge edge ) { + /** + * Does this and edge have the same source and target vertices in graph? + * + * @param graph the graph containing both this and edge + * @param edge our comparator edge + * @param + * @return true if we have the same source and target vertices + */ + public boolean hasSameSourceAndTarget(final BaseGraph graph, final BaseEdge edge) { return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge))); } // For use when comparing edges across graphs! 
- public boolean equals( final BaseGraph graph, final BaseEdge edge, final BaseGraph graph2 ) { - return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge))); + public boolean seqEquals( final BaseGraph graph, final BaseEdge edge, final BaseGraph graph2 ) { + return (graph.getEdgeSource(this).seqEquals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).seqEquals(graph2.getEdgeTarget(edge))); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index 6aa687312..ec5c99bb1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -310,6 +310,19 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph the type of the nodes in those graphs + * @return true if g1 and g2 are equals + */ public static boolean graphEquals(final BaseGraph g1, BaseGraph g2) { - if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) { + final Set vertices1 = g1.vertexSet(); + final Set vertices2 = g2.vertexSet(); + final Set edges1 = g1.edgeSet(); + final Set edges2 = g2.edgeSet(); + + if ( vertices1.size() != vertices2.size() || edges1.size() != edges2.size() ) return false; + + for ( final T v1 : vertices1 ) { + boolean found = false; + for ( final T v2 : vertices2 ) + found = found || v1.getSequenceString().equals(v2.getSequenceString()); + if ( ! 
found ) return false; } - for( BaseEdge e1 : g1.edgeSet() ) { + + for( final BaseEdge e1 : g1.edgeSet() ) { boolean found = false; for( BaseEdge e2 : g2.edgeSet() ) { - if( e1.equals(g1, e2, g2) ) { found = true; break; } + if( e1.seqEquals(g1, e2, g2) ) { found = true; break; } } if( !found ) { return false; } } - for( BaseEdge e2 : g2.edgeSet() ) { + for( final BaseEdge e2 : g2.edgeSet() ) { boolean found = false; for( BaseEdge e1 : g1.edgeSet() ) { - if( e2.equals(g2, e1, g1) ) { found = true; break; } + if( e2.seqEquals(g2, e1, g1) ) { found = true; break; } } if( !found ) { return false; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java index fad7a51d1..b6d278105 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java @@ -99,6 +99,16 @@ public class BaseVertex { return true; } + /** + * Are b and this equal according to their base sequences? 
+ * + * @param b the vertex to compare ourselves to + * @return true if b and this have the same sequence, regardless of other attributes that might differentiate them + */ + public boolean seqEquals(final BaseVertex b) { + return Arrays.equals(this.getSequence(), b.getSequence()); + } + @Override public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect return Arrays.hashCode(sequence); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 688d5336e..6d295ff97 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -194,15 +194,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR); seqGraph.pruneGraph(PRUNE_FACTOR); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); - seqGraph.mergeNodes(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.preclean.dot"), PRUNE_FACTOR); seqGraph.removeVerticesNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), PRUNE_FACTOR); - seqGraph.mergeBranchingNodes(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.simplified.dot"), PRUNE_FACTOR); - seqGraph.mergeNodes(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.6.simplified.merged.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new 
File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); + seqGraph.simplifyGraph(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR); return seqGraph; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java index 47716b7c5..0a2c26ca4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -47,17 +47,20 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; - -import java.util.Arrays; /** * simple node class for storing kmer sequences * - * User: ebanks + * User: ebanks, mdepristo * Date: Mar 23, 2011 */ public class DeBruijnVertex extends BaseVertex { + private final static byte[][] sufficesAsByteArray = new byte[256][]; + static { + for ( int i = 0; i < sufficesAsByteArray.length; i++ ) + sufficesAsByteArray[i] = new byte[]{(byte)(i & 0xFF)}; + } + public DeBruijnVertex( final byte[] sequence ) { super(sequence); } @@ -85,17 +88,38 @@ public class DeBruijnVertex extends BaseVertex { */ @Ensures({"result != null", "result.length() >= 1"}) public String getSuffixString() { - return new String(getSuffix()); + return new String(getSuffixAsArray()); } - @Ensures("result != null") - // TODO this could be replaced with byte as the suffix is guarenteed to be exactly 1 base - public byte[] getSuffix() { - return Arrays.copyOfRange( sequence, getKmer() - 1, sequence.length ); + /** + * Get the suffix byte of this DeBruijnVertex + * + * The suffix byte is simply the last byte of the kmer sequence, so if this is holding sequence ACT + * getSuffix would return T + * + * @return a byte + */ + public byte getSuffix() { + return 
sequence[getKmer() - 1]; } + /** + * Optimized version that returns a byte[] for the single byte suffix of this graph without allocating memory. + * + * Should not be modified + * + * @return a byte[] that contains 1 byte == getSuffix() + */ + @Ensures({"result != null", "result.length == 1", "result[0] == getSuffix()"}) + private byte[] getSuffixAsArray() { + return sufficesAsByteArray[getSuffix()]; + } + + /** + * {@inheritDoc} + */ @Override public byte[] getAdditionalSequence(boolean source) { - return source ? super.getAdditionalSequence(source) : getSuffix(); + return source ? super.getAdditionalSequence(source) : getSuffixAsArray(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java index 895cffcca..7546155a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java @@ -254,7 +254,7 @@ class Path { final BubbleStateMachine bsm = new BubbleStateMachine(cigar); for( final BaseEdge e : getEdges() ) { - if( e.equals(graph, edgesInOrder.getFirst()) ) { + if ( e.hasSameSourceAndTarget(graph, edgesInOrder.getFirst()) ) { advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null ); } advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java index 960f2cdd7..f67815b92 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; 
import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; @@ -77,67 +79,83 @@ public class SeqGraph extends BaseGraph { super(kmer); } - protected void mergeNodes() { + /** + * Simplify this graph, merging vertices together and restructuring the graph in an + * effort to minimize the number of overall vertices in the graph without changing + * in any way the sequences implied by a complex enumeration of all paths through the graph. + */ + public void simplifyGraph() { + zipLinearChains(); + mergeBranchingNodes(); zipLinearChains(); } + /** + * Zip up all of the simple linear chains present in this graph. + */ protected void zipLinearChains() { - boolean foundNodesToMerge = true; - while( foundNodesToMerge ) { - foundNodesToMerge = false; - - for( final BaseEdge e : edgeSet() ) { - final SeqVertex outgoingVertex = getEdgeTarget(e); - final SeqVertex incomingVertex = getEdgeSource(e); - if( !outgoingVertex.equals(incomingVertex) - && outDegreeOf(incomingVertex) == 1 && inDegreeOf(outgoingVertex) == 1 - && isReferenceNode(incomingVertex) == isReferenceNode(outgoingVertex) ) { - - final Set outEdges = outgoingEdgesOf(outgoingVertex); - final Set inEdges = incomingEdgesOf(incomingVertex); - if( inEdges.size() == 1 && outEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - } else if( inEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); - } else if( outEdges.size() == 1 ) { - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); - } - - final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) ); - 
addVertex(addedVertex); - for( final BaseEdge edge : outEdges ) { - addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge.isRef(), edge.getMultiplicity())); - } - for( final BaseEdge edge : inEdges ) { - addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge.isRef(), edge.getMultiplicity())); - } - - removeVertex(incomingVertex); - removeVertex(outgoingVertex); - foundNodesToMerge = true; - break; - } - } + while( zipOneLinearChain() ) { + // just keep going until zipOneLinearChain says it's done } } - // - // X -> ABC -> Y - // -> aBC -> Y - // - // becomes - // - // X -> A -> BCY - // -> a -> BCY - // - public void mergeBranchingNodes() { + /** + * Merge together two vertices in the graph v1 -> v2 into a single vertex v' containing v1 + v2 sequence + * + * Only works on vertices where v1's only outgoing edge is to v2 and v2's only incoming edge is from v1. + * + * If such a pair of vertices is found, they are merged and the graph is updated. Otherwise nothing is changed. + * + * @return true if any such pair of vertices could be found, false otherwise + */ + protected boolean zipOneLinearChain() { + for( final BaseEdge e : edgeSet() ) { + final SeqVertex outgoingVertex = getEdgeTarget(e); + final SeqVertex incomingVertex = getEdgeSource(e); + if( !outgoingVertex.equals(incomingVertex) + && outDegreeOf(incomingVertex) == 1 && inDegreeOf(outgoingVertex) == 1 + && isReferenceNode(incomingVertex) == isReferenceNode(outgoingVertex) ) { + + final Set outEdges = outgoingEdgesOf(outgoingVertex); + final Set inEdges = incomingEdgesOf(incomingVertex); + if( inEdges.size() == 1 && outEdges.size() == 1 ) { + inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + } else if( inEdges.size() == 1 ) { + inEdges.iterator().next().setMultiplicity( 
inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + } else if( outEdges.size() == 1 ) { + outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + } + + final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) ); + addVertex(addedVertex); + for( final BaseEdge edge : outEdges ) { + addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge.isRef(), edge.getMultiplicity())); + } + for( final BaseEdge edge : inEdges ) { + addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge.isRef(), edge.getMultiplicity())); + } + + removeVertex(incomingVertex); + removeVertex(outgoingVertex); + return true; + } + } + + return false; + } + + /** + * Perform as many branch simplifications and merging operations as possible on this graph, + * modifying it in place. + */ + private void mergeBranchingNodes() { boolean foundNodesToMerge = true; while( foundNodesToMerge ) { foundNodesToMerge = false; for( final SeqVertex v : vertexSet() ) { - foundNodesToMerge = simplifyDiamond(v); + foundNodesToMerge = simplifyDiamondIfPossible(v); if ( foundNodesToMerge ) break; } @@ -153,8 +171,11 @@ public class SeqGraph extends BaseGraph { * \ | / / * b * - * @param v - * @return + * Only returns true if all outgoing edges of v go to vertices that all only connect to + * a single bottom node, and that all middle nodes have only the single edge + * + * @param v the vertex to test if its the top of a diamond pattern + * @return true if v is the root of a diamond */ protected boolean isRootOfDiamond(final SeqVertex v) { final Set ve = outgoingEdgesOf(v); @@ -173,6 +194,7 @@ public class SeqGraph extends BaseGraph { if ( inDegreeOf(mi) != 1 ) return false; + // make sure that all outgoing vertices of mi go only to the bottom node for ( final SeqVertex mt : outgoingVerticesOf(mi) ) { if ( bottom == null ) bottom = mt; @@ -181,9 +203,24 @@ 
public class SeqGraph extends BaseGraph { } } + // bottom has some connections coming in from other nodes, don't allow + if ( inDegreeOf(bottom) != ve.size() ) + return false; + return true; } + /** + * Return the longest suffix of bases shared among all provided vertices + * + * For example, if the vertices have sequences AC, CC, and ATC, this would return + * a single C. However, for ACC and TCC this would return CC. And for AC and TG this + * would return null; + * + * @param middleVertices a non-empty set of vertices + * @return + */ + @Requires("!middleVertices.isEmpty()") private byte[] commonSuffixOfEdgeTargets(final Set middleVertices) { final String[] kmers = new String[middleVertices.size()]; @@ -196,6 +233,14 @@ public class SeqGraph extends BaseGraph { return commonPrefix.equals("") ? null : StringUtils.reverse(commonPrefix).getBytes(); } + /** + * Get the node that is the bottom of a diamond configuration in the graph starting at top + * + * @param top + * @return + */ + @Requires("top != null") + @Ensures({"result != null"}) private SeqVertex getDiamondBottom(final SeqVertex top) { final BaseEdge topEdge = outgoingEdgesOf(top).iterator().next(); final SeqVertex middle = getEdgeTarget(topEdge); @@ -203,6 +248,13 @@ public class SeqGraph extends BaseGraph { return getEdgeTarget(middleEdge); } + /** + * Get the set of vertices that are in the middle of a diamond starting at top + * @param top + * @return + */ + @Requires("top != null") + @Ensures({"result != null", "!result.isEmpty()"}) final Set getMiddleVertices(final SeqVertex top) { final Set middles = new HashSet(); for ( final BaseEdge topToMiddle : outgoingEdgesOf(top) ) { @@ -211,7 +263,26 @@ public class SeqGraph extends BaseGraph { return middles; } - private boolean simplifyDiamond(final SeqVertex top) { + /** + * Simplify a diamond configuration in the current graph starting at top, if possible + * + * If top is actually the top of a diamond that can be simplified (i.e., doesn't have any + * 
random edges or other structure that would cause problems with the transformation), then this code + * performs the following transformation on this graph (modifying it): + * + * A -> M1 -> B, A -> M2 -> B, A -> Mn -> B + * + * becomes + * + * A -> M1' -> B', A -> M2' -> B', A -> Mn' -> B' + * + * where B' is composed of the longest common suffix of all Mi nodes + B, and Mi' are each + * middle vertex without their shared suffix. + * + * @param top a proposed vertex in this graph that might start a diamond (but doesn't have to) + * @return true if top actually starts a diamond and it could be simplified + */ + private boolean simplifyDiamondIfPossible(final SeqVertex top) { if ( ! isRootOfDiamond(top) ) return false; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java index 2db35e173..dfbe50668 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java @@ -58,8 +58,7 @@ public class DeBruijnVertexUnitTest extends BaseTest { Assert.assertEquals(v.getSequence(), bases); Assert.assertEquals(v.getSequenceString(), new String(bases)); Assert.assertEquals(v.length(), bases.length); - Assert.assertEquals(v.getSuffix().length, 1); - Assert.assertEquals(v.getSuffix()[0], (byte)'T'); + Assert.assertEquals(v.getSuffix(), (byte)'T'); Assert.assertEquals(v.getSuffixString(), "T"); Assert.assertEquals(v.getAdditionalSequence(true), bases); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java index b5089e878..c63996d66 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java @@ -51,6 +51,10 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + public class SeqGraphUnitTest extends BaseTest { private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { public byte[] sequence; @@ -75,7 +79,7 @@ public class SeqGraphUnitTest extends BaseTest { deBruijnGraph.addKmersToGraph(kmer1, kmer2, false, 1); } final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); - seqGraph.mergeNodes(); + seqGraph.simplifyGraph(); return seqGraph; } } @@ -103,4 +107,208 @@ public class SeqGraphUnitTest extends BaseTest { final SeqVertex actualV = actual.vertexSet().iterator().next(); Assert.assertEquals(actualV.getSequence(), cfg.sequence); } + + @DataProvider(name = "IsDiamondData") + public Object[][] makeIsDiamondData() throws Exception { + List tests = new ArrayList(); + + SeqGraph graph; + SeqVertex pre1, pre2, top, middle1, middle2, middle3, bottom, tail1, tail2; + + graph = new SeqGraph(); + + pre1 = new SeqVertex("ACT"); + pre2 = new SeqVertex("AGT"); + top = new SeqVertex("A"); + middle1 = new SeqVertex("CT"); + middle2 = new SeqVertex("CG"); + middle3 = new SeqVertex("CA"); + bottom = new SeqVertex("AA"); + tail1 = new SeqVertex("GC"); + tail2 = new SeqVertex("GC"); + + graph.addVertices(pre1, pre2, top, middle1, middle2, middle3, bottom, tail1, tail2); + graph.addEdges(pre1, top, middle1, bottom, tail1); + graph.addEdges(pre2, top, middle2, bottom, tail1); + graph.addEdges(top, middle3, bottom); + graph.addEdges(bottom, tail2); + + for ( final SeqVertex no : Arrays.asList(pre1, pre2, middle1, middle2, middle3, bottom, tail1, tail2)) { + tests.add(new Object[]{graph, no, false}); + } + 
tests.add(new Object[]{graph, top, true}); + + final SeqGraph danglingMiddleGraph = (SeqGraph)graph.clone(); + final SeqVertex danglingMiddle = new SeqVertex("A"); + danglingMiddleGraph.addVertex(danglingMiddle); + danglingMiddleGraph.addEdge(top, danglingMiddle); + tests.add(new Object[]{danglingMiddleGraph, top, false}); + + final SeqGraph strangerToBottom = (SeqGraph)graph.clone(); + final SeqVertex notAttachedToTop = new SeqVertex("A"); + strangerToBottom.addVertex(notAttachedToTop); + strangerToBottom.addEdge(notAttachedToTop, bottom); + tests.add(new Object[]{strangerToBottom, top, false}); + + final SeqGraph strangerToMiddle = (SeqGraph)graph.clone(); + final SeqVertex attachedToMiddle = new SeqVertex("A"); + strangerToMiddle.addVertex(attachedToMiddle); + strangerToMiddle.addEdge(attachedToMiddle, middle1); + tests.add(new Object[]{strangerToMiddle, top, false}); + + // middle1 has outgoing edge to non-bottom + final SeqGraph middleExtraOut = (SeqGraph)graph.clone(); + final SeqVertex fromMiddle = new SeqVertex("A"); + middleExtraOut.addVertex(fromMiddle); + middleExtraOut.addEdge(middle1, fromMiddle); + tests.add(new Object[]{middleExtraOut, top, false}); + + // top connects to bottom directly as well + { + final SeqGraph topConnectsToBottomToo = new SeqGraph(); + final SeqVertex top2 = new SeqVertex("A"); + final SeqVertex middle4 = new SeqVertex("C"); + final SeqVertex bottom2 = new SeqVertex("G"); + topConnectsToBottomToo.addVertices(top2, middle4, bottom2); + topConnectsToBottomToo.addEdges(top2, middle4, bottom2); + topConnectsToBottomToo.addEdges(top2, bottom2); + tests.add(new Object[]{topConnectsToBottomToo, top2, false}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "IsDiamondData", enabled = true) + public void testIsDiamond(final SeqGraph graph, final SeqVertex v, final boolean isRootOfDiamond) { + Assert.assertEquals(graph.isRootOfDiamond(v), isRootOfDiamond); + } + + @DataProvider(name = "MergingData") + public 
Object[][] makeMergingData() throws Exception { + List tests = new ArrayList(); + + final SeqGraph graph = new SeqGraph(); + + SeqVertex pre1 = new SeqVertex("ACT"); + SeqVertex pre2 = new SeqVertex("AGT"); + SeqVertex top = new SeqVertex("A"); + SeqVertex middle1 = new SeqVertex("GC"); + SeqVertex middle2 = new SeqVertex("TC"); + SeqVertex middle3 = new SeqVertex("AC"); + SeqVertex middle4 = new SeqVertex("GCAC"); + SeqVertex bottom = new SeqVertex("AA"); + SeqVertex tail1 = new SeqVertex("GC"); + SeqVertex tail2 = new SeqVertex("GC"); + + // just a single vertex + graph.addVertices(pre1); + tests.add(new Object[]{graph.clone(), graph.clone()}); + + // pre1 -> top = pre1 + top + { + graph.addVertices(top); + graph.addEdges(pre1, top); + final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); + final SeqGraph expected = new SeqGraph(); + expected.addVertex(pre1_top); + tests.add(new Object[]{graph.clone(), expected.clone()}); + } + + // pre1 -> top -> middle1 = pre1 + top + middle1 + { + graph.addVertices(middle1); + graph.addEdges(top, middle1); + final SeqGraph expected = new SeqGraph(); + final SeqVertex pre1_top_middle1 = new SeqVertex(pre1.getSequenceString() + top.getSequenceString() + middle1.getSequenceString()); + expected.addVertex(pre1_top_middle1); + tests.add(new Object[]{graph.clone(), expected}); + } + + // pre1 -> top -> middle1 & top -> middle2 = pre1 + top -> middle1 & -> middle2 + { + graph.addVertices(middle2); + graph.addEdges(top, middle2); + final SeqGraph expected = new SeqGraph(); + final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); + expected.addVertices(pre1_top, middle1, middle2); + expected.addEdges(pre1_top, middle1); + expected.addEdges(pre1_top, middle2); + tests.add(new Object[]{graph.clone(), expected}); + } + + // An actual diamond event to merge! 
+ { + graph.addVertices(bottom); + graph.addEdges(middle1, bottom); + graph.addEdges(middle2, bottom); + final SeqGraph expected = new SeqGraph(); + final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); + final SeqVertex newMiddle1 = new SeqVertex("G"); + final SeqVertex newMiddle2 = new SeqVertex("T"); + final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString()); + expected.addVertices(pre1_top, newMiddle1, newMiddle2, newBottom); + expected.addEdges(pre1_top, newMiddle1, newBottom); + expected.addEdges(pre1_top, newMiddle2, newBottom); + tests.add(new Object[]{graph.clone(), expected.clone()}); + + graph.addVertices(middle3); + graph.addEdges(top, middle3, bottom); + final SeqVertex newMiddle3 = new SeqVertex("A"); + expected.addVertices(newMiddle3); + expected.addEdges(pre1_top, newMiddle3, newBottom); + tests.add(new Object[]{graph.clone(), expected.clone()}); + + graph.addVertices(middle4); + graph.addEdges(top, middle4, bottom); + final SeqVertex newMiddle4 = new SeqVertex("GCA"); + expected.addVertices(newMiddle4); + expected.addEdges(pre1_top, newMiddle4, newBottom); + tests.add(new Object[]{graph.clone(), expected.clone()}); + } + + { + final SeqGraph all = new SeqGraph(); + all.addVertices(pre1, pre2, top, middle1, middle2, bottom, tail1, tail2); + all.addEdges(pre1, top, middle1, bottom, tail1); + all.addEdges(pre2, top, middle2, bottom, tail2); + + final SeqGraph expected = new SeqGraph(); + final SeqVertex newMiddle1 = new SeqVertex("G"); + final SeqVertex newMiddle2 = new SeqVertex("T"); + final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString()); + expected.addVertices(pre1, pre2, top, newMiddle1, newMiddle2, newBottom, tail1, tail2); + expected.addEdges(pre1, top, newMiddle1, newBottom, tail1); + expected.addEdges(pre2, top, newMiddle2, newBottom, tail2); + tests.add(new Object[]{all.clone(), expected.clone()}); + } + + // test the case where we delete a middle node away 
because the common sequence is all of its sequence + { + final SeqGraph graph2 = new SeqGraph(); + final SeqVertex mytop = new SeqVertex("A"); + final SeqVertex mid1 = new SeqVertex("AC"); + final SeqVertex mid2 = new SeqVertex("C"); + final SeqVertex bot = new SeqVertex("G"); + graph2.addVertices(mytop, mid1, mid2, bot); + graph2.addEdges(mytop, mid1, bot); + graph2.addEdges(mytop, mid2, bot); + + final SeqGraph expected = new SeqGraph(); + final SeqVertex newMid1 = new SeqVertex("A"); + final SeqVertex newBottom = new SeqVertex("CG"); + expected.addVertices(mytop, newMid1, newBottom); + expected.addEdges(mytop, newMid1, newBottom); + expected.addEdges(mytop, newBottom); + tests.add(new Object[]{graph2, expected}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MergingData", enabled = true) + public void testMerging(final SeqGraph graph, final SeqGraph expected) { + final SeqGraph merged = (SeqGraph)graph.clone(); + merged.simplifyGraph(); + Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); + } } From d3b756bdc737ef880fe1416989803e71716ccdea Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 20 Mar 2013 08:39:01 -0400 Subject: [PATCH 12/16] BaseVertex optimization: don't clone byte[] unnecessarily -- Don't clone sequence upon construction or in getSequence(), as these are frequently called, memory allocating routines and cloning will be prohibitively expensive --- .../walkers/haplotypecaller/BaseVertex.java | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java index b6d278105..93bd4f5c5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java @@ -61,14 +61,16 @@ public 
class BaseVertex { /** * Create a new sequence vertex with sequence + * + * This code doesn't copy sequence for efficiency reasons, so sequence should absolutely not be modified + * in any way after passing this sequence to the BaseVertex + * * @param sequence a non-null, non-empty sequence of bases contained in this vertex */ public BaseVertex(final byte[] sequence) { if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); if ( sequence.length == 0 ) throw new IllegalArgumentException("Sequence cannot be empty"); - - // TODO -- should we really be cloning here? - this.sequence = sequence.clone(); + this.sequence = sequence; } /** @@ -81,7 +83,7 @@ public class BaseVertex { /** * For testing purposes only -- low performance - * @param sequence + * @param sequence the sequence as a string */ protected BaseVertex(final String sequence) { this(sequence.getBytes()); @@ -109,8 +111,13 @@ public class BaseVertex { return Arrays.equals(this.getSequence(), b.getSequence()); } + /** + * necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect + * @return + */ @Override - public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect + public int hashCode() { + // TODO -- optimization, could compute upfront once and cached in debruijn graph return Arrays.hashCode(sequence); } @@ -128,8 +135,7 @@ public class BaseVertex { */ @Ensures("result != null") public byte[] getSequence() { - // TODO -- why is this cloning? It's likely extremely expensive - return sequence.clone(); + return sequence; } /** From 3a8f001c276808dbb78e199202e7174d66e5e6c6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 20 Mar 2013 14:26:37 -0400 Subject: [PATCH 13/16] Misc. fixes upon pull request review -- DeBruijnAssemblerUnitTest and AlignmentUtilsUnitTest were both in DEBUG = true mode (bad!) 
-- Remove the maxHaplotypesToConsider feature of HC as it's not useful --- .../haplotypecaller/DeBruijnAssembler.java | 34 ++++--------------- .../haplotypecaller/HaplotypeCaller.java | 6 +--- .../DeBruijnAssemblerUnitTest.java | 3 +- .../utils/sam/AlignmentUtilsUnitTest.java | 2 +- 4 files changed, 10 insertions(+), 35 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 6d295ff97..f3db422e7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -92,7 +92,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private final boolean debugGraphTransformations; private final PrintStream graphWriter; private final int minKmer; - private final int maxHaplotypesToConsider; private final byte minBaseQualityToUseInAssembly; private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; @@ -100,14 +99,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private int PRUNE_FACTOR = 2; protected DeBruijnAssembler() { - this(false, -1, null, 11, 1000, DEFAULT_MIN_BASE_QUALITY_TO_USE); + this(false, -1, null, 11, DEFAULT_MIN_BASE_QUALITY_TO_USE); } public DeBruijnAssembler(final boolean debug, final int debugGraphTransformations, final PrintStream graphWriter, final int minKmer, - final int maxHaplotypesToConsider, final byte minBaseQualityToUseInAssembly) { super(); this.debug = debug; @@ -115,7 +113,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations; this.graphWriter = graphWriter; this.minKmer = minKmer; - this.maxHaplotypesToConsider = maxHaplotypesToConsider; this.minBaseQualityToUseInAssembly = 
minBaseQualityToUseInAssembly; } @@ -371,39 +368,22 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - final List finalHaplotypes = selectHighestScoringHaplotypes(returnHaplotypes); - if ( finalHaplotypes.size() < returnHaplotypes.size() ) - logger.info("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); + if ( returnHaplotypes.size() < returnHaplotypes.size() ) + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); if( debug ) { - if( finalHaplotypes.size() > 1 ) { - System.out.println("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); + if( returnHaplotypes.size() > 1 ) { + System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); } else { System.out.println("Found only the reference haplotype in the assembly graph."); } - for( final Haplotype h : finalHaplotypes ) { + for( final Haplotype h : returnHaplotypes ) { System.out.println( h.toString() ); System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); } } - return finalHaplotypes; - } - - /** - * Select the best scoring haplotypes among all present, returning no more than maxHaplotypesToConsider - * - * @param haplotypes a list of haplotypes to consider - * @return a sublist of the best haplotypes, with size() <= maxHaplotypesToConsider - */ - private List selectHighestScoringHaplotypes(final List haplotypes) { - if ( haplotypes.size() <= maxHaplotypesToConsider ) - return haplotypes; - else { - final List sorted = new ArrayList(haplotypes); - Collections.sort(sorted, new 
Haplotype.ScoreComparator()); - return sorted.subList(0, maxHaplotypesToConsider); - } + return returnHaplotypes; } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 7bec4bee5..31751d8f0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -206,10 +206,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) protected int minKmer = 11; - @Advanced - @Argument(fullName="maxHaplotypesToConsider", shortName="maxHaplotypesToConsider", doc="Maximum number of haplotypes to consider in the likelihood calculation. Setting this number too high can have dramatic performance implications", required = false) - protected int maxHaplotypesToConsider = 100000; - /** * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the @@ -393,7 +389,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } final byte minBaseQualityToUseInAssembly = useLowQualityBasesForAssembly ? 
(byte)1 : DeBruijnAssembler.DEFAULT_MIN_BASE_QUALITY_TO_USE; - assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, maxHaplotypesToConsider, minBaseQualityToUseInAssembly ); + assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, minBaseQualityToUseInAssembly ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index fa581f7fd..663d619a8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -61,13 +61,12 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; -import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; public class DeBruijnAssemblerUnitTest extends BaseTest { - private final static boolean DEBUG = true; + private final static boolean DEBUG = false; @Test(enabled = !DEBUG) public void testReferenceCycleGraph() { diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index 660dadc00..125450257 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -37,7 +37,7 @@ import org.testng.annotations.Test; import 
java.util.*; public class AlignmentUtilsUnitTest { - private final static boolean DEBUG = true; + private final static boolean DEBUG = false; private SAMFileHeader header; /** Basic aligned and mapped read. */ From 6d7d21ca47a35b9925db55401464a3e7d86d9418 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 20 Mar 2013 15:57:10 -0400 Subject: [PATCH 14/16] Bugfix for incorrect branch diamond merging algorithm -- Previous version was just incorrectly accumulating information about nodes that were completely eliminated by the common suffix, so we were dropping some reference connections between vertices. Fixed. In the process simplified the entire algorithm and codebase -- Resolves https://jira.broadinstitute.org/browse/GSA-884 --- .../walkers/haplotypecaller/BaseEdge.java | 14 +++++ .../walkers/haplotypecaller/BaseGraph.java | 19 +++++- .../walkers/haplotypecaller/SeqGraph.java | 60 ++++++++----------- .../haplotypecaller/SeqGraphUnitTest.java | 30 ++++++++++ 4 files changed, 85 insertions(+), 38 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java index 7b5fd2bbd..d49b63672 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java @@ -143,4 +143,18 @@ public class BaseEdge { return edge2.multiplicity - edge1.multiplicity; } } + + /** + * Add edge to this edge, updating isRef and multiplicity as appropriate + * + * isRef is simply the or of this and edge + * multiplicity is the sum + * + * @param edge the edge to add + */ + public void add(final BaseEdge edge) { + if ( edge == null ) throw new IllegalArgumentException("edge cannot be null"); + this.multiplicity += edge.getMultiplicity(); + this.isRef = this.isRef || edge.isRef(); + } } diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index ec5c99bb1..c77ec4222 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -47,11 +47,11 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DefaultDirectedGraph; -import org.jgrapht.traverse.DepthFirstIterator; import java.io.File; import java.io.FileNotFoundException; @@ -64,7 +64,7 @@ import java.util.*; * User: rpoplin * Date: 2/6/13 */ - +@Invariant("!this.isAllowingMultipleEdges()") public class BaseGraph extends DefaultDirectedGraph { protected final static Logger logger = Logger.getLogger(BaseGraph.class); private final int kmerSize; @@ -513,4 +513,19 @@ public class BaseGraph extends DefaultDirectedGraph edges = getAllEdges(source, target); + return edges.isEmpty() ? 
null : edges.iterator().next(); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java index f67815b92..b855390c6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java @@ -51,6 +51,7 @@ import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; +import java.io.File; import java.util.*; /** @@ -149,7 +150,7 @@ public class SeqGraph extends BaseGraph { * Perform as many branch simplifications and merging operations as possible on this graph, * modifying it in place. */ - private void mergeBranchingNodes() { + protected void mergeBranchingNodes() { boolean foundNodesToMerge = true; while( foundNodesToMerge ) { foundNodesToMerge = false; @@ -288,61 +289,48 @@ public class SeqGraph extends BaseGraph { final SeqVertex diamondBottom = getDiamondBottom(top); final Set middleVertices = getMiddleVertices(top); - final List verticesToRemove = new LinkedList(); final List edgesToRemove = new LinkedList(); // all of the edges point to the same sink, so it's time to merge final byte[] commonSuffix = commonSuffixOfEdgeTargets(middleVertices); if ( commonSuffix != null ) { - boolean newBottomEdgeIsRef = false; - int newBottomEdgeMultiplicity = 0; - + final BaseEdge botToNewBottom = new BaseEdge(false, 0); + final BaseEdge elimMiddleNodeEdge = new BaseEdge(false, 0); final SeqVertex newBottomV = new SeqVertex(commonSuffix); addVertex(newBottomV); for ( final SeqVertex middle : middleVertices ) { - boolean missingNodeEdgeIsRef = false; - int missingNodeMultiplicity = 0; final SeqVertex withoutSuffix = middle.withoutSuffix(commonSuffix); + final BaseEdge topToMiddleEdge = getEdge(top, middle); + final BaseEdge middleToBottomE = 
getEdge(middle, diamondBottom); - if ( withoutSuffix != null ) // this node is a deletion + // clip out the two edges, since we'll be replacing them later + edgesToRemove.add(topToMiddleEdge); + edgesToRemove.add(middleToBottomE); + + if ( withoutSuffix != null ) { // this node is a deletion addVertex(withoutSuffix); - - // update all edges from top -> middle to be top -> without suffix - for( final BaseEdge topToMiddleEdge : getAllEdges(top, middle) ) { - edgesToRemove.add(topToMiddleEdge); - missingNodeMultiplicity += topToMiddleEdge.getMultiplicity(); - missingNodeEdgeIsRef = missingNodeEdgeIsRef || topToMiddleEdge.isRef(); - if ( withoutSuffix != null ) // this node is a deletion - addEdge(top, withoutSuffix, new BaseEdge(topToMiddleEdge.isRef(), topToMiddleEdge.getMultiplicity())); + // update edge from top -> middle to be top -> without suffix + addEdge(top, withoutSuffix, new BaseEdge(topToMiddleEdge)); + addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE)); + } else { + // this middle node is == the common suffix, wo we're removing the edge + elimMiddleNodeEdge.add(topToMiddleEdge); } - - // reattached prefix to the new bottom V by updating all edges from middleV -> bottom - for ( final BaseEdge middleToBottomE : getAllEdges(middle, diamondBottom) ) { - missingNodeMultiplicity += middleToBottomE.getMultiplicity(); - missingNodeEdgeIsRef = missingNodeEdgeIsRef || middleToBottomE.isRef(); - - if ( withoutSuffix != null ) // this node is a deletion - addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE.isRef(), middleToBottomE.getMultiplicity())); - edgesToRemove.add(middleToBottomE); - - // update the info for the new bottom edge - newBottomEdgeIsRef = newBottomEdgeIsRef || middleToBottomE.isRef(); - newBottomEdgeMultiplicity += middleToBottomE.getMultiplicity(); - } - - if ( withoutSuffix == null ) // add an edge from top to new bottom - addEdge(top, newBottomV, new BaseEdge(missingNodeEdgeIsRef, missingNodeMultiplicity)); - + // 
include the ref and multi of mid -> bot in our edge from new bot -> bot + botToNewBottom.add(middleToBottomE); verticesToRemove.add(middle); } - addEdge(newBottomV, diamondBottom, new BaseEdge(newBottomEdgeIsRef, newBottomEdgeMultiplicity)); + // add an edge from top to new bottom, because some middle nodes were removed + if ( elimMiddleNodeEdge.getMultiplicity() > 0 ) + addEdge(top, newBottomV, elimMiddleNodeEdge); + + addEdge(newBottomV, diamondBottom, botToNewBottom); removeAllEdges(edgesToRemove); removeAllVertices(verticesToRemove); - return true; } else { return false; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java index c63996d66..83a4f4c50 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java @@ -51,6 +51,7 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -311,4 +312,33 @@ public class SeqGraphUnitTest extends BaseTest { merged.simplifyGraph(); Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); } + + // A -> ACT -> C [non-ref] + // A -> ACT -> C [non-ref] + // A -> ACT -> C [ref] + // + // Should become A -> ACT -> C [ref and non-ref edges] + // + @Test + public void testBubbleSameBasesWithRef() { + final SeqGraph graph = new SeqGraph(); + final SeqVertex top = new SeqVertex("A"); + final SeqVertex mid1 = new SeqVertex("ACT"); + final SeqVertex mid2 = new SeqVertex("ACT"); + final SeqVertex bot = new SeqVertex("C"); + graph.addVertices(top, mid1, mid2, bot); + graph.addEdges(top, mid2, bot); + graph.addEdge(top, mid1, new BaseEdge(true, 1)); + graph.addEdge(mid1, bot, new 
BaseEdge(true, 1)); + + final SeqGraph expected = new SeqGraph(); + expected.addVertices(top, mid1, bot); + expected.addEdge(top, mid1, new BaseEdge(true, 2)); + expected.addEdge(mid1, bot, new BaseEdge(true, 2)); + + final SeqGraph actual = ((SeqGraph)graph.clone()); + actual.mergeBranchingNodes(); + + Assert.assertTrue(BaseGraph.graphEquals(actual, expected)); + } } From d94b3f85bcd56583bc25aa94d9ece3916df39908 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 20 Mar 2013 20:56:58 -0400 Subject: [PATCH 15/16] Increase NUM_BEST_PATHS_PER_KMER_GRAPH in DeBruijnAssembler to 25 -- The value of 11 was too small to properly return a real low-frequency variant in the 1000G AFR integration test. --- .../gatk/walkers/haplotypecaller/DeBruijnAssembler.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index f3db422e7..7cf4cc8d3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -78,7 +78,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class); private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers - private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; + + // TODO -- this number is very low, and limits our ability to explore low-frequency variants. 
It should + // TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where + // TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases + private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25; public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; private static final int GRAPH_KMER_STEP = 6; From aa7f172b18ff5ad8e5e881dce451c30ad362d61a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 20 Mar 2013 22:40:10 -0400 Subject: [PATCH 16/16] Cap the computational cost of the kmer based error correction in the DeBruijnGraph -- Simply don't do more than MAX_CORRECTION_OPS_TO_ALLOW = 5000 * 1000 operations to correct a graph. If the number of ops would exceed this threshold, the original graph is used. -- Overall the algorithm is just extremely computational expensive, and actually doesn't implement the correct correction. So we live with this limitations while we continue to explore better algorithms -- Updating MD5s to reflect changes in assembly algorithms --- .../haplotypecaller/DeBruijnGraph.java | 25 ++-- .../haplotypecaller/KMerErrorCorrector.java | 135 ++++++++++++++---- ...lexAndSymbolicVariantsIntegrationTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 2 +- 4 files changed, 127 insertions(+), 37 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java index d9df03539..0e20c311b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java @@ -90,7 +90,8 @@ public class DeBruijnGraph extends BaseGraph { /** * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers - * @return a freshly allocated graph 
+ * @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason + * we cannot actually do the error correction */ protected DeBruijnGraph errorCorrect() { final KMerErrorCorrector corrector = new KMerErrorCorrector(getKmerSize(), 1, 1, 5); // TODO -- should be static variables @@ -101,19 +102,23 @@ public class DeBruijnGraph extends BaseGraph { corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity()); } } - corrector.computeErrorCorrectionMap(); - final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize()); + if ( corrector.computeErrorCorrectionMap() ) { + final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize()); - for( final BaseEdge e : edgeSet() ) { - final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence()); - final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence()); - if ( source != null && target != null ) { - correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); + for( final BaseEdge e : edgeSet() ) { + final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence()); + final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence()); + if ( source != null && target != null ) { + correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); + } } - } - return correctedGraph; + return correctedGraph; + } else { + // the error correction wasn't possible, simply return this graph + return this; + } } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java index 05bd1b881..b051e5411 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +import org.apache.log4j.Logger; + import java.util.*; /** @@ -69,15 +71,54 @@ import java.util.*; * TODO -- be added to hashmaps (more specifically, those don't implement .equals). A more efficient * TODO -- version would use the byte[] directly * + * TODO -- this is just not the right way to implement error correction in the graph. Basically, the + * right way to think about this is error correcting reads: + * + * * + * ACTGAT + * ACT + * CTG + * TGA + * GAT + * + * Now suppose the G is an error. What you are doing is asking for each 3mer in the read whether it's high quality + * or not. Suppose the answer is + * + * * + * ACTGAT + * ACT -- yes + * CTG -- no [CTG is unusual] + * TGA -- no [TGA is unusual] + * GAT -- yes [maybe GAT is just common, even though it's an error] + * + * As we do this process it's clear how we can figure out which positions in the read likely harbor errors, and + * then go search around those bases in the read in an attempt to fix the read. We don't have to compute for + * every bad kmer its best match, as that's just not the problem we are looking to solve. We are actually + * looking for a change to a read such that all spanning kmers are well-supported. This class is being disabled + * until we implement this change. + * + * * User: depristo * Date: 3/8/13 * Time: 1:16 PM */ public class KMerErrorCorrector { + private final static Logger logger = Logger.getLogger(KMerErrorCorrector.class); + + /** + * The maximum number of bad kmer -> good kmer correction operations we'll consider doing before + * aborting for efficiency reasons. Basically, the current algorithm sucks, and is O(n^2), and + * so we cannot simply error correct 10K bad kmers against a db of 100K kmers if we ever want + * to finish running in a reasonable amount of time. 
This isn't worth fixing because fundamentally + * the entire error correction algorithm is just not right (i.e., it's correct but not ideal conceptually + * so we'll just fix the conceptual problem rather than the performance issue). + */ + private final static int MAX_CORRECTION_OPS_TO_ALLOW = 5000 * 1000; + /** * A map of for each kmer to its num occurrences in addKmers */ - Map countsByKMer = new HashMap(); + Map countsByKMer = new HashMap(); /** * A map from raw kmer -> error corrected kmer @@ -154,35 +195,45 @@ public class KMerErrorCorrector { * Indicate that no more kmers will be added to the kmer error corrector, so that the * error correction data structure should be computed from the added kmers. Enabled calls * to getErrorCorrectedKmer, and disable calls to addKmer. + * + * @return true if the error correction map could actually be computed, false if for any reason + * (efficiency, memory, we're out to lunch) a correction map couldn't be created. */ - public void computeErrorCorrectionMap() { + public boolean computeErrorCorrectionMap() { if ( countsByKMer == null ) throw new IllegalStateException("computeErrorCorrectionMap can only be called once"); - final LinkedList needsCorrection = new LinkedList(); - final LinkedList goodKmers = new LinkedList(); + final LinkedList needsCorrection = new LinkedList(); + final List goodKmers = new ArrayList(countsByKMer.size()); - rawToErrorCorrectedMap = new HashMap(); - for ( Map.Entry kmerCounts: countsByKMer.entrySet() ) { - if ( kmerCounts.getValue() <= maxCountToCorrect ) - needsCorrection.add(kmerCounts.getKey()); + rawToErrorCorrectedMap = new HashMap(countsByKMer.size()); + for ( final CountedKmer countedKmer: countsByKMer.values() ) { + if ( countedKmer.count <= maxCountToCorrect ) + needsCorrection.add(countedKmer); else { // todo -- optimization could make not in map mean == - rawToErrorCorrectedMap.put(kmerCounts.getKey(), kmerCounts.getKey()); + rawToErrorCorrectedMap.put(countedKmer.kmer, countedKmer.kmer); 
// only allow corrections to kmers with at least this count - if ( kmerCounts.getValue() >= minCountOfKmerToBeCorrection ) - goodKmers.add(kmerCounts.getKey()); + if ( countedKmer.count >= minCountOfKmerToBeCorrection ) + goodKmers.add(countedKmer); } } - for ( final String toCorrect : needsCorrection ) { - final String corrected = findClosestKMer(toCorrect, goodKmers); - rawToErrorCorrectedMap.put(toCorrect, corrected); - } - // cleanup memory -- we don't need the counts for each kmer any longer countsByKMer = null; + + if ( goodKmers.size() * needsCorrection.size() > MAX_CORRECTION_OPS_TO_ALLOW ) + return false; + else { + Collections.sort(goodKmers); + for ( final CountedKmer toCorrect : needsCorrection ) { + final String corrected = findClosestKMer(toCorrect, goodKmers); + rawToErrorCorrectedMap.put(toCorrect.kmer, corrected); + } + + return true; + } } protected void addKmer(final String rawKmer, final int kmerCount) { @@ -190,30 +241,42 @@ public class KMerErrorCorrector { if ( kmerCount < 0 ) throw new IllegalArgumentException("bad kmerCount " + kmerCount); if ( countsByKMer == null ) throw new IllegalStateException("Cannot add kmers to an already finalized error corrector"); - final Integer countFromMap = countsByKMer.get(rawKmer); - final int count = countFromMap == null ? 
0 : countFromMap; - countsByKMer.put(rawKmer, count + kmerCount); + CountedKmer countFromMap = countsByKMer.get(rawKmer); + if ( countFromMap == null ) { + countFromMap = new CountedKmer(rawKmer); + countsByKMer.put(rawKmer, countFromMap); + } + countFromMap.count += kmerCount; } - protected String findClosestKMer(final String kmer, final Collection goodKmers) { + protected String findClosestKMer(final CountedKmer kmer, final Collection goodKmers) { String bestMatch = null; int minMismatches = Integer.MAX_VALUE; - for ( final String goodKmer : goodKmers ) { - final int mismatches = countMismatches(kmer, goodKmer); + for ( final CountedKmer goodKmer : goodKmers ) { + final int mismatches = countMismatches(kmer.kmer, goodKmer.kmer, minMismatches); if ( mismatches < minMismatches ) { minMismatches = mismatches; - bestMatch = goodKmer; + bestMatch = goodKmer.kmer; } + + // if we find an edit-distance 1 result, abort early, as we know there can be no edit distance 0 results + if ( mismatches == 1 ) + break; } return minMismatches > maxMismatchesToCorrect ? null : bestMatch; } - protected int countMismatches(final String one, final String two) { + protected int countMismatches(final String one, final String two, final int currentBest) { int mismatches = 0; - for ( int i = 0; i < one.length(); i++ ) + for ( int i = 0; i < one.length(); i++ ) { mismatches += one.charAt(i) == two.charAt(i) ? 
0 : 1; + if ( mismatches > currentBest ) + break; + if ( mismatches > maxMismatchesToCorrect ) + return Integer.MAX_VALUE; + } return mismatches; } @@ -238,4 +301,26 @@ public class KMerErrorCorrector { b.append("\n}"); return b.toString(); } + + private static class CountedKmer implements Comparable { + final String kmer; + int count; + + private CountedKmer(String kmer) { + this.kmer = kmer; + } + + @Override + public String toString() { + return "CountedKmer{" + + "kmer='" + kmer + '\'' + + ", count=" + count + + '}'; + } + + @Override + public int compareTo(CountedKmer o) { + return o.count - count; + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index fd16ed856..12dc71799 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "2b9355ab532314bce157c918c7606409"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "91f4880910e436bf5aca0abbebd58948"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index c93e54f87..5ee0a6b81 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "9f9062a6eb93f984658492400102b0c7"); + "d41a886f69a67e01af2ba1a6b4a681d9"); } private void HCTestIndelQualityScores(String bam, String args, String md5) {