Added some docs, unit test, and contracts to SimpleDeBruijnAssembler.

-- Testing that cycles in the reference graph fail graph construction appropriately.
-- Minor bug fix in assembly with reduced reads.

Added some docs and contracts to SimpleDeBruijnAssembler

Added a unit test to SimpleDeBruijnAssembler
This commit is contained in:
Ryan Poplin 2013-02-04 11:03:52 -05:00
parent 43e3a040b6
commit 79ef41e7b1
2 changed files with 62 additions and 29 deletions

View File

@ -47,6 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Haplotype;
@ -96,7 +97,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
MIN_KMER = minKmer;
}
/**
* Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
* @param activeRegion ActiveRegion object holding the reads which are to be used during assembly
* @param refHaplotype reference haplotype object
* @param fullReferenceWithPadding byte array holding the reference sequence with padding
* @param refLoc GenomeLoc object corresponding to the reference sequence with padding
* @param PRUNE_FACTOR prune kmers from the graph if their weight is <= this value
* @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode
* @return a non-empty list of all the haplotypes that are produced during assembly
*/
@Ensures({"result.contains(refHaplotype)"})
public List<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final List<VariantContext> activeAllelesToGenotype ) {
if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); }
if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); }
if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
if( PRUNE_FACTOR < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
// set the pruning factor for this run of the assembly engine
this.PRUNE_FACTOR = PRUNE_FACTOR;
// create the graphs
@ -109,31 +127,35 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
mergeNodes( graph );
}
// print the graphs if the appropriate debug option has been turned on
if( GRAPH_WRITER != null ) {
printGraphs();
}
// find the best paths in the graphs
// find the best paths in the graphs and return them as haplotypes
return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
}
@Requires({"reads != null", "refHaplotype != null"})
protected void createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
graphs.clear();
final int maxKmer = refHaplotype.getBases().length;
// create the graph
// create the graph for each possible kmer
for( int kmer = MIN_KMER; kmer <= maxKmer; kmer += 6 ) {
final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
if( createGraphFromSequences( graph, reads, kmer, refHaplotype, DEBUG ) ) {
final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG );
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
graphs.add(graph);
}
}
}
@Requires({"graph != null"})
protected static void mergeNodes( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph ) {
boolean foundNodesToMerge = true;
while( foundNodesToMerge ) {
foundNodesToMerge = false;
for( final DeBruijnEdge e : graph.edgeSet() ) {
final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e);
final DeBruijnVertex incomingVertex = graph.getEdgeSource(e);
@ -211,25 +233,26 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
}
}
private static boolean createGraphFromSequences( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Collection<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
@Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"})
protected static DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> createGraphFromSequences( final List<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
// First pull kmers from the reference haplotype and add them to the graph
final byte[] refSequence = refHaplotype.getBases();
if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
for (int i = 0; i < kmersInSequence - 1; i++) {
// get the kmers
final byte[] kmer1 = new byte[KMER_LENGTH];
System.arraycopy(refSequence, i, kmer1, 0, KMER_LENGTH);
final byte[] kmer2 = new byte[KMER_LENGTH];
System.arraycopy(refSequence, i+1, kmer2, 0, KMER_LENGTH);
if( !addKmersToGraph(graph, kmer1, kmer2, true) ) {
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
if( !addKmersToGraph(graph, Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true) ) {
if( DEBUG ) {
System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping");
}
return false;
return null;
}
}
}
// Next pull kmers out of every read and throw them on the graph
for( final GATKSAMRecord read : reads ) {
final byte[] sequence = read.getReadBases();
final byte[] qualities = read.getBaseQualities();
@ -245,31 +268,28 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
break;
}
}
int countNumber = 1;
if (read.isReducedRead()) {
// compute mean number of reduced read counts in current kmer span
final byte[] counts = Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1);
// precise rounding can make a difference with low consensus counts
countNumber = MathUtils.arrayMax(counts);
// countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length);
}
if( !badKmer ) {
// get the kmers
final byte[] kmer1 = new byte[KMER_LENGTH];
System.arraycopy(sequence, iii, kmer1, 0, KMER_LENGTH);
final byte[] kmer2 = new byte[KMER_LENGTH];
System.arraycopy(sequence, iii+1, kmer2, 0, KMER_LENGTH);
int countNumber = 1;
if( read.isReducedRead() ) {
// compute mean number of reduced read counts in current kmer span
// precise rounding can make a difference with low consensus counts
countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + KMER_LENGTH));
}
for (int k=0; k < countNumber; k++)
final byte[] kmer1 = Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH);
final byte[] kmer2 = Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH);
for( int kkk=0; kkk < countNumber; kkk++ ) {
addKmersToGraph(graph, kmer1, kmer2, false);
}
}
}
}
}
return true;
return graph;
}
@Requires({"graph != null", "kmer1.length > 0", "kmer2.length > 0"})
protected static boolean addKmersToGraph( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final byte[] kmer1, final byte[] kmer2, final boolean isRef ) {
final int numVertexBefore = graph.vertexSet().size();
@ -378,6 +398,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
return returnHaplotypes;
}
// this function is slated for removal when SWing is removed
private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final List<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
if( haplotype == null ) { return false; }

View File

@ -56,6 +56,7 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.ArtificialReadPileupTestProvider;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
@ -298,4 +299,15 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest {
}
return true;
}
@Test(enabled = true)
public void testReferenceCycleGraph() {
String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC";
String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC";
final DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> g1 = SimpleDeBruijnAssembler.createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true), false);
final DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> g2 = SimpleDeBruijnAssembler.createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true), false);
Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation.");
Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation.");
}
}