diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java similarity index 60% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 4edb3f9fa..087d526da 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -48,19 +48,20 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.jgrapht.graph.DefaultDirectedGraph; import java.io.PrintStream; import java.util.*; @@ -71,13 +72,15 @@ import java.util.*; * Date: Mar 14, 2011 */ -public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { +public class DeBruijnAssembler extends LocalAssemblyEngine { private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers - private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; + private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 12; private static final byte MIN_QUALITY = (byte) 16; + private static final int MAX_POSSIBLE_KMER = 75; + private static final int GRAPH_KMER_STEP = 6; - // Smith-Waterman parameters originally copied from IndelRealigner + // Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode private static final double SW_MATCH = 5.0; // 1.0; private static final double SW_MISMATCH = -10.0; //-1.0/3.0; private static final double SW_GAP = -22.0; //-1.0-1.0/3.0; @@ -85,12 +88,12 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { private final boolean DEBUG; private final PrintStream GRAPH_WRITER; - private final List> graphs = new ArrayList>(); + private final List graphs = new ArrayList(); private final int MIN_KMER; private int PRUNE_FACTOR = 2; - public SimpleDeBruijnAssembler( final boolean debug, final PrintStream graphWriter, final int minKmer ) { + public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer) { super(); DEBUG = debug; GRAPH_WRITER = graphWriter; @@ -120,13 +123,6 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // create the graphs createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); - // clean up the graphs by pruning and merging - for( final DefaultDirectedGraph graph : graphs ) { - pruneGraph( graph, PRUNE_FACTOR ); - //eliminateNonRefPaths( graph ); - mergeNodes( graph ); - } - // print the graphs if the appropriate debug option has been turned on if( GRAPH_WRITER != null ) { printGraphs(); @@ -140,18 +136,25 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { protected void createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { graphs.clear(); - final int maxKmer = refHaplotype.getBases().length; + final int maxKmer = Math.min(MAX_POSSIBLE_KMER, refHaplotype.getBases().length - KMER_OVERLAP); // create the graph for each possible kmer - for( int kmer = MIN_KMER; kmer <= maxKmer; kmer += 6 ) { - final DefaultDirectedGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG ); + for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) { + final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG ); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object - graphs.add(graph); + // do a series of steps to clean up the raw assembly graph to make it analysis-ready + pruneGraph(graph, PRUNE_FACTOR); + cleanNonRefPaths(graph); + mergeNodes(graph); + if( graph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference + sanityCheckReferenceGraph(graph, refHaplotype); + graphs.add(graph); + } } } } @Requires({"graph != null"}) - protected static void mergeNodes( final DefaultDirectedGraph graph ) { + protected static void mergeNodes( final DeBruijnAssemblyGraph graph ) { boolean foundNodesToMerge = true; while( foundNodesToMerge ) { foundNodesToMerge = false; @@ -159,7 +162,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final DeBruijnEdge e : graph.edgeSet() ) { final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e); final DeBruijnVertex incomingVertex = graph.getEdgeSource(e); - if( !outgoingVertex.equals(incomingVertex) && graph.inDegreeOf(outgoingVertex) == 1 && graph.outDegreeOf(incomingVertex) == 1) { + if( !outgoingVertex.equals(incomingVertex) && graph.outDegreeOf(incomingVertex) == 1 && graph.inDegreeOf(outgoingVertex) == 1 && + graph.inDegreeOf(incomingVertex) <= 1 && graph.outDegreeOf(outgoingVertex) <= 1 && graph.isReferenceNode(incomingVertex) == graph.isReferenceNode(outgoingVertex) ) { final Set outEdges = graph.outgoingEdgesOf(outgoingVertex); final Set inEdges = graph.incomingEdgesOf(incomingVertex); if( inEdges.size() == 1 && outEdges.size() == 1 ) { @@ -189,7 +193,42 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } } - protected static void pruneGraph( final DefaultDirectedGraph graph, final int pruneFactor ) { + protected static void cleanNonRefPaths( final DeBruijnAssemblyGraph graph ) { + if( graph.getReferenceSourceVertex() == null || graph.getReferenceSinkVertex() == null ) { + return; + } + // Remove non-ref edges connected before and after the reference path + final Set edgesToCheck = new HashSet(); + edgesToCheck.addAll(graph.incomingEdgesOf(graph.getReferenceSourceVertex())); + while( !edgesToCheck.isEmpty() ) { + final DeBruijnEdge e = edgesToCheck.iterator().next(); + if( !e.isRef() ) { + edgesToCheck.addAll( graph.incomingEdgesOf(graph.getEdgeSource(e)) ); + graph.removeEdge(e); + } + edgesToCheck.remove(e); + } + edgesToCheck.addAll(graph.outgoingEdgesOf(graph.getReferenceSinkVertex())); + while( !edgesToCheck.isEmpty() ) { + final DeBruijnEdge e = edgesToCheck.iterator().next(); + if( !e.isRef() ) { + edgesToCheck.addAll( graph.outgoingEdgesOf(graph.getEdgeTarget(e)) ); + graph.removeEdge(e); + } + edgesToCheck.remove(e); + } + + // Run through the graph and clean up singular orphaned nodes + final List verticesToRemove = new ArrayList(); + for( final DeBruijnVertex v : graph.vertexSet() ) { + if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) { + verticesToRemove.add(v); + } + } + graph.removeAllVertices(verticesToRemove); + } + + protected static void pruneGraph( final DeBruijnAssemblyGraph graph, final int pruneFactor ) { final List edgesToRemove = new ArrayList(); for( final DeBruijnEdge e : graph.edgeSet() ) { if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor @@ -208,42 +247,32 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { graph.removeAllVertices(verticesToRemove); } - protected static void eliminateNonRefPaths( final DefaultDirectedGraph graph ) { - final List verticesToRemove = new ArrayList(); - boolean done = false; - while( !done ) { - done = true; - for( final DeBruijnVertex v : graph.vertexSet() ) { - if( graph.inDegreeOf(v) == 0 || graph.outDegreeOf(v) == 0 ) { - boolean isRefNode = false; - for( final DeBruijnEdge e : graph.edgesOf(v) ) { - if( e.isRef() ) { - isRefNode = true; - break; - } - } - if( !isRefNode ) { - done = false; - verticesToRemove.add(v); - } - } - } - graph.removeAllVertices(verticesToRemove); - verticesToRemove.clear(); + protected static void sanityCheckReferenceGraph(final DeBruijnAssemblyGraph graph, final Haplotype refHaplotype) { + if( graph.getReferenceSourceVertex() == null ) { + throw new IllegalStateException("All reference graphs must have a reference source vertex."); + } + if( graph.getReferenceSinkVertex() == null ) { + throw new IllegalStateException("All reference graphs must have a reference sink vertex."); + } + if( !Arrays.equals(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true), refHaplotype.getBases()) ) { + throw new IllegalStateException("Mismatch between the reference haplotype and the reference assembly graph path." + + " graph = " + new String(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true)) + + " haplotype = " + new String(refHaplotype.getBases()) + ); } } @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) - protected static DefaultDirectedGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { + protected static DeBruijnAssemblyGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); + final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); // First pull kmers from the reference haplotype and add them to the graph final byte[] refSequence = refHaplotype.getBases(); if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - if( !addKmersToGraph(graph, Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true) ) { + if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true) ) { if( DEBUG ) { System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping"); } @@ -280,7 +309,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final byte[] kmer2 = Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH); for( int kkk=0; kkk < countNumber; kkk++ ) { - addKmersToGraph(graph, kmer1, kmer2, false); + graph.addKmersToGraph(kmer1, kmer2, false); } } } @@ -289,32 +318,9 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { return graph; } - @Requires({"graph != null", "kmer1.length > 0", "kmer2.length > 0"}) - protected static boolean addKmersToGraph( final DefaultDirectedGraph graph, final byte[] kmer1, final byte[] kmer2, final boolean isRef ) { - - final int numVertexBefore = graph.vertexSet().size(); - final DeBruijnVertex v1 = new DeBruijnVertex( kmer1, kmer1.length ); - graph.addVertex(v1); - final DeBruijnVertex v2 = new DeBruijnVertex( kmer2, kmer2.length ); - graph.addVertex(v2); - if( isRef && graph.vertexSet().size() == numVertexBefore ) { return false; } - - final DeBruijnEdge targetEdge = graph.getEdge(v1, v2); - if ( targetEdge == null ) { - graph.addEdge(v1, v2, new DeBruijnEdge( isRef )); - } else { - if( isRef ) { - targetEdge.setIsRef( true ); - } - targetEdge.setMultiplicity(targetEdge.getMultiplicity() + 1); - } - return true; - } - protected void printGraphs() { - int count = 0; - for( final DefaultDirectedGraph graph : graphs ) { - GRAPH_WRITER.println("digraph kmer" + count++ +" {"); + GRAPH_WRITER.println("digraph assemblyGraphs {"); + for( final DeBruijnAssemblyGraph graph : graphs ) { for( final DeBruijnEdge edge : graph.edgeSet() ) { if( edge.getMultiplicity() > PRUNE_FACTOR ) { GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];"); @@ -325,24 +331,23 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } for( final DeBruijnVertex v : graph.vertexSet() ) { - final String label = ( graph.inDegreeOf(v) == 0 ? v.toString() : v.getSuffixString() ); - GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + label + "\"]"); + GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); } - GRAPH_WRITER.println("}"); } + GRAPH_WRITER.println("}"); } + @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"}) @Ensures({"result.contains(refHaplotype)"}) - private List findBestPaths( final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { - final List returnHaplotypes = new ArrayList(); + private List findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { - // add the reference haplotype separately from all the others - final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( fullReferenceWithPadding, refHaplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); - refHaplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() ); - refHaplotype.setCigar( swConsensus.getCigar() ); - if( !returnHaplotypes.add( refHaplotype ) ) { - throw new ReviewedStingException("Unable to add reference haplotype during assembly: " + refHaplotype); - } + // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes + final List returnHaplotypes = new ArrayList(); + refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart()); + final Cigar c = new Cigar(); + c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M)); + refHaplotype.setCigar(c); + returnHaplotypes.add( refHaplotype ); final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength(); @@ -351,30 +356,50 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final VariantContext compVC : activeAllelesToGenotype ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); - addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true ); + addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true ); } } - for( final DefaultDirectedGraph graph : graphs ) { + for( final DeBruijnAssemblyGraph graph : graphs ) { for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + Haplotype h = new Haplotype( path.getBases() ); + if( !returnHaplotypes.contains(h) ) { + final Cigar cigar = path.calculateCigar(); + if( cigar.isEmpty() ) { + throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength()); + } else if ( pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < 60 ) { // N cigar elements means that a bubble was too divergent from the reference so skip over this path + continue; + } else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure + throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength()); + } + h.setCigar(cigar); - final Haplotype h = new Haplotype( path.getBases() ); - if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ) ) { + // extend partial haplotypes which are anchored in the reference to include the full active region + h = extendPartialHaplotype(h, activeRegionStart, refWithPadding); + final Cigar leftAlignedCigar = leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(h.getCigar()), refWithPadding, h.getBases(), activeRegionStart, 0); + if( leftAlignedCigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // left alignment failure + continue; + } + if( !returnHaplotypes.contains(h) ) { + h.setAlignmentStartHapwrtRef(activeRegionStart); + h.setCigar( leftAlignedCigar ); + returnHaplotypes.add(h); - // for GGA mode, add the desired allele into the haplotype if it isn't already present - if( !activeAllelesToGenotype.isEmpty() ) { - final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place - for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present - final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); + // for GGA mode, add the desired allele into the haplotype if it isn't already present + if( !activeAllelesToGenotype.isEmpty() ) { + final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), refWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place + for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present + final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); - // This if statement used to additionally have: - // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)" - // but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto - // a haplotype that already contains a 1bp insertion (so practically it is reference but - // falls into the bin for the 1bp deletion because we keep track of the artificial alleles). - if( vcOnHaplotype == null ) { - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ); + // This if statement used to additionally have: + // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)" + // but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto + // a haplotype that already contains a 1bp insertion (so practically it is reference but + // falls into the bin for the 1bp deletion because we keep track of the artificial alleles). + if( vcOnHaplotype == null ) { + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ); + } } } } @@ -383,7 +408,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } } - if( DEBUG ) { + if( DEBUG ) { if( returnHaplotypes.size() > 1 ) { System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against."); } else { @@ -391,15 +416,124 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } for( final Haplotype h : returnHaplotypes ) { System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() ); + System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() ); } } - return returnHaplotypes; } - // this function is slated for removal when SWing is removed - private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final List haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) { + /** + * Extend partial haplotypes which are anchored in the reference to include the full active region + * @param haplotype the haplotype to extend + * @param activeRegionStart the place where the active region starts in the ref byte array + * @param refWithPadding the full reference byte array with padding which encompasses the active region + * @return a haplotype fully extended to encompass the active region + */ + @Requires({"haplotype != null", "activeRegionStart > 0", "refWithPadding != null", "refWithPadding.length > 0"}) + @Ensures({"result != null", "result.getCigar() != null"}) + private Haplotype extendPartialHaplotype( final Haplotype haplotype, final int activeRegionStart, final byte[] refWithPadding ) { + final Cigar cigar = haplotype.getCigar(); + final Cigar newCigar = new Cigar(); + byte[] newHaplotypeBases = haplotype.getBases(); + int refPos = activeRegionStart; + int hapPos = 0; + for( CigarElement ce : cigar.getCigarElements() ) { + switch (ce.getOperator()) { + case M: + refPos += ce.getLength(); + hapPos += ce.getLength(); + newCigar.add(ce); + break; + case I: + hapPos += ce.getLength(); + newCigar.add(ce); + break; + case D: + refPos += ce.getLength(); + newCigar.add(ce); + break; + case X: + newHaplotypeBases = ArrayUtils.addAll( Arrays.copyOfRange(newHaplotypeBases, 0, hapPos), + ArrayUtils.addAll(Arrays.copyOfRange(refWithPadding, refPos, refPos + ce.getLength()), + Arrays.copyOfRange(newHaplotypeBases, hapPos, newHaplotypeBases.length))); + refPos += ce.getLength(); + hapPos += ce.getLength(); + newCigar.add(new CigarElement(ce.getLength(), CigarOperator.M)); + break; + default: + throw new IllegalStateException("Unsupported cigar operator detected: " + ce.getOperator()); + } + } + final Haplotype returnHaplotype = new Haplotype(newHaplotypeBases, haplotype.isReference()); + returnHaplotype.setCigar( newCigar ); + return returnHaplotype; + } + + /** + * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal + * @param c the cigar to test + * @return true if we should skip over this path + */ + @Requires("c != null") + private boolean pathIsTooDivergentFromReference( final Cigar c ) { + for( final CigarElement ce : c.getCigarElements() ) { + if( ce.getOperator().equals(CigarOperator.N) ) { + return true; + } + } + return false; + } + + /** + * Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them. + * This is a target of future work to incorporate and generalize into AlignmentUtils for use by others. + * @param cigar the cigar to left align + * @param refSeq the reference byte array + * @param readSeq the read byte array + * @param refIndex 0-based alignment start position on ref + * @param readIndex 0-based alignment start position on read + * @return the left-aligned cigar + */ + @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"}) + protected static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { + final Cigar cigarToReturn = new Cigar(); + Cigar cigarToAlign = new Cigar(); + for (int i = 0; i < cigar.numCigarElements(); i++) { + final CigarElement ce = cigar.getCigarElement(i); + if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) { + cigarToAlign.add(ce); + for( final CigarElement toAdd : AlignmentUtils.leftAlignIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false).getCigarElements() ) { + cigarToReturn.add(toAdd); + } + refIndex += cigarToAlign.getReferenceLength(); + readIndex += cigarToAlign.getReadLength(); + cigarToAlign = new Cigar(); + } else { + cigarToAlign.add(ce); + } + } + if( !cigarToAlign.isEmpty() ) { + for( final CigarElement toAdd : cigarToAlign.getCigarElements() ) { + cigarToReturn.add(toAdd); + } + } + return cigarToReturn; + } + + /** + * Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype. + * Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information. + * This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based. + * @param haplotype the candidate haplotype + * @param ref the reference bases to align against + * @param haplotypeList the current list of haplotypes + * @param activeRegionStart the start of the active region in the reference byte array + * @param activeRegionStop the stop of the active region in the reference byte array + * @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists + * @return true if the candidate haplotype was successfully incorporated into the haplotype list + */ + @Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"}) + private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final List haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) { if( haplotype == null ) { return false; } final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); @@ -411,33 +545,21 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) ); - final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true ); + final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true); int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true ); if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) { hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal } byte[] newHaplotypeBases; // extend partial haplotypes to contain the full active region sequence - int leftBreakPoint = 0; - int rightBreakPoint = 0; if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), - haplotype.getBases()), - ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) ); - leftBreakPoint = swConsensus.getAlignmentStart2wrt1() - activeRegionStart; - rightBreakPoint = leftBreakPoint + haplotype.getBases().length; - //newHaplotypeBases = haplotype.getBases(); - //return false; // piece of haplotype isn't anchored within the active region so don't build a haplotype out of it + haplotype.getBases()), + ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) ); } else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { - //return false; newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) ); - //newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), 0, hapStop); - leftBreakPoint = swConsensus.getAlignmentStart2wrt1() - activeRegionStart; } else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { - //return false; newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) ); - //newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length); - rightBreakPoint = haplotype.getBases().length - hapStart; } else { newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop); } @@ -449,8 +571,6 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { if ( haplotype.isArtificialHaplotype() ) { h.setArtificialEvent(haplotype.getArtificialEvent()); } - h.leftBreakPoint = leftBreakPoint; - h.rightBreakPoint = rightBreakPoint; if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments return false; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java new file mode 100644 index 000000000..6a95049d1 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java @@ -0,0 +1,321 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.jgrapht.graph.DefaultDirectedGraph; + +import java.io.PrintStream; +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 2/6/13 + */ + +public class DeBruijnAssemblyGraph extends DefaultDirectedGraph { + + public DeBruijnAssemblyGraph() { + super(DeBruijnEdge.class); + } + + /** + * @param v the vertex to test + * @return true if this vertex is a reference node (meaning that it appears on the reference path in the graph) + */ + public boolean isReferenceNode( final DeBruijnVertex v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } + for( final DeBruijnEdge e : edgesOf(v) ) { + if( e.isRef() ) { return true; } + } + return false; + } + + /** + * @param v the vertex to test + * @return true if this vertex is a source node + */ + public boolean isSource( final DeBruijnVertex v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } + return inDegreeOf(v) == 0; + } + + /** + * Pull out the additional sequence implied by traversing this node in the graph + * @param v the vertex from which to pull out the additional base sequence + * @return non-null byte array + */ + @Ensures({"result != null"}) + public byte[] getAdditionalSequence( final DeBruijnVertex v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to pull sequence from a null vertex."); } + return ( isSource(v) ? v.getSequence() : v.getSuffix() ); + } + + /** + * @param e the edge to test + * @return true if this edge is a reference source edge + */ + public boolean isRefSource( final DeBruijnEdge e ) { + if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); } + for( final DeBruijnEdge edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) { + if( edgeToTest.isRef() ) { return false; } + } + return true; + } + + /** + * @param v the vertex to test + * @return true if this vertex is a reference source + */ + public boolean isRefSource( final DeBruijnVertex v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } + for( final DeBruijnEdge edgeToTest : incomingEdgesOf(v) ) { + if( edgeToTest.isRef() ) { return false; } + } + return true; + } + + /** + * @param e the edge to test + * @return true if this edge is a reference sink edge + */ + public boolean isRefSink( final DeBruijnEdge e ) { + if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); } + for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) { + if( edgeToTest.isRef() ) { return false; } + } + return true; + } + + /** + * @param v the vertex to test + * @return true if this vertex is a reference sink + */ + public boolean isRefSink( final DeBruijnVertex v ) { + if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } + for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(v) ) { + if( edgeToTest.isRef() ) { return false; } + } + return true; + } + + /** + * @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph + */ + public DeBruijnVertex getReferenceSourceVertex( ) { + for( final DeBruijnVertex v : vertexSet() ) { + if( isReferenceNode(v) && isRefSource(v) ) { + return v; + } + } + return null; + } + + /** + * @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the graph + */ + public DeBruijnVertex getReferenceSinkVertex( ) { + for( final DeBruijnVertex v : vertexSet() ) { + if( isReferenceNode(v) && isRefSink(v) ) { + return v; + } + } + return null; + } + + /** + * Traverse the graph and get the next reference vertex if it exists + * @param v the current vertex, can be null + * @return the next reference vertex if it exists + */ + public DeBruijnVertex getNextReferenceVertex( final DeBruijnVertex v ) { + if( v == null ) { return null; } + for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(v) ) { + if( edgeToTest.isRef() ) { + return getEdgeTarget(edgeToTest); + } + } + return null; + } + + /** + * Traverse the graph and get the previous reference vertex if it exists + * @param v the current vertex, can be null + * @return the previous reference vertex if it exists + */ + public DeBruijnVertex getPrevReferenceVertex( final DeBruijnVertex v ) { + if( v == null ) { return null; } + for( final DeBruijnEdge edgeToTest : incomingEdgesOf(v) ) { + if( isReferenceNode(getEdgeSource(edgeToTest)) ) { + return getEdgeSource(edgeToTest); + } + } + return null; + } + + /** + * Does a reference path exist between the two vertices? + * @param fromVertex from this vertex, can be null + * @param toVertex to this vertex, can be null + * @return true if a reference path exists in the graph between the two vertices + */ + public boolean referencePathExists(final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex) { + DeBruijnVertex v = fromVertex; + if( v == null ) { + return false; + } + v = getNextReferenceVertex(v); + if( v == null ) { + return false; + } + while( !v.equals(toVertex) ) { + v = getNextReferenceVertex(v); + if( v == null ) { + return false; + } + } + return true; + } + + /** + * Walk along the reference path in the graph and pull out the corresponding bases + * @param fromVertex starting vertex + * @param toVertex ending vertex + * @param includeStart should the starting vertex be included in the path + * @param includeStop should the ending vertex be included in the path + * @return byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example) + */ + public byte[] getReferenceBytes( final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex, final boolean includeStart, final boolean includeStop ) { + if( fromVertex == null ) { throw new IllegalArgumentException("Starting vertex in requested path cannot be null."); } + if( toVertex == null ) { throw new IllegalArgumentException("From vertex in requested path cannot be null."); } + + byte[] bytes = null; + DeBruijnVertex v = fromVertex; + if( includeStart ) { + bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v)); + } + v = getNextReferenceVertex(v); // advance along the reference path + while( v != null && !v.equals(toVertex) ) { + bytes = ArrayUtils.addAll( bytes, getAdditionalSequence(v) ); + v = getNextReferenceVertex(v); // advance along the reference path + } + if( includeStop && v != null && v.equals(toVertex)) { + bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v)); + } + return bytes; + } + + /** + * Pull kmers out of the given long sequence and throw them on in the graph + * @param sequence byte array holding the sequence with which to build the assembly graph + * @param KMER_LENGTH the desired kmer length to use + * @param isRef if true the kmers added to the graph will have reference edges linking them + */ + public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) { + if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); } + final int kmersInSequence = sequence.length - KMER_LENGTH + 1; + for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef); + } + } + + /** + * Add edge to assembly graph connecting the two kmers + * @param kmer1 the source kmer for the edge + * @param kmer2 the target kmer for the edge + * @param isRef true if the added edge is a reference edge + * @return will return false if trying to add a reference edge which creates a cycle in the assembly graph + */ + public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef ) { + if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } + if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } + if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); } + + final int numVertexBefore = vertexSet().size(); + final DeBruijnVertex v1 = new DeBruijnVertex( kmer1, kmer1.length ); + addVertex(v1); + final DeBruijnVertex v2 = new DeBruijnVertex( kmer2, kmer2.length ); + addVertex(v2); + if( isRef && vertexSet().size() == numVertexBefore ) { return false; } + + final DeBruijnEdge targetEdge = getEdge(v1, v2); + if ( targetEdge == null ) { + addEdge(v1, v2, new DeBruijnEdge( isRef )); + } else { + if( isRef ) { + targetEdge.setIsRef( true ); + } + targetEdge.setMultiplicity(targetEdge.getMultiplicity() + 1); + } + return true; + } + + /** + * Print out the graph in the dot language for visualization + * @param GRAPH_WRITER PrintStream to write to + */ + public void printGraph( final PrintStream GRAPH_WRITER ) { + if( GRAPH_WRITER == null ) { throw new IllegalArgumentException("PrintStream cannot be null."); } + + GRAPH_WRITER.println("digraph assembly {"); + for( final DeBruijnEdge edge : edgeSet() ) { + GRAPH_WRITER.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + "label=\""+ edge.getMultiplicity() +"\"" + "];"); + if( edge.isRef() ) { + GRAPH_WRITER.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); + } + } + for( final DeBruijnVertex v : vertexSet() ) { + final String label = ( inDegreeOf(v) == 0 ? v.toString() : v.getSuffixString() ); + GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + label + "\"]"); + } + GRAPH_WRITER.println("}"); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java index 8d7732a87..28c735b5c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java @@ -95,12 +95,12 @@ public class DeBruijnEdge { } // For use when comparing edges pulled from the same graph - public boolean equals( final DefaultDirectedGraph graph, final DeBruijnEdge edge ) { + public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge ) { return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge))); } // For use when comparing edges across graphs! - public boolean equals( final DefaultDirectedGraph graph, final DeBruijnEdge edge, final DefaultDirectedGraph graph2 ) { + public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge, final DeBruijnAssemblyGraph graph2 ) { return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge))); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java index c6f23359b..1390b0ee9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -83,7 +83,7 @@ public class DeBruijnVertex { } public String getSuffixString() { - return new String( getSuffix() ); + return new String(getSuffix()); } @Ensures("result != null") diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 53dc4f1bd..bef0cd96c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -142,7 +142,6 @@ public class GenotypingEngine { if( DEBUG ) { System.out.println( h.toString() ); System.out.println( "> Cigar = " + h.getCigar() ); - System.out.println( "> Left and right breaks = (" + h.leftBreakPoint + " , " + h.rightBreakPoint + ")"); System.out.println( ">> Events = " + h.getEventMap()); } } @@ -665,7 +664,8 @@ public class GenotypingEngine { if( refPos < 0 ) { return null; } // Protection against SW failures int alignmentPos = 0; - for( final CigarElement ce : cigar.getCigarElements() ) { + for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) { + final CigarElement ce = cigar.getCigarElement(cigarIndex); final int elementLength = ce.getLength(); switch( ce.getOperator() ) { case I: @@ -676,7 +676,7 @@ public class GenotypingEngine { if( BaseUtils.isRegularBase(refByte) ) { insertionAlleles.add( Allele.create(refByte, true) ); } - if( (haplotype.leftBreakPoint != 0 || haplotype.rightBreakPoint != 0) && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) { + if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); } else { byte[] insertionBases = new byte[]{}; @@ -702,20 +702,12 @@ public class GenotypingEngine { final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base final List deletionAlleles = new ArrayList(); final int deletionStart = refLoc.getStart() + refPos - 1; - // BUGBUG: how often does this symbolic deletion allele case happen? - //if( haplotype != null && ( (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength) - // || (haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength) ) ) { - // deletionAlleles.add( Allele.create(ref[refPos-1], true) ); - // deletionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); - // vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart, deletionAlleles).make()); - //} else { final byte refByte = ref[refPos-1]; if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { deletionAlleles.add( Allele.create(deletionBases, true) ); deletionAlleles.add( Allele.create(refByte, false) ); vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); } - //} refPos += elementLength; break; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 1dfec494a..30749a820 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -50,7 +50,6 @@ import com.google.java.contract.Ensures; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -133,8 +132,8 @@ import java.util.*; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.LOCUS) @BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) -@ActiveRegionTraversalParameters(extension=65, maxRegion=300) -@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=20) +@ActiveRegionTraversalParameters(extension=85, maxRegion=300) +@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=30) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { /** @@ -270,7 +269,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem private CachingIndexedFastaSequenceFile referenceReader; // reference base padding size - private static final int REFERENCE_PADDING = 400; + private static final int REFERENCE_PADDING = 500; // bases with quality less than or equal to this value are trimmed off the tails of the reads private static final byte MIN_TAIL_QUALITY = 20; @@ -350,7 +349,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter, minKmer ); + assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); @@ -475,17 +474,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! - finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails - - // note this operation must be performed before we clip the reads down, as this must correspond to the full reference region - final GenomeLoc fullSpanBeforeClipping = getPaddedLoc(activeRegion); + finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region - final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING); - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, fullSpanBeforeClipping, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); + final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); + final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); + + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! - activeRegion.hardClipToActiveRegion(); // only evaluate the parts of reads that are overlapping the active region final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! @@ -506,7 +503,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem stratifiedReadMap, perSampleFilteredReadList, fullReferenceWithPadding, - fullSpanBeforeClipping, + paddedReferenceLoc, activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { @@ -518,7 +515,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if ( bamWriter != null ) { // write the haplotypes to the bam for ( Haplotype haplotype : haplotypes ) - writeHaplotype(haplotype, fullSpanBeforeClipping, bestHaplotypes.contains(haplotype)); + writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype)); // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); @@ -530,7 +527,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), fullSpanBeforeClipping.getStart()); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); } } } @@ -584,7 +581,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for( final GATKSAMRecord myRead : finalizedReadList ) { final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { - final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { readsToUse.add(clippedRead); } @@ -605,9 +603,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } private GenomeLoc getPaddedLoc( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { - final int padLeft = Math.max(activeRegion.getReadSpanLoc().getStart()-REFERENCE_PADDING, 1); - final int padRight = Math.min(activeRegion.getReadSpanLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReadSpanLoc().getContig()).getSequenceLength()); - return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReadSpanLoc().getContig(), padLeft, padRight); + final int padLeft = Math.max(activeRegion.getExtendedLoc().getStart()-REFERENCE_PADDING, 1); + final int padRight = Math.min(activeRegion.getExtendedLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getExtendedLoc().getContig()).getSequenceLength()); + return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight); } private Map> splitReadsBySample( final List reads ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java index 49e926e32..90c2e6a2a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java @@ -52,10 +52,13 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.jgrapht.graph.DefaultDirectedGraph; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.Serializable; import java.util.*; @@ -88,15 +91,17 @@ public class KBestPaths { private final int totalScore; // the graph from which this path originated - private final DefaultDirectedGraph graph; + private final DeBruijnAssemblyGraph graph; // used in the bubble state machine to apply Smith-Waterman to the bubble sequence - private final double SW_MATCH = 15.0; - private final double SW_MISMATCH = -15.0; - private final double SW_GAP = -25.0; - private final double SW_GAP_EXTEND = -1.2; + // these values were chosen via optimization against the NA12878 knowledge base + private static final double SW_MATCH = 20.0; + private static final double SW_MISMATCH = -15.0; + private static final double SW_GAP = -26.0; + private static final double SW_GAP_EXTEND = -1.1; + private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); - public Path( final DeBruijnVertex initialVertex, final DefaultDirectedGraph graph ) { + public Path( final DeBruijnVertex initialVertex, final DeBruijnAssemblyGraph graph ) { lastVertex = initialVertex; edges = new ArrayList(0); totalScore = 0; @@ -119,6 +124,8 @@ public class KBestPaths { * @return true if the edge is found in this path */ public boolean containsEdge( final DeBruijnEdge edge ) { + if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } + for( final DeBruijnEdge e : edges ) { if( e.equals(graph, edge) ) { return true; @@ -128,7 +135,14 @@ public class KBestPaths { return false; } - public int numInPath( final DefaultDirectedGraph graph, final DeBruijnEdge edge ) { + /** + * Calculate the number of times this edge appears in the path + * @param edge the given edge to test + * @return number of times this edge appears in the path + */ + public int numInPath( final DeBruijnEdge edge ) { + if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } + int numInPath = 0; for( final DeBruijnEdge e : edges ) { if( e.equals(graph, edge) ) { @@ -139,13 +153,17 @@ public class KBestPaths { return numInPath; } - + /** + * Does this path contain a reference edge? + * @return true if the path contains a reference edge + */ public boolean containsRefEdge() { for( final DeBruijnEdge e : edges ) { if( e.isRef() ) { return true; } } return false; } + public List getEdges() { return edges; } public int getScore() { return totalScore; } @@ -153,41 +171,31 @@ public class KBestPaths { public DeBruijnVertex getLastVertexInPath() { return lastVertex; } /** - * The base sequence for this path. Pull the full sequence for the source of the path and then the suffix for all subsequent nodes + * The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes * @return non-null sequence of bases corresponding to this path */ @Ensures({"result != null"}) public byte[] getBases() { - if( edges.size() == 0 ) { return lastVertex.getSequence(); } + if( edges.size() == 0 ) { return graph.getAdditionalSequence(lastVertex); } - byte[] bases = graph.getEdgeSource( edges.get(0) ).getSequence(); + byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edges.get(0))); for( final DeBruijnEdge e : edges ) { - bases = ArrayUtils.addAll(bases, graph.getEdgeTarget( e ).getSuffix()); + bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); } return bases; } - /** - * Pull the added base sequence implied by visiting this node in a path - * @param graph the graph from which the vertex originated - * @param v the vertex whose sequence to grab - * @return non-null sequence of bases corresponding to this node in the graph - */ - @Ensures({"result != null"}) - public byte[] getAdditionalSequence( final DefaultDirectedGraph graph, final DeBruijnVertex v ) { - return ( edges.size()==0 || graph.getEdgeSource(edges.get(0)).equals(v) ? v.getSequence() : v.getSuffix() ); - } - /** * Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble + * @return non-null Cigar string with reference length equal to the refHaplotype's reference length */ @Ensures("result != null") public Cigar calculateCigar() { final Cigar cigar = new Cigar(); // special case for paths that start on reference but not at the reference source node - if( edges.get(0).isRef() && !isRefSource(graph, edges.get(0)) ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(graph, null, null, graph.getEdgeSource(edges.get(0))).getCigarElements() ) { + if( edges.get(0).isRef() && !graph.isRefSource(edges.get(0)) ) { + for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edges.get(0))).getCigarElements() ) { cigar.add(ce); } } @@ -197,18 +205,18 @@ public class KBestPaths { for( final DeBruijnEdge e : edges ) { if( e.equals(graph, edges.get(0)) ) { - advanceBubbleStateMachine( bsm, graph, graph.getEdgeSource(e), null ); + advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null ); } - advanceBubbleStateMachine( bsm, graph, graph.getEdgeTarget(e), e ); + advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e ); } // special case for paths that don't end on reference if( bsm.inBubble ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(graph, bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) { + for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) { bsm.cigar.add(ce); } - } else if( edges.get(edges.size()-1).isRef() && !isRefSink(graph, edges.get(edges.size()-1)) ) { // special case for paths that end of the reference but haven't completed the entire reference circuit - for( final CigarElement ce : calculateCigarForCompleteBubble(graph, bsm.bubbleBytes, graph.getEdgeTarget(edges.get(edges.size()-1)), null).getCigarElements() ) { + } else if( edges.get(edges.size()-1).isRef() && !graph.isRefSink(edges.get(edges.size()-1)) ) { // special case for paths that end of the reference but haven't completed the entire reference circuit + for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edges.get(edges.size()-1)), null).getCigarElements() ) { bsm.cigar.add(ce); } } @@ -216,59 +224,72 @@ public class KBestPaths { return AlignmentUtils.consolidateCigar(bsm.cigar); } + /** + * Advance the bubble state machine by incorporating the next node in the path. + * @param bsm the current bubble state machine + * @param node the node to be incorporated + * @param e the edge which generated this node in the path + */ @Requires({"bsm != null", "graph != null", "node != null"}) - private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final DefaultDirectedGraph graph, final DeBruijnVertex node, final DeBruijnEdge e ) { - if( isReferenceNode( graph, node ) ) { + private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final DeBruijnVertex node, final DeBruijnEdge e ) { + if( graph.isReferenceNode( node ) ) { if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else if( e !=null && !e.isRef() ) { - if( referencePathExists( graph, graph.getEdgeSource(e), node) ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(graph, null, graph.getEdgeSource(e), node).getCigarElements() ) { + if( graph.referencePathExists( graph.getEdgeSource(e), node) ) { + for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) { bsm.cigar.add(ce); } - bsm.cigar.add( new CigarElement( getAdditionalSequence(graph, node).length, CigarOperator.M) ); + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); } else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself - bsm.cigar.add( new CigarElement( getAdditionalSequence(graph, node).length, CigarOperator.I) ); + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) ); } else { bsm.inBubble = true; bsm.bubbleBytes = null; bsm.lastSeenReferenceNode = graph.getEdgeSource(e); - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, getAdditionalSequence(graph, node) ); + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); } } else { - bsm.cigar.add( new CigarElement( getAdditionalSequence(graph, node).length, CigarOperator.M) ); + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); } - } else if( bsm.lastSeenReferenceNode != null && !referencePathExists( graph, bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, getAdditionalSequence(graph, node) ); + } else if( bsm.lastSeenReferenceNode != null && !graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); } else { // close the bubble and use a local SW to determine the Cigar string - for( final CigarElement ce : calculateCigarForCompleteBubble(graph, bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) { + for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) { bsm.cigar.add(ce); } bsm.inBubble = false; bsm.bubbleBytes = null; bsm.lastSeenReferenceNode = null; - bsm.cigar.add( new CigarElement( getAdditionalSequence(graph, node).length, CigarOperator.M) ); + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); } } else { // non-ref vertex if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, getAdditionalSequence(graph, node) ); + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); } else { // open up a bubble bsm.inBubble = true; bsm.bubbleBytes = null; bsm.lastSeenReferenceNode = (e != null ? graph.getEdgeSource(e) : null ); - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, getAdditionalSequence(graph, node) ); + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); } } } + /** + * Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble + * @param bubbleBytes the bytes that comprise the alternate allele path in this bubble + * @param fromVertex the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex) + * @param toVertex the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex) + * @return the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble + */ @Requires({"graph != null"}) - @Ensures({"result != null", "result.getReadLength() == bubbleBytes.length"}) - private Cigar calculateCigarForCompleteBubble( final DefaultDirectedGraph graph, final byte[] bubbleBytes, final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex ) { - final byte[] refBytes = getReferenceBytes(this, graph, fromVertex, toVertex); + @Ensures({"result != null"}) + private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex ) { + final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null); - final Cigar cigar = new Cigar(); + final Cigar returnCigar = new Cigar(); // add padding to anchor ref/alt bases in the SW matrix - byte[] padding = "XXXXXX".getBytes(); + byte[] padding = STARTING_SW_ANCHOR_BYTES; boolean goodAlignment = false; SWPairwiseAlignment swConsensus = null; while( !goodAlignment && padding.length < 1000 ) { @@ -280,27 +301,48 @@ public class KBestPaths { goodAlignment = true; } } - if( !goodAlignment && swConsensus != null ) { - throw new ReviewedStingException("SmithWaterman offset failure: " + (refBytes == null ? "-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar()); + if( !goodAlignment ) { + returnCigar.add(new CigarElement(1, CigarOperator.N)); + return returnCigar; } - if( swConsensus != null ) { - final Cigar swCigar = swConsensus.getCigar(); + final Cigar swCigar = swConsensus.getCigar(); + if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference + returnCigar.add(new CigarElement(1, CigarOperator.N)); + } else { + int skipElement = -1; + if( fromVertex == null ) { + for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { + final CigarElement ce = swCigar.getCigarElement(iii); + if( ce.getOperator().equals(CigarOperator.D) ) { + skipElement = iii; + break; + } + } + } else if (toVertex == null ) { + for( int iii = swCigar.numCigarElements() - 1; iii >= 0; iii-- ) { + final CigarElement ce = swCigar.getCigarElement(iii); + if( ce.getOperator().equals(CigarOperator.D) ) { + skipElement = iii; + break; + } + } + } for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { // now we need to remove the padding from the cigar string int length = swCigar.getCigarElement(iii).getLength(); if( iii == 0 ) { length -= padding.length; } if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; } if( length > 0 ) { - cigar.add( new CigarElement(length, swCigar.getCigarElement(iii).getOperator()) ); + returnCigar.add(new CigarElement(length, (skipElement == iii ? CigarOperator.X : swCigar.getCigarElement(iii).getOperator()))); } } - if( (refBytes == null && cigar.getReferenceLength() != 0) || ( refBytes != null && cigar.getReferenceLength() != refBytes.length ) ) { - throw new ReviewedStingException("SmithWaterman cigar failure: " + (refBytes == null ? "-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar()); + if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) { + throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? "-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar()); } } - return cigar; + return returnCigar; } // class to keep track of the bubble state machine @@ -326,8 +368,18 @@ public class KBestPaths { } } - public static List getKBestPaths( final DefaultDirectedGraph graph, final int k ) { - if( k > MAX_PATHS_TO_HOLD/2 ) { throw new ReviewedStingException("Asked for more paths than MAX_PATHS_TO_HOLD!"); } + /** + * Traverse the graph and pull out the best k paths. + * Paths are scored via their comparator function. The default being PathComparatorTotalScore() + * @param graph the graph from which to pull paths + * @param k the number of paths to find + * @return a list with at most k top-scoring paths from the graph + */ + @Ensures({"result != null", "result.size() <= k"}) + public static List getKBestPaths( final DeBruijnAssemblyGraph graph, final int k ) { + if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); } + if( k > MAX_PATHS_TO_HOLD/2 ) { throw new IllegalArgumentException("Asked for more paths than internal parameters allow for."); } + final ArrayList bestPaths = new ArrayList(); // run a DFS for best paths @@ -350,12 +402,14 @@ public class KBestPaths { // did we hit the end of a path? if ( allOutgoingEdgesHaveBeenVisited(path) ) { - if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) { - // clean out some low scoring paths - Collections.sort(bestPaths, new PathComparatorTotalScore() ); - for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20 + if( path.containsRefEdge() ) { + if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) { + // clean out some low scoring paths + Collections.sort(bestPaths, new PathComparatorTotalScore() ); + for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20 + } + bestPaths.add(path); } - bestPaths.add(path); } else if( n.val > 10000) { // do nothing, just return } else { @@ -376,227 +430,16 @@ public class KBestPaths { } } + /** + * @param path the path to test + * @return true if all the outgoing edges at the end of this path have already been visited + */ private static boolean allOutgoingEdgesHaveBeenVisited( final Path path ) { for( final DeBruijnEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) { - if( !path.containsEdge(edge) ) { + if( !path.containsEdge(edge) ) { // TODO -- investigate allowing numInPath < 2 to allow cycles return false; } } return true; } - - /**************************************************************** - * Collection of graph functions used by KBestPaths * - ***************************************************************/ - - /** - * Test if the vertex is on a reference path in the graph. If so it is referred to as a reference node - * @param graph the graph from which the vertex originated - * @param v the vertex to test - * @return true if the vertex is on the reference path - */ - public static boolean isReferenceNode( final DefaultDirectedGraph graph, final DeBruijnVertex v ) { - for( final DeBruijnEdge e : graph.edgesOf(v) ) { - if( e.isRef() ) { return true; } - } - return false; - } - - /** - * Is this edge a source edge (the source vertex of the edge is a source node in the graph) - * @param graph the graph from which the edge originated - * @param e the edge to test - * @return true if the source vertex of the edge is a source node in the graph - */ - public static boolean isSource( final DefaultDirectedGraph graph, final DeBruijnEdge e ) { - return graph.inDegreeOf(graph.getEdgeSource(e)) == 0; - } - - /** - * Is this vertex a source vertex - * @param graph the graph from which the vertex originated - * @param v the vertex to test - * @return true if the vertex is a source vertex - */ - public static boolean isSource( final DefaultDirectedGraph graph, final DeBruijnVertex v ) { - return graph.inDegreeOf(v) == 0; - } - - /** - * Is this edge both a reference edge and a source edge for the reference path - * @param graph the graph from which the edge originated - * @param e the edge to test - * @return true if the edge is both a reference edge and a reference path source edge - */ - public static boolean isRefSource( final DefaultDirectedGraph graph, final DeBruijnEdge e ) { - for( final DeBruijnEdge edgeToTest : graph.incomingEdgesOf(graph.getEdgeSource(e)) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - - /** - * Is this vertex both a reference node and a source node for the reference path - * @param graph the graph from which the vertex originated - * @param v the vertex to test - * @return true if the vertex is both a reference node and a reference path source node - */ - public static boolean isRefSource( final DefaultDirectedGraph graph, final DeBruijnVertex v ) { - for( final DeBruijnEdge edgeToTest : graph.incomingEdgesOf(v) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - - /** - * Is this edge both a reference edge and a sink edge for the reference path - * @param graph the graph from which the edge originated - * @param e the edge to test - * @return true if the edge is both a reference edge and a reference path sink edge - */ - public static boolean isRefSink( final DefaultDirectedGraph graph, final DeBruijnEdge e ) { - for( final DeBruijnEdge edgeToTest : graph.outgoingEdgesOf(graph.getEdgeTarget(e)) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - - /** - * Is this vertex both a reference node and a sink node for the reference path - * @param graph the graph from which the node originated - * @param v the node to test - * @return true if the vertex is both a reference node and a reference path sink node - */ - public static boolean isRefSink( final DefaultDirectedGraph graph, final DeBruijnVertex v ) { - for( final DeBruijnEdge edgeToTest : graph.outgoingEdgesOf(v) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - - public static DeBruijnEdge getReferenceSourceEdge( final DefaultDirectedGraph graph ) { - for( final DeBruijnEdge e : graph.edgeSet() ) { - if( e.isRef() && isRefSource(graph, e) ) { - return e; - } - } - throw new ReviewedStingException("All reference graphs should have a source node"); - } - - public static DeBruijnVertex getReferenceSourceVertex( final DefaultDirectedGraph graph ) { - for( final DeBruijnVertex v : graph.vertexSet() ) { - if( isReferenceNode(graph, v) && isRefSource(graph, v) ) { - return v; - } - } - return null; - } - - public static DeBruijnEdge getReferenceSinkEdge( final DefaultDirectedGraph graph ) { - for( final DeBruijnEdge e : graph.edgeSet() ) { - if( e.isRef() && isRefSink(graph, e) ) { - return e; - } - } - throw new ReviewedStingException("All reference graphs should have a sink node"); - } - - public static DeBruijnVertex getReferenceSinkVertex( final DefaultDirectedGraph graph ) { - for( final DeBruijnVertex v : graph.vertexSet() ) { - if( isReferenceNode(graph, v) && isRefSink(graph, v) ) { - return v; - } - } - throw new ReviewedStingException("All reference graphs should have a sink node"); - } - - public static DeBruijnEdge getNextReferenceEdge( final DefaultDirectedGraph graph, final DeBruijnEdge e ) { - if( e == null ) { return null; } - for( final DeBruijnEdge edgeToTest : graph.outgoingEdgesOf(graph.getEdgeTarget(e)) ) { - if( edgeToTest.isRef() ) { - return edgeToTest; - } - } - return null; - } - - public static DeBruijnVertex getNextReferenceVertex( final DefaultDirectedGraph graph, final DeBruijnVertex v ) { - if( v == null ) { return null; } - for( final DeBruijnEdge edgeToTest : graph.outgoingEdgesOf(v) ) { - if( edgeToTest.isRef() ) { - return graph.getEdgeTarget(edgeToTest); - } - } - return null; - } - - public static DeBruijnEdge getPrevReferenceEdge( final DefaultDirectedGraph graph, final DeBruijnEdge e ) { - for( final DeBruijnEdge edgeToTest : graph.incomingEdgesOf(graph.getEdgeSource(e)) ) { - if( edgeToTest.isRef() ) { - return edgeToTest; - } - } - return null; - } - - public static DeBruijnVertex getPrevReferenceVertex( final DefaultDirectedGraph graph, final DeBruijnVertex v ) { - for( final DeBruijnEdge edgeToTest : graph.incomingEdgesOf(v) ) { - if( isReferenceNode(graph, graph.getEdgeSource(edgeToTest)) ) { - return graph.getEdgeSource(edgeToTest); - } - } - return null; - } - - public static boolean referencePathExists(final DefaultDirectedGraph graph, final DeBruijnEdge fromEdge, final DeBruijnEdge toEdge) { - DeBruijnEdge e = fromEdge; - if( e == null ) { - return false; - } - while( !e.equals(graph, toEdge) ) { - e = getNextReferenceEdge(graph, e); - if( e == null ) { - return false; - } - } - return true; - } - - public static boolean referencePathExists(final DefaultDirectedGraph graph, final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex) { - DeBruijnVertex v = fromVertex; - if( v == null ) { - return false; - } - v = getNextReferenceVertex(graph, v); - if( v == null ) { - return false; - } - while( !v.equals(toVertex) ) { - v = getNextReferenceVertex(graph, v); - if( v == null ) { - return false; - } - } - return true; - } - - // fromVertex (exclusive) -> toVertex (exclusive) - public static byte[] getReferenceBytes( final Path path, final DefaultDirectedGraph graph, final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex ) { - byte[] bytes = null; - if( fromVertex != null && toVertex != null && !referencePathExists(graph, fromVertex, toVertex) ) { - throw new ReviewedStingException("Asked for a reference path which doesn't exist. " + fromVertex + " --> " + toVertex); - } - DeBruijnVertex v = fromVertex; - if( v == null ) { - v = getReferenceSourceVertex(graph); - bytes = ArrayUtils.addAll( bytes, path.getAdditionalSequence(graph, v) ); - } - v = getNextReferenceVertex(graph, v); - while( (toVertex != null && !v.equals(toVertex)) || (toVertex == null && v != null) ) { - bytes = ArrayUtils.addAll( bytes, path.getAdditionalSequence(graph, v) ); - // advance along the reference path - v = getNextReferenceVertex(graph, v); - } - return bytes; - } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java index d42cf5f8e..794ee8dee 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java @@ -255,7 +255,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { final String baseCommand = "-T HaplotypeCaller -R " + b36KGReference + " --no_cmdline_in_header --dbsnp " + b36dbSNP129; WalkerTestSpec spec = new WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contamination 0.20", 1, - Arrays.asList("1b2d71f72b49e36325a3cb7aeab37270")); + Arrays.asList("3a66513cdfef46f315d5ada8a104822f")); executeTest("HC calling with contamination_percentage_to_filter 0.20", spec); } @@ -283,17 +283,17 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { @Test public void testHCFlatContaminationCase1() { - testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "a55335e075b4ebaea31f54b88a96e829"); + testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "0b9d6aabd5ab448f0a2d32f24ff64840"); } @Test public void testHCFlatContaminationCase2() { - testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "5b0c3dfd6885dd0b0dfc4d979e1bef67"); + testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "a4ef4a6ce557a6b9666e234fad5c7c80"); } @Test public void testHCFlatContaminationCase3() { - testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "68c23ceccd4d10fccd1b59432b374c5c"); + testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "bacc98eb2baa5bb1777da24cf0f84913"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java similarity index 73% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index 2489f5f0f..f4a6d5494 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -52,20 +52,21 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; * Date: 3/27/12 */ +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.walkers.genotyper.ArtificialReadPileupTestProvider; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.jgrapht.graph.DefaultDirectedGraph; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.PrintStream; import java.util.*; -public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { +public class DeBruijnAssemblerUnitTest extends BaseTest { private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { @@ -78,16 +79,16 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { KMER_LENGTH = kmer; } - public DefaultDirectedGraph expectedGraph() { + public DeBruijnAssemblyGraph expectedGraph() { DeBruijnVertex v = new DeBruijnVertex(sequence, KMER_LENGTH); - DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); + DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); graph.addVertex(v); return graph; } - public DefaultDirectedGraph calcGraph() { + public DeBruijnAssemblyGraph calcGraph() { - DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); + DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); final int kmersInSequence = sequence.length - KMER_LENGTH + 1; for (int i = 0; i < kmersInSequence - 1; i++) { // get the kmers @@ -96,9 +97,9 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { final byte[] kmer2 = new byte[KMER_LENGTH]; System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH); - SimpleDeBruijnAssembler.addKmersToGraph(graph, kmer1, kmer2, false); + graph.addKmersToGraph(kmer1, kmer2, false); } - SimpleDeBruijnAssembler.mergeNodes(graph); + DeBruijnAssembler.mergeNodes(graph); return graph; } } @@ -125,8 +126,8 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { @Test(enabled = true) public void testPruneGraph() { - DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); - DefaultDirectedGraph expectedGraph = new DefaultDirectedGraph(DeBruijnEdge.class); + DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); + DeBruijnAssemblyGraph expectedGraph = new DeBruijnAssemblyGraph(); DeBruijnVertex v = new DeBruijnVertex("ATGG".getBytes(), 1); DeBruijnVertex v2 = new DeBruijnVertex("ATGGA".getBytes(), 1); @@ -155,12 +156,12 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5)); expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3)); - SimpleDeBruijnAssembler.pruneGraph(graph, 2); + DeBruijnAssembler.pruneGraph(graph, 2); Assert.assertTrue(graphEquals(graph, expectedGraph)); - graph = new DefaultDirectedGraph(DeBruijnEdge.class); - expectedGraph = new DefaultDirectedGraph(DeBruijnEdge.class); + graph = new DeBruijnAssemblyGraph(); + expectedGraph = new DeBruijnAssemblyGraph(); graph.addVertex(v); graph.addVertex(v2); @@ -183,103 +184,12 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5)); expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3)); - SimpleDeBruijnAssembler.pruneGraph(graph, 2); + DeBruijnAssembler.pruneGraph(graph, 2); Assert.assertTrue(graphEquals(graph, expectedGraph)); } - @Test(enabled = true) - public void testEliminateNonRefPaths() { - DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); - DefaultDirectedGraph expectedGraph = new DefaultDirectedGraph(DeBruijnEdge.class); - - DeBruijnVertex v = new DeBruijnVertex("ATGG".getBytes(), 1); - DeBruijnVertex v2 = new DeBruijnVertex("ATGGA".getBytes(), 1); - DeBruijnVertex v3 = new DeBruijnVertex("ATGGT".getBytes(), 1); - DeBruijnVertex v4 = new DeBruijnVertex("ATGGG".getBytes(), 1); - DeBruijnVertex v5 = new DeBruijnVertex("ATGGC".getBytes(), 1); - DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC".getBytes(), 1); - - graph.addVertex(v); - graph.addVertex(v2); - graph.addVertex(v3); - graph.addVertex(v4); - graph.addVertex(v5); - graph.addVertex(v6); - graph.addEdge(v, v2, new DeBruijnEdge(false)); - graph.addEdge(v2, v3, new DeBruijnEdge(true)); - graph.addEdge(v3, v4, new DeBruijnEdge(true)); - graph.addEdge(v4, v5, new DeBruijnEdge(true)); - graph.addEdge(v5, v6, new DeBruijnEdge(false)); - - expectedGraph.addVertex(v2); - expectedGraph.addVertex(v3); - expectedGraph.addVertex(v4); - expectedGraph.addVertex(v5); - expectedGraph.addEdge(v2, v3, new DeBruijnEdge()); - expectedGraph.addEdge(v3, v4, new DeBruijnEdge()); - expectedGraph.addEdge(v4, v5, new DeBruijnEdge()); - - SimpleDeBruijnAssembler.eliminateNonRefPaths(graph); - - Assert.assertTrue(graphEquals(graph, expectedGraph)); - - - - - graph = new DefaultDirectedGraph(DeBruijnEdge.class); - expectedGraph = new DefaultDirectedGraph(DeBruijnEdge.class); - - graph.addVertex(v); - graph.addVertex(v2); - graph.addVertex(v3); - graph.addVertex(v4); - graph.addVertex(v5); - graph.addVertex(v6); - graph.addEdge(v, v2, new DeBruijnEdge(true)); - graph.addEdge(v2, v3, new DeBruijnEdge(true)); - graph.addEdge(v4, v5, new DeBruijnEdge(false)); - graph.addEdge(v5, v6, new DeBruijnEdge(false)); - - expectedGraph.addVertex(v); - expectedGraph.addVertex(v2); - expectedGraph.addVertex(v3); - expectedGraph.addEdge(v, v2, new DeBruijnEdge()); - expectedGraph.addEdge(v2, v3, new DeBruijnEdge()); - - SimpleDeBruijnAssembler.eliminateNonRefPaths(graph); - - Assert.assertTrue(graphEquals(graph, expectedGraph)); - - - - graph = new DefaultDirectedGraph(DeBruijnEdge.class); - expectedGraph = new DefaultDirectedGraph(DeBruijnEdge.class); - - graph.addVertex(v); - graph.addVertex(v2); - graph.addVertex(v3); - graph.addVertex(v4); - graph.addVertex(v5); - graph.addVertex(v6); - graph.addEdge(v, v2, new DeBruijnEdge(true)); - graph.addEdge(v2, v3, new DeBruijnEdge(true)); - graph.addEdge(v4, v5, new DeBruijnEdge(false)); - graph.addEdge(v5, v6, new DeBruijnEdge(false)); - graph.addEdge(v4, v2, new DeBruijnEdge(false)); - - expectedGraph.addVertex(v); - expectedGraph.addVertex(v2); - expectedGraph.addVertex(v3); - expectedGraph.addEdge(v, v2, new DeBruijnEdge()); - expectedGraph.addEdge(v2, v3, new DeBruijnEdge()); - - SimpleDeBruijnAssembler.eliminateNonRefPaths(graph); - - Assert.assertTrue(graphEquals(graph, expectedGraph)); - } - - private boolean graphEquals(DefaultDirectedGraph g1, DefaultDirectedGraph g2) { + private boolean graphEquals(DeBruijnAssemblyGraph g1, DeBruijnAssemblyGraph g2) { if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) { return false; } @@ -304,10 +214,53 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest { public void testReferenceCycleGraph() { String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC"; String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC"; - final DefaultDirectedGraph g1 = SimpleDeBruijnAssembler.createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), false); - final DefaultDirectedGraph g2 = SimpleDeBruijnAssembler.createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), false); + final DeBruijnAssemblyGraph g1 = DeBruijnAssembler.createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), false); + final DeBruijnAssemblyGraph g2 = DeBruijnAssembler.createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), false); Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation."); Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation."); } + + @Test(enabled = true) + public void testLeftAlignCigarSequentially() { + String preRefString = "GATCGATCGATC"; + String postRefString = "TTT"; + String refString = "ATCGAGGAGAGCGCCCCG"; + String indelString1 = "X"; + String indelString2 = "YZ"; + int refIndel1 = 10; + int refIndel2 = 12; + + for ( final int indelSize1 : Arrays.asList(1, 2, 3, 4) ) { + for ( final int indelOp1 : Arrays.asList(1, -1) ) { + for ( final int indelSize2 : Arrays.asList(1, 2, 3, 4) ) { + for ( final int indelOp2 : Arrays.asList(1, -1) ) { + + Cigar expectedCigar = new Cigar(); + expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); + expectedCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D))); + expectedCigar.add(new CigarElement((indelOp1 < 0 ? refIndel1 - indelSize1 : refIndel1), CigarOperator.M)); + expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); + expectedCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? CigarOperator.I : CigarOperator.D))); + expectedCigar.add(new CigarElement((indelOp2 < 0 ? (refIndel2 - indelSize2) * 2 : refIndel2 * 2), CigarOperator.M)); + expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); + + Cigar givenCigar = new Cigar(); + givenCigar.add(new CigarElement(refString.length() + refIndel1/2, CigarOperator.M)); + givenCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D))); + givenCigar.add(new CigarElement((indelOp1 < 0 ? (refIndel1/2 - indelSize1) : refIndel1/2) + refString.length() + refIndel2/2 * 2, CigarOperator.M)); + givenCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? CigarOperator.I : CigarOperator.D))); + givenCigar.add(new CigarElement((indelOp2 < 0 ? (refIndel2/2 - indelSize2) * 2 : refIndel2/2 * 2) + refString.length(), CigarOperator.M)); + + String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString; + String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString; + + Cigar calculatedCigar = DeBruijnAssembler.leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); + Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!"); + } + } + } + } + } + } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java new file mode 100644 index 000000000..5a1497236 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java @@ -0,0 +1,123 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 2/8/13 + */ + +public class DeBruijnAssemblyGraphUnitTest { + private class GetReferenceBytesTestProvider extends BaseTest.TestDataProvider { + public byte[] refSequence; + public byte[] altSequence; + public int KMER_LENGTH; + + public GetReferenceBytesTestProvider(String ref, String alt, int kmer) { + super(GetReferenceBytesTestProvider.class, String.format("Testing reference bytes. kmer = %d, ref = %s, alt = %s", kmer, ref, alt)); + refSequence = ref.getBytes(); + altSequence = alt.getBytes(); + KMER_LENGTH = kmer; + } + + public byte[] expectedReferenceBytes() { + return refSequence; + } + + public byte[] calculatedReferenceBytes() { + DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); + graph.addSequenceToGraph(refSequence, KMER_LENGTH, true); + if( altSequence.length > 0 ) { + graph.addSequenceToGraph(altSequence, KMER_LENGTH, false); + } + return graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true); + } + } + + @DataProvider(name = "GetReferenceBytesTestProvider") + public Object[][] GetReferenceBytesTests() { + new GetReferenceBytesTestProvider("GGTTAACC", "", 3); + new GetReferenceBytesTestProvider("GGTTAACC", "", 4); + new GetReferenceBytesTestProvider("GGTTAACC", "", 5); + new GetReferenceBytesTestProvider("GGTTAACC", "", 6); + new GetReferenceBytesTestProvider("GGTTAACC", "", 7); + new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "", 6); + new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "", 66); + new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "", 76); + + new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 3); + new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 4); + new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 5); + new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 6); + new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 7); + new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6); + new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66); + new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76); + + new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 3); + new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 4); + new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 5); + new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 6); + new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 7); + new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "AAAAAAAAAAAAA", 6); + new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 66); + new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 76); + + return GetReferenceBytesTestProvider.getTests(GetReferenceBytesTestProvider.class); + } + + @Test(dataProvider = "GetReferenceBytesTestProvider", enabled = true) + public void testGetReferenceBytes(GetReferenceBytesTestProvider cfg) { + Assert.assertEquals(cfg.calculatedReferenceBytes(), cfg.expectedReferenceBytes(), "Reference sequences do not match"); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 7676ab3e5..489cab95a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "1e49fd927d79594a993ea6c4a1d10004"); + HCTest(CEUTRIO_BAM, "", "ecf563b63ca3f640d9cfcc548e8ad776"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "1b39ac32c9cbba26ed60c6b06be81359"); + HCTest(NA12878_BAM, "", "874389182141f41879abea7cb350c9d4"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "f751363288740c6fd9179a487be61fb4"); + "4aa3d0d0a859c0fc0533f29529cc3d95"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -96,13 +96,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "719402122fe92cfe7a3fa6b7cdb66f26"); + "1d9cd5017e420d5862b7b94e6cb5de3b"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "71ef8d0217c1a73dd360413dccd05f4d"); + "cfd717dd79ace99a266e8bb58d6cc7a6"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "88ae6e7b34514043bfc78b1ecf29a341"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "58b484324f0ea00aaac25fb7711ad657"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -122,9 +122,10 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); } + // TODO -- need a better symbolic allele test @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "855827f901b63b41dcd37dd49dd3a1ac"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "f893aa7afef71705df7f040b22440a2d"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -135,25 +136,24 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c0ac5a1f75c66052b19684eb37c088cb"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "0e8a3a31b8fe5f097d6975aee8b67cdc"); } - // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper + // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper // was modifying the GATKSamRecord and that was screwing up the traversal engine from map call to // map call. So the test is there for consistency but not for correctness. I'm not sure we can trust - // any of the calls in that region because it is so messy. The only thing I would maybe be worried about is - // that the three calls that are missing happen to all be the left most calls in the region + // any of the calls in that region because it is so messy. @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("598d245498c0d0b55e263f0a061a77e3")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("2acd853da3a0380650de6827b7c790ac")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a4e74226b16a7d8c5999620c2f6be1ba")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("061a95cab149723866ce7c797ba6bdd4")); executeTest("HCTestStructuralIndels: ", spec); } @@ -175,7 +175,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("9f0bb0b97857c66937de39670e195d00")); + Arrays.asList("2ab038f4f6c262b3245b6fa549659c5e")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("87bd7ac2f7d65580838c7c956ccf52b7")); + Arrays.asList("56fc9110974bfa9c9fe196b0d4af4e64")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java index a39ca23e3..53400b790 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java @@ -82,7 +82,7 @@ public class KBestPathsUnitTest { @Test(dataProvider = "BasicBubbleDataProvider") public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { // Construct the assembly graph - DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); + DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); final int KMER_LENGTH = 3; final String preRef = "ATGG"; final String postRef = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "GGGGC"; @@ -142,7 +142,7 @@ public class KBestPathsUnitTest { @Test(dataProvider = "TripleBubbleDataProvider") public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) { // Construct the assembly graph - DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); + DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); final int KMER_LENGTH = 3; final String preAltOption = "ATCGATCGATCGATCGATCG"; final String postAltOption = "CCCC"; @@ -211,7 +211,7 @@ public class KBestPathsUnitTest { if( offRefBeginning ) { expectedCigar.add(new CigarElement(preAltOption.length(), CigarOperator.I)); } - expectedCigar.add(new CigarElement(preRef.length() - ( offRefBeginning ? KMER_LENGTH - 1 : 0 ), CigarOperator.M)); + expectedCigar.add(new CigarElement(preRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); // first bubble if( refBubbleLength > altBubbleLength ) { expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index cdb5f8279..cce6abbee 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -43,8 +43,6 @@ public class Haplotype extends Allele { private Map eventMap = null; private Cigar cigar; private int alignmentStartHapwrtRef; - public int leftBreakPoint = 0; - public int rightBreakPoint = 0; private Event artificialEvent = null; /** diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index f6f4b721c..b38d6575e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -346,19 +346,6 @@ public class ActiveRegion implements HasGenomeLocation { } } - /** - * Clips all of the reads in this active region so that none extend beyond the active region extended loc - * - * This function may change the getReadSpanLoc, as it updates the read span based on the new clipped - * read coordinates. - */ - public void hardClipToActiveRegion() { - final List clippedReads = ReadClipper.hardClipToRegion( reads, extendedLoc.getStart(), extendedLoc.getStop() ); - ReadUtils.sortReadsByCoordinate(clippedReads); - clearReads(); - addAll(clippedReads); - } - /** * Is this region equal to other, excluding any reads in either region in the comparison * @param other the other active region we want to test diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java index 6ab429015..7f0f93704 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java @@ -189,14 +189,6 @@ public class ActiveRegionUnitTest extends BaseTest { Assert.assertEquals(region.getExtendedLoc(), loc); Assert.assertEquals(region.getReadSpanLoc(), loc); Assert.assertTrue(region.equalExceptReads(region2)); - - region.add(read); - region.hardClipToActiveRegion(); - Assert.assertEquals(region.size(), 1); - Assert.assertEquals(region.getExtendedLoc(), loc); - Assert.assertEquals(region.getReadSpanLoc(), loc); - Assert.assertTrue(region.getReads().get(0).getAlignmentStart() >= region.getExtendedLoc().getStart()); - Assert.assertTrue(region.getReads().get(0).getAlignmentEnd() <= region.getExtendedLoc().getStop()); } // -----------------------------------------------------------------------------------------------