From 53a904bcbd8ec63420a76e98e7dda6432d2907f8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 8 Mar 2013 11:28:22 -0500 Subject: [PATCH] Bugfix for HaplotypeCaller: GSA-822 for trimming softclipped reads -- Previous version would not trim down soft clip bases that extend beyond the active region, causing the assembly graph to go haywire. The new code explicitly reverts soft clips to M bases with the ever useful ReadClipper, and then trims. Note this isn't a 100% fix for the issue, as it's possible that the newly unclipped bases might in reality extend beyond the active region, should their true alignment include a deletion in the reference. Needs to be fixed. JIRA added -- See https://jira.broadinstitute.org/browse/GSA-822 -- #resolve #fix GSA-822 --- .../haplotypecaller/DeBruijnAssembler.java | 18 +++++++++++-- .../DeBruijnAssemblyGraph.java | 27 ++++++++++++++++--- .../haplotypecaller/HaplotypeCaller.java | 12 +++++++++ 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index bf08d1526..33198ce8c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -271,9 +271,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) protected static DeBruijnAssemblyGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); + final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(KMER_LENGTH); // First pull kmers from the reference haplotype and add them to the graph + //logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString()); final byte[] refSequence = refHaplotype.getBases(); if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; @@ -289,6 +290,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // Next pull kmers out of every read and throw them on the graph for( final GATKSAMRecord read : reads ) { + //if ( ! read.getReadName().equals("H06JUADXX130110:1:1213:15422:11590")) continue; + //logger.info("Adding read " + read + " with sequence " + read.getReadString()); final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced @@ -325,8 +328,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } protected void printGraphs() { + final boolean onlyWriteOneGraph = false; // debugging flag -- if true we'll only write a graph for a single kmer size + final int writeFirstGraphWithSizeSmallerThan = 50; + graphWriter.println("digraph assemblyGraphs {"); for( final DeBruijnAssemblyGraph graph : graphs ) { + if ( onlyWriteOneGraph && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { + logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); + continue; + } + for( final DeBruijnEdge edge : graph.edgeSet() ) { if( edge.getMultiplicity() > PRUNE_FACTOR ) { graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];"); @@ -337,8 +348,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } for( final DeBruijnVertex v : graph.vertexSet() ) { - graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); + graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\",shape=box]"); } + + if ( onlyWriteOneGraph ) + break; } graphWriter.println("}"); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java index 6a95049d1..d28f81b55 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java @@ -47,9 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.jgrapht.graph.DefaultDirectedGraph; import java.io.PrintStream; @@ -62,9 +60,32 @@ import java.util.Arrays; */ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph { + private final int kmerSize; - public DeBruijnAssemblyGraph() { + /** + * Construct a DeBruijnAssemblyGraph with kmerSize + * @param kmerSize + */ + public DeBruijnAssemblyGraph(final int kmerSize) { super(DeBruijnEdge.class); + + if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize); + this.kmerSize = kmerSize; + } + + /** + * Test construct that makes DeBruijnAssemblyGraph assuming a kmerSize of 11 + */ + protected DeBruijnAssemblyGraph() { + this(11); + } + + /** + * How big of a kmer did we use to create this graph? + * @return + */ + public int getKmerSize() { + return kmerSize; } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index cff631802..affad6450 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -608,8 +608,20 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + + // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches + // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't + // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion + // TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the + // TODO -- reference haplotype start must be removed + clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); + + // uncomment to remove hard clips from consideration at all + //clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); + clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { + //logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); readsToUse.add(clippedRead); } }