Bugfix for HaplotypeCaller: GSA-822 for trimming softclipped reads

-- Previous version would not trim down soft clip bases that extend beyond the active region, causing the assembly graph to go haywire.  The new code explicitly reverts soft clips to M bases with the ever useful ReadClipper, and then trims.  Note this isn't a 100% fix for the issue, as it's possible that the newly unclipped bases might in reality extend beyond the active region, should their true alignment include a deletion in the reference.  Needs to be fixed.  JIRA added

-- See https://jira.broadinstitute.org/browse/GSA-822
-- #resolve #fix GSA-822
This commit is contained in:
Mark DePristo 2013-03-08 11:28:22 -05:00
parent ffea6dd95f
commit 53a904bcbd
3 changed files with 52 additions and 5 deletions

View File

@ -271,9 +271,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
@Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"})
protected static DeBruijnAssemblyGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(KMER_LENGTH);
// First pull kmers from the reference haplotype and add them to the graph
//logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString());
final byte[] refSequence = refHaplotype.getBases();
if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
@ -289,6 +290,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
// Next pull kmers out of every read and throw them on the graph
for( final GATKSAMRecord read : reads ) {
//if ( ! read.getReadName().equals("H06JUADXX130110:1:1213:15422:11590")) continue;
//logger.info("Adding read " + read + " with sequence " + read.getReadString());
final byte[] sequence = read.getReadBases();
final byte[] qualities = read.getBaseQualities();
final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
@ -325,8 +328,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
}
protected void printGraphs() {
final boolean onlyWriteOneGraph = false; // debugging flag -- if true we'll only write a graph for a single kmer size
final int writeFirstGraphWithSizeSmallerThan = 50;
graphWriter.println("digraph assemblyGraphs {");
for( final DeBruijnAssemblyGraph graph : graphs ) {
if ( onlyWriteOneGraph && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) {
logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize());
continue;
}
for( final DeBruijnEdge edge : graph.edgeSet() ) {
if( edge.getMultiplicity() > PRUNE_FACTOR ) {
graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];");
@ -337,8 +348,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
}
for( final DeBruijnVertex v : graph.vertexSet() ) {
graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]");
graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\",shape=box]");
}
if ( onlyWriteOneGraph )
break;
}
graphWriter.println("}");
}

View File

@ -47,9 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.jgrapht.graph.DefaultDirectedGraph;
import java.io.PrintStream;
@ -62,9 +60,32 @@ import java.util.Arrays;
*/
public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> {
private final int kmerSize;
public DeBruijnAssemblyGraph() {
/**
* Construct a DeBruijnAssemblyGraph with kmerSize
* @param kmerSize
*/
public DeBruijnAssemblyGraph(final int kmerSize) {
super(DeBruijnEdge.class);
if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize);
this.kmerSize = kmerSize;
}
/**
* Test construct that makes DeBruijnAssemblyGraph assuming a kmerSize of 11
*/
protected DeBruijnAssemblyGraph() {
this(11);
}
/**
* How big of a kmer did we use to create this graph?
* @return
*/
public int getKmerSize() {
return kmerSize;
}
/**

View File

@ -608,8 +608,20 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
// TODO -- reference haplotype start must be removed
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
// uncomment to remove hard clips from consideration at all
//clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() );
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
//logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd());
readsToUse.add(clippedRead);
}
}