Bugfix for HaplotypeCaller: GSA-822 for trimming softclipped reads
-- Previous version would not trim down soft clip bases that extend beyond the active region, causing the assembly graph to go haywire. The new code explicitly reverts soft clips to M bases with the ever useful ReadClipper, and then trims. Note this isn't a 100% fix for the issue, as it's possible that the newly unclipped bases might in reality extend beyond the active region, should their true alignment include a deletion in the reference. Needs to be fixed. JIRA added -- See https://jira.broadinstitute.org/browse/GSA-822 -- #resolve #fix GSA-822
This commit is contained in:
parent
ffea6dd95f
commit
53a904bcbd
|
|
@ -271,9 +271,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
@Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"})
|
||||
protected static DeBruijnAssemblyGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
|
||||
|
||||
final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
|
||||
final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(KMER_LENGTH);
|
||||
|
||||
// First pull kmers from the reference haplotype and add them to the graph
|
||||
//logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString());
|
||||
final byte[] refSequence = refHaplotype.getBases();
|
||||
if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
|
||||
final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
|
||||
|
|
@ -289,6 +290,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
// Next pull kmers out of every read and throw them on the graph
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
//if ( ! read.getReadName().equals("H06JUADXX130110:1:1213:15422:11590")) continue;
|
||||
//logger.info("Adding read " + read + " with sequence " + read.getReadString());
|
||||
final byte[] sequence = read.getReadBases();
|
||||
final byte[] qualities = read.getBaseQualities();
|
||||
final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
|
||||
|
|
@ -325,8 +328,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
protected void printGraphs() {
|
||||
final boolean onlyWriteOneGraph = false; // debugging flag -- if true we'll only write a graph for a single kmer size
|
||||
final int writeFirstGraphWithSizeSmallerThan = 50;
|
||||
|
||||
graphWriter.println("digraph assemblyGraphs {");
|
||||
for( final DeBruijnAssemblyGraph graph : graphs ) {
|
||||
if ( onlyWriteOneGraph && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) {
|
||||
logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize());
|
||||
continue;
|
||||
}
|
||||
|
||||
for( final DeBruijnEdge edge : graph.edgeSet() ) {
|
||||
if( edge.getMultiplicity() > PRUNE_FACTOR ) {
|
||||
graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];");
|
||||
|
|
@ -337,8 +348,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
|
||||
}
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]");
|
||||
graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\",shape=box]");
|
||||
}
|
||||
|
||||
if ( onlyWriteOneGraph )
|
||||
break;
|
||||
}
|
||||
graphWriter.println("}");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,9 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
|
@ -62,9 +60,32 @@ import java.util.Arrays;
|
|||
*/
|
||||
|
||||
public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> {
|
||||
private final int kmerSize;
|
||||
|
||||
public DeBruijnAssemblyGraph() {
|
||||
/**
|
||||
* Construct a DeBruijnAssemblyGraph with kmerSize
|
||||
* @param kmerSize
|
||||
*/
|
||||
public DeBruijnAssemblyGraph(final int kmerSize) {
|
||||
super(DeBruijnEdge.class);
|
||||
|
||||
if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize);
|
||||
this.kmerSize = kmerSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test construct that makes DeBruijnAssemblyGraph assuming a kmerSize of 11
|
||||
*/
|
||||
protected DeBruijnAssemblyGraph() {
|
||||
this(11);
|
||||
}
|
||||
|
||||
/**
|
||||
* How big of a kmer did we use to create this graph?
|
||||
* @return
|
||||
*/
|
||||
public int getKmerSize() {
|
||||
return kmerSize;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -608,8 +608,20 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
|
||||
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
|
||||
GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
|
||||
|
||||
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
|
||||
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
|
||||
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
|
||||
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
|
||||
// TODO -- reference haplotype start must be removed
|
||||
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
|
||||
|
||||
// uncomment to remove hard clips from consideration at all
|
||||
//clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
|
||||
|
||||
clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() );
|
||||
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
|
||||
//logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd());
|
||||
readsToUse.add(clippedRead);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue