diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 6f2991771..25edd49ef 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -285,8 +285,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In protected boolean recoverDanglingHeads = false; @Hidden - @Argument(fullName="recoverDanglingTails", shortName="recoverDanglingTails", doc="Should we enable dangling tail recovery in the read threading assembler?", required = false) - protected boolean recoverDanglingTails = false; + @Argument(fullName="doNotRecoverDanglingTails", shortName="doNotRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false) + protected boolean doNotRecoverDanglingTails = false; @Advanced @Argument(fullName="consensus", shortName="consensus", doc="In 1000G consensus mode. Inject all provided alleles to the assembly graph but don't forcibly genotype all of them.", required = false) @@ -622,7 +622,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In assemblyEngine.setDebug(SCAC.DEBUG); assemblyEngine.setDebugGraphTransformations(debugGraphTransformations); assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths); - assemblyEngine.setRecoverDanglingTails(recoverDanglingTails); + assemblyEngine.setRecoverDanglingTails(!doNotRecoverDanglingTails); assemblyEngine.setRecoverDanglingHeads(recoverDanglingHeads); assemblyEngine.setMinBaseQualityToUseInAssembly(MIN_BASE_QUALTY_SCORE); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 7fe3de973..5818cc1e2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -324,7 +324,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine activeAllelesToGenotype) { final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + // Using the cigar from each called haplotype to figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, configuration.DEBUG); if ( in_GGA_mode ) startPosKeySet.clear(); @@ -338,8 +338,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine tests = new ArrayList<>(); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "b0404b464097d667843da057bdc53c40"}); + tests.add(new Object[]{nct, "31a7bb9fb5bc512120b88c5ecdd81139"}); } return tests.toArray(new Object[][]{}); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java index 60dcd4943..8cd2a692e 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java @@ -51,6 +51,8 @@ import java.util.*; public class EventMap extends TreeMap { private final static Logger logger = Logger.getLogger(EventMap.class); protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3; + private static final int MAX_EVENTS_PER_HAPLOTYPE = 3; + private static final int MAX_INDELS_PER_HAPLOTYPE = 2; public final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); private final Haplotype haplotype; @@ -90,6 +92,8 @@ public class EventMap extends TreeMap { return; } // Protection against SW failures + final List proposedEvents = new ArrayList<>(); + int alignmentPos = 0; for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) { @@ -117,7 +121,7 @@ public class EventMap extends TreeMap { } } if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele - addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); + proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); } } alignmentPos += elementLength; @@ -138,7 +142,7 @@ public class EventMap extends TreeMap { if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { deletionAlleles.add( Allele.create(deletionBases, true) ); deletionAlleles.add( Allele.create(refByte, false) ); - addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); + proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); } } refPos += elementLength; @@ -156,7 +160,7 @@ public class EventMap extends TreeMap { final List snpAlleles = new ArrayList(); snpAlleles.add( Allele.create( refByte, true ) ); snpAlleles.add( Allele.create( altByte, false ) ); - addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); + proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); } } refPos++; @@ -171,6 +175,97 @@ public class EventMap extends TreeMap { throw new ReviewedGATKException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() ); } } + + // handle the case where the event set for the haplotype is very complex + // TODO -- theoretically this should be part of the MergeVariantsAcrossHaplotypes class, but it's just so much more complicated to do so + if ( variationIsTooComplex(proposedEvents) ) { + addComplexVC(cigar, alignment, haplotype.getAlignmentStartHapwrtRef()); + } else { + for ( final VariantContext proposedEvent : proposedEvents ) + addVC(proposedEvent, true); + } + } + + /** + * Determine whether the provided set of variants is too complex for breaking up into individual parts + * + * @param events the individual events + * @return true if the cigar is too complex, false otherwise + */ + private boolean variationIsTooComplex(final List events) { + int indelCount = 0; + for ( final VariantContext event : events ) { + if ( event.isIndel() ) + indelCount++; + } + + // don't allow too many indels + return indelCount > MAX_INDELS_PER_HAPLOTYPE; + } + + /** + * Add a complex variant context to the events given the haplotype and its cigar + * + * @param cigar the cigar to convert + * @param haplotype the bases of the alternate haplotype + * @param refPos the position on the reference for this alignment + */ + private void addComplexVC(final Cigar cigar, final byte[] haplotype, final int refPos) { + // ignore leading and trailing bases that match between this haplotype and the reference + final int matchingPrefix = numPrefixMatch(ref, haplotype, refPos, 0); + final int matchingSuffix = numSuffixMatch(ref, haplotype, refPos + cigar.getReferenceLength() - 1, haplotype.length - 1); + + // edge case: too many matches + final int totalMatch = matchingPrefix + matchingSuffix; + if ( totalMatch >= haplotype.length || totalMatch >= ref.length ) + return; + + final byte[] refBases = Arrays.copyOfRange(ref, refPos + matchingPrefix, refPos + cigar.getReferenceLength() - matchingSuffix); + final byte[] altBases = Arrays.copyOfRange(haplotype, matchingPrefix, haplotype.length - matchingSuffix); + + final List alleles = new ArrayList<>(); + alleles.add( Allele.create( refBases, true ) ); + alleles.add( Allele.create( altBases, false ) ); + final int start = refLoc.getStart() + refPos + matchingPrefix; + addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), start, start + refBases.length - 1, alleles).make(), true); + } + + /** + * calculates the extent of the prefix match between 2 sequences + * + * @param seq1 the first sequence + * @param seq2 the second sequence + * @param startPos1 the index on seq1 to start from + * @param startPos2 the index on seq2 to start from + * @return non-negative int representing the matching prefix + */ + private int numPrefixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) { + int matchingBases = 0; + for ( int pos1 = startPos1, pos2 = startPos2; pos1 < seq1.length && pos2 < seq2.length; pos1++, pos2++ ) { + if ( seq1[pos1] != seq2[pos2] ) + break; + matchingBases++; + } + return matchingBases; + } + + /** + * calculates the extent of the suffix match between 2 sequences + * + * @param seq1 the first sequence + * @param seq2 the second sequence + * @param startPos1 the index on seq1 to start from + * @param startPos2 the index on seq2 to start from + * @return non-negative int representing the matching suffix + */ + private int numSuffixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) { + int matchingBases = 0; + for ( int pos1 = startPos1, pos2 = startPos2; pos1 >=0 && pos2 >= 0; pos1--, pos2-- ) { + if ( seq1[pos1] != seq2[pos2] ) + break; + matchingBases++; + } + return matchingBases; } /** diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/CigarUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/CigarUtils.java index ea84f7bbf..cd492fec9 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/CigarUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/CigarUtils.java @@ -107,7 +107,6 @@ public class CigarUtils { return TextCigarCodec.getSingleton().decode(cigarString); } - /** * A valid cigar object obeys the following rules: * - No Hard/Soft clips in the middle of the read