When converting a haplotype to a set of variants, we now check for cases that are overly complex.

In these cases, where the alignment contains multiple indels, we output a single complex
variant instead of the multiple partial indels.

We also re-enable dangling tail recovery by default.
This commit is contained in:
Eric Banks 2014-07-01 10:19:10 -04:00
parent 297c2b0651
commit bad7865078
8 changed files with 121 additions and 28 deletions

View File

@ -285,8 +285,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
protected boolean recoverDanglingHeads = false;
@Hidden
@Argument(fullName="recoverDanglingTails", shortName="recoverDanglingTails", doc="Should we enable dangling tail recovery in the read threading assembler?", required = false)
protected boolean recoverDanglingTails = false;
@Argument(fullName="doNotRecoverDanglingTails", shortName="doNotRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false)
protected boolean doNotRecoverDanglingTails = false;
@Advanced
@Argument(fullName="consensus", shortName="consensus", doc="In 1000G consensus mode. Inject all provided alleles to the assembly graph but don't forcibly genotype all of them.", required = false)
@ -622,7 +622,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
assemblyEngine.setDebug(SCAC.DEBUG);
assemblyEngine.setDebugGraphTransformations(debugGraphTransformations);
assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths);
assemblyEngine.setRecoverDanglingTails(recoverDanglingTails);
assemblyEngine.setRecoverDanglingTails(!doNotRecoverDanglingTails);
assemblyEngine.setRecoverDanglingHeads(recoverDanglingHeads);
assemblyEngine.setMinBaseQualityToUseInAssembly(MIN_BASE_QUALTY_SCORE);

View File

@ -324,7 +324,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
final List<VariantContext> activeAllelesToGenotype) {
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
// Using the cigar from each called haplotype to figure out what events need to be written out in a VCF file
final TreeSet<Integer> startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, configuration.DEBUG);
if ( in_GGA_mode ) startPosKeySet.clear();
@ -338,8 +338,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
if ( mergedAnything )
cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events
}
if ( in_GGA_mode ) {
else {
for( final VariantContext compVC : activeAllelesToGenotype ) {
startPosKeySet.add( compVC.getStart() );
}

View File

@ -94,7 +94,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"fb9f244c09ac6781e2a984e90e0cd340");
"40ef4d4188d326ca0d4e29f4de4b53ff");
}
private void HCTestComplexConsensusMode(String bam, String args, String md5) {
@ -106,7 +106,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleConsensusModeComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337",
"23dab7ca085f13b3b23954a94630d04d");
"e328436354bdf7604d987c6e2c9172eb");
}
}

View File

@ -67,9 +67,9 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "97cfa815579021d84b17ead791b66af2"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "55638f5886e4a66d9787e31ac65481c7"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9ea9d65ad7fc0c9ed7880746d5e6ce7f"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "afb1c0ff2556b01ee03596b19bfc60bd"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "dad32e660bf4bffcb0f996d9f3fd5073"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "d4a44ce515fdbfa2017b7bd08dcddadc"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "c95a777e3aac50b9f29302919a00ba4e"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "05860d40923e8b074576c787037c5768"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "f3abc3050b2708fc78c903fb70ac253e"});
@ -149,7 +149,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testNoCallGVCFMissingPLsBugFix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("8e6972866400eae202e72c959a8333a3"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("f70f418ba99bbd82f74dc85afb02bca4"));
spec.disableShadowBCF();
executeTest("testNoCallGVCFMissingPLsBugFix", spec);
}

View File

@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "7c4b9454f80f28386cd1ca6634b3db42");
HCTest(CEUTRIO_BAM, "", "80e190483299828700dd04c221111b5c");
}
@Test
@ -104,7 +104,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerGraphBasedMultiSample() {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "4d7a929ca7e15ec3e0173e4a4381a704");
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "b8307688564f0735f630170d8e655a58");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -115,7 +115,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"5d824bf8a95f21fbb6de99d180f72a21");
"fd3b7ac0cc8c3a727cf0e5783645ca28");
}
@Test
@ -178,7 +178,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("923fee5e23930e0b321258d6d3fa2ffe"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@ -199,7 +199,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestDanglingTailMergingForDeletions() throws IOException {
final String base = String.format("-T HaplotypeCaller -recoverDanglingTails --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800";
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800";
final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList(""));
final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0);
@ -227,7 +227,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("5537548b7d0c5615242695c48b4e8ecd"));
Arrays.asList("dc3c63e1e131806fe5455ad0286bebab"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("b25cbde7d1ad7efc43b894445f349a9f"));
Arrays.asList("69ece81900b78a3d2712898c99e15d75"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -276,7 +276,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestAggressivePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("91606273478c01613016e00f0f39ec8c"));
Arrays.asList("f369423c10d3aece3ef5e4655bd3af55"));
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}
@ -284,7 +284,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("f52490340b7ced98700f93e09bec05f3"));
Arrays.asList("dad6d8c007822883e230eb4250b8b788"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}
@ -304,7 +304,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16",
b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list",
HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("8328a0f44eb8aa3b891ace2207ed488e"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("47c53227ca7be6b343a9900e61594281"));
spec.disableShadowBCF();
executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec);
}
@ -313,7 +313,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testMissingKeyAlternativeHaplotypesBugFix() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ",
b37KGReferenceWithDecoy, privateTestDir + "lost-alt-key-hap.bam", privateTestDir + "lost-alt-key-hap.interval_list");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("d8c6197a7060c16c849e15460a4287f2"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e613c8a2f637cff3e049de34c354edde"));
spec.disableShadowBCF();
executeTest("testMissingKeyAlternativeHaplotypesBugFix", spec);
}

View File

@ -60,7 +60,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List<Object[]> tests = new ArrayList<>();
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, "b0404b464097d667843da057bdc53c40"});
tests.add(new Object[]{nct, "31a7bb9fb5bc512120b88c5ecdd81139"});
}
return tests.toArray(new Object[][]{});

View File

@ -51,6 +51,8 @@ import java.util.*;
public class EventMap extends TreeMap<Integer, VariantContext> {
private final static Logger logger = Logger.getLogger(EventMap.class);
protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3;
private static final int MAX_EVENTS_PER_HAPLOTYPE = 3;
private static final int MAX_INDELS_PER_HAPLOTYPE = 2;
public final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
private final Haplotype haplotype;
@ -90,6 +92,8 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
return;
} // Protection against SW failures
final List<VariantContext> proposedEvents = new ArrayList<>();
int alignmentPos = 0;
for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) {
@ -117,7 +121,7 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
}
}
if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
}
}
alignmentPos += elementLength;
@ -138,7 +142,7 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) {
deletionAlleles.add( Allele.create(deletionBases, true) );
deletionAlleles.add( Allele.create(refByte, false) );
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
}
}
refPos += elementLength;
@ -156,7 +160,7 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
final List<Allele> snpAlleles = new ArrayList<Allele>();
snpAlleles.add( Allele.create( refByte, true ) );
snpAlleles.add( Allele.create( altByte, false ) );
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make());
proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make());
}
}
refPos++;
@ -171,6 +175,97 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
throw new ReviewedGATKException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() );
}
}
// handle the case where the event set for the haplotype is very complex
// TODO -- theoretically this should be part of the MergeVariantsAcrossHaplotypes class, but it's just so much more complicated to do so
if ( variationIsTooComplex(proposedEvents) ) {
addComplexVC(cigar, alignment, haplotype.getAlignmentStartHapwrtRef());
} else {
for ( final VariantContext proposedEvent : proposedEvents )
addVC(proposedEvent, true);
}
}
/**
 * Tests whether a proposed event set derived from a single haplotype is too complex
 * to emit as separate records (i.e. it contains more indels than we allow).
 *
 * @param events the candidate variant contexts built from the haplotype alignment
 * @return true when the number of indel events exceeds MAX_INDELS_PER_HAPLOTYPE
 */
private boolean variationIsTooComplex(final List<VariantContext> events) {
    int numIndels = 0;
    for ( final VariantContext candidate : events ) {
        // bail out as soon as the indel cap is exceeded; SNPs are not counted
        if ( candidate.isIndel() && ++numIndels > MAX_INDELS_PER_HAPLOTYPE )
            return true;
    }
    return false;
}
/**
 * Add a single complex substitution event covering the whole alignment, used when the
 * haplotype's event set is too complex to represent as individual variants.
 *
 * Leading and trailing bases that are identical between the haplotype and the reference
 * are trimmed before the ref/alt alleles are constructed.
 *
 * @param cigar     the haplotype-vs-reference cigar; only its reference length is used here
 * @param haplotype the bases of the alternate haplotype
 * @param refPos    the 0-based offset of this alignment within the {@code ref} byte array
 */
private void addComplexVC(final Cigar cigar, final byte[] haplotype, final int refPos) {
    // ignore leading and trailing bases that match between this haplotype and the reference
    final int matchingPrefix = numPrefixMatch(ref, haplotype, refPos, 0);
    final int matchingSuffix = numSuffixMatch(ref, haplotype, refPos + cigar.getReferenceLength() - 1, haplotype.length - 1);
    // edge case: too many matches
    // NOTE(review): if prefix+suffix consume the whole haplotype or reference (e.g. the
    // haplotype equals the reference, or a pure indel collapses entirely), no event is emitted
    final int totalMatch = matchingPrefix + matchingSuffix;
    if ( totalMatch >= haplotype.length || totalMatch >= ref.length )
        return;
    // trimmed reference span and the corresponding trimmed haplotype bases become ref/alt alleles
    final byte[] refBases = Arrays.copyOfRange(ref, refPos + matchingPrefix, refPos + cigar.getReferenceLength() - matchingSuffix);
    final byte[] altBases = Arrays.copyOfRange(haplotype, matchingPrefix, haplotype.length - matchingSuffix);
    final List<Allele> alleles = new ArrayList<>();
    alleles.add( Allele.create( refBases, true ) );
    alleles.add( Allele.create( altBases, false ) );
    // genomic start = contig start of the reference window + alignment offset + trimmed prefix
    final int start = refLoc.getStart() + refPos + matchingPrefix;
    addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), start, start + refBases.length - 1, alleles).make(), true);
}
/**
 * Counts how many consecutive bases agree while walking forward through two sequences
 * from the given starting indices.
 *
 * @param seq1 the first sequence
 * @param seq2 the second sequence
 * @param startPos1 the index on seq1 to start from
 * @param startPos2 the index on seq2 to start from
 * @return non-negative count of matching leading bases
 */
private int numPrefixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) {
    int i = startPos1;
    int j = startPos2;
    int matchCount = 0;
    // advance in lockstep until either sequence ends or a mismatch is found
    while ( i < seq1.length && j < seq2.length && seq1[i] == seq2[j] ) {
        matchCount++;
        i++;
        j++;
    }
    return matchCount;
}
/**
 * Counts how many consecutive bases agree while walking backward through two sequences
 * from the given starting indices.
 *
 * @param seq1 the first sequence
 * @param seq2 the second sequence
 * @param startPos1 the index on seq1 to start from
 * @param startPos2 the index on seq2 to start from
 * @return non-negative count of matching trailing bases
 */
private int numSuffixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) {
    int i = startPos1;
    int j = startPos2;
    int matchCount = 0;
    // retreat in lockstep until either index falls off the front or a mismatch is found
    while ( i >= 0 && j >= 0 && seq1[i] == seq2[j] ) {
        matchCount++;
        i--;
        j--;
    }
    return matchCount;
}
/**

View File

@ -107,7 +107,6 @@ public class CigarUtils {
return TextCigarCodec.getSingleton().decode(cigarString);
}
/**
* A valid cigar object obeys the following rules:
* - No Hard/Soft clips in the middle of the read