Misc cleanup in the HaplotypeCaller. Cleaning up unused arguments after recent changes to HC-GenotypingEngine

This commit is contained in:
Ryan Poplin 2012-11-26 12:19:11 -05:00
parent 3784bb5258
commit c3b7dd1374
7 changed files with 21 additions and 76 deletions

View File

@ -41,13 +41,11 @@ import java.util.*;
public class GenotypingEngine {
private final boolean DEBUG;
private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
public GenotypingEngine( final boolean DEBUG ) {
this.DEBUG = DEBUG;
this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
noCall.add(Allele.NO_CALL);
}

View File

@ -131,14 +131,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 1;
@Advanced
@Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false)
protected boolean GENOTYPE_FULL_ACTIVE_REGION = false;
@Advanced
@Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false)
protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false;
@Advanced
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
protected int gcpHMM = 10;
@ -248,10 +240,11 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC);
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.CONTAMINATION_FRACTION = 0.0;
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
@ -273,15 +266,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
VCFConstants.GENOTYPE_QUALITY_KEY,
VCFConstants.DEPTH_KEY,
VCFConstants.GENOTYPE_PL_KEY);
// header lines for the experimental HaplotypeCaller-specific annotations
headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant"));
headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region"));
headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region"));
headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region"));
headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles"));
headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL"));
headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX"));
headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant"));
// FILTER fields are added unconditionally as it's not always 100% certain the circumstances
// where the filters are used. For example, in emitting all sites the lowQual field is used
@ -298,7 +282,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
genotypingEngine = new GenotypingEngine( DEBUG );
}
//---------------------------------------------------------------------------------------------------------------
@ -428,43 +412,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
if( !GENOTYPE_FULL_ACTIVE_REGION ) {
// add some custom annotations to the calls
// Calculate the number of variants on the haplotype
int maxNumVar = 0;
for( final Allele allele : callResult.getFirst().getAlleles() ) {
if( !allele.isReference() ) {
for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
final int numVar = haplotype.getEventMap().size();
if( numVar > maxNumVar ) { maxNumVar = numVar; }
}
}
}
// Calculate the event length
int maxLength = 0;
for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
final int length = a.length() - annotatedCall.getReference().length();
if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
}
myAttributes.put("NVH", maxNumVar);
myAttributes.put("NumHapEval", bestHaplotypes.size());
myAttributes.put("NumHapAssembly", haplotypes.size());
myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
myAttributes.put("EVENTLENGTH", maxLength);
myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
myAttributes.put("extType", annotatedCall.getType().toString() );
//if( likelihoodCalculationEngine.haplotypeScore != null ) {
// myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
//}
if( annotatedCall.hasAttribute("QD") ) {
myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
}
}
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
}
@ -520,6 +467,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
// protect against INTERVALS with abnormally high coverage
// BUGBUG: remove when positinal downsampler is hooked up to ART/HC
if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
activeRegion.add(clippedRead);
}

View File

@ -169,7 +169,6 @@ public class LikelihoodCalculationEngine {
}
// compute the diploid haplotype likelihoods
// todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) {

View File

@ -21,18 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "56aa4b84606b6b0b7dc78a383974d1b3");
HCTest(CEUTRIO_BAM, "", "2b39732ff8e0de5bc2ae949aaf7a6f21");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f");
HCTest(NA12878_BAM, "", "8b217638ff585effb9cc70e9a9aa544f");
}
// TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "f2d0309fdf50d5827e9c60ed0dd07e3f");
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"541aa8291f03ba33bd1ad3d731fd5657");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -43,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd7170cbde7df04d4fbe1da7903c31c6");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -54,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleSymbolic() {
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "7fbc6b9e27e374f2ffe4be952d88c7c6");
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "99456fc7207c1fe9f367a0d0afae87cd");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
@ -65,7 +66,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6c1631785b3f832aecab1a99f0454762");
}
@Test
@ -78,7 +79,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("96ab8253d242b851ccfc218759f79784"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("237601bbc39694c7413a332cbb656c8e"));
executeTest("HCTestStructuralIndels: ", spec);
}
@ -92,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("425f1a0fb00d7145edf1c55e54346fae"));
Arrays.asList("40bf739fb2b1743642498efe79ea6342"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
}

View File

@ -29,7 +29,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn
final List<Double> refQuals, final List<Double> altQuals) {
if (pileup != null && likelihoodMap == null) {
// no per-read likelihoods available:
// old UG snp-only path through the annotations
for ( final PileupElement p : pileup ) {
if ( isUsableBase(p) ) {
if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) {
@ -43,14 +43,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn
}
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet()) {
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
// BUGBUG: There needs to be a comparable isUsableBase check here
if (a.isNoCall())
continue; // read is non-informative
if (a.isReference())
refQuals.add((double)el.getKey().getMappingQuality());
else if (allAlleles.contains(a))
altQuals.add((double)el.getKey().getMappingQuality());
}
}

View File

@ -49,7 +49,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
ReadBackedPileup pileup = null;
if (stratifiedContexts != null) {
if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context != null )
pileup = context.getBasePileup();

View File

@ -39,7 +39,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
final List<Double> refQuals, final List<Double> altQuals) {
if (alleleLikelihoodMap == null) {
// use fast SNP-based version if we don't have per-read allele likelihoods
// use old UG SNP-based version if we don't have per-read allele likelihoods
for ( final PileupElement p : pileup ) {
if ( isUsableBase(p) ) {
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);