From 601c26a5922c47a51ee5e0b9659f9856111f1ab0 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 25 Aug 2016 11:54:02 -0400 Subject: [PATCH] More small refactorings of Mutect2 code --- .../gatk/tools/walkers/cancer/m2/MuTect2.java | 302 ++++++------------ .../cancer/m2/SomaticGenotypingEngine.java | 52 +-- .../cancer/m2/MuTect2IntegrationTest.java | 7 - 3 files changed, 116 insertions(+), 245 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java index 4f509f796..4722e51f3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java @@ -71,6 +71,7 @@ import org.broadinstitute.gatk.tools.walkers.haplotypecaller.*; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; @@ -189,20 +190,18 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i protected SampleList samplesList; protected boolean printTCGAsampleHeader = false; - // fasta reference reader to supplement the edges of the reference sequence protected CachingIndexedFastaSequenceFile referenceReader; - - // the assembly engine protected LocalAssemblyEngine assemblyEngine = null; - - // the likelihoods engine protected ReadLikelihoodCalculationEngine likelihoodCalculationEngine = null; - - // the genotyping engine - protected HaplotypeCallerGenotypingEngine genotypingEngine 
= null; + protected SomaticGenotypingEngine genotypingEngine = null; + private HaplotypeBAMWriter haplotypeBAMWriter; private byte MIN_TAIL_QUALITY; private double log10GlobalReadMismappingRate; + private static final int REFERENCE_PADDING = 500; + private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument + private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument @ArgumentCollection protected M2ArgumentCollection MTAC = new M2ArgumentCollection(); @@ -221,10 +220,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i @Argument(fullName = "MQ_filtering_level", shortName = "MQthreshold", required = false, doc="Set an alternate MQ threshold for debugging") final public int MQthreshold = 20; - - public RodBinding getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; } - - private HaplotypeBAMWriter haplotypeBAMWriter; /** * If a call overlaps with a record from the provided comp track, the INFO field will be annotated * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). @@ -291,12 +286,14 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) protected boolean justDetermineActiveRegions = false; - // reference base padding size - private static final int REFERENCE_PADDING = 500; + /** + * As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior. 
+ */ + @Advanced + @Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false) + protected boolean doNotRunPhysicalPhasing = false; - private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; - private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument - private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument + public RodBinding getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; } @Override public void initialize() { @@ -388,7 +385,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC, tumorSampleName, normalSampleName, DEBUG_READ_NAME); - genotypingEngine.setCrossHaplotypeEventMerger(variantMerger); genotypingEngine.setAnnotationEngine(annotationEngine); @@ -438,7 +434,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i headerInfo.addAll(getM2HeaderLines()); headerInfo.addAll(getSampleHeaderLines()); - final List outputSampleNames = getOutputSampleNames(); + // if printTCGAsampleHeader, we already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize() + final List outputSampleNames = printTCGAsampleHeader ? 
Arrays.asList("TUMOR", "NORMAL") : SampleListUtils.asList(samplesList); vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames)); @@ -507,19 +504,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return sampleLines; } - private List getOutputSampleNames(){ - if (printTCGAsampleHeader) { - //Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize() - final List sampleNamePlaceholders = new ArrayList<>(2); - sampleNamePlaceholders.add("TUMOR"); - sampleNamePlaceholders.add("NORMAL"); - return sampleNamePlaceholders; - } - else { - return SampleListUtils.asList(samplesList); - } - } - @Override public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { if( context == null || context.getBasePileup().isEmpty() ) @@ -564,7 +548,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i prob = 1; logger.debug("At " + ref.getLocus().toString() + " tlod: " + tumorLod + " and no-normal calling"); } - } return new ActivityProfileState( ref.getLocus(), prob, ActivityProfileState.Type.NONE, null); @@ -573,16 +556,13 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i private final static List NO_CALLS = Collections.emptyList(); @Override public List map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) { - if ( justDetermineActiveRegions ) - // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work + if ( justDetermineActiveRegions ) { return NO_CALLS; - - if( !originalActiveRegion.isActive() ) - // Not active so nothing to do! + } else if( !originalActiveRegion.isActive() ) { return referenceModelForNoVariation(originalActiveRegion, true); - - // No reads here so nothing to do! 
- if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } + } else if( originalActiveRegion.size() == 0 ) { + return referenceModelForNoVariation(originalActiveRegion, true); + } logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region"); // create the assembly using just high quality reads (Q20 or higher). We want to use lower @@ -593,30 +573,25 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i assemblyActiveRegion.add(rec); } } - logReadInfo(DEBUG_READ_NAME, assemblyActiveRegion.getReads(), "Present in assembly active region"); // run the local assembler, getting back a collection of information on how we should proceed - final List givenAlleles = new ArrayList<>(); - final AssemblyResultSet untrimmedAssemblyResult = assembleReads(assemblyActiveRegion, givenAlleles); - - + final AssemblyResultSet untrimmedAssemblyResult = assembleReads(assemblyActiveRegion, Collections.EMPTY_LIST); final TreeSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(); - // TODO - line bellow might be unecessary : it might be that assemblyResult will always have those alleles anyway - // TODO - so check and remove if that is the case: - allVariationEvents.addAll(givenAlleles); - final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents); - - - // Stop the trimming madness!!! - if (!trimmingResult.isVariationPresent()) - return referenceModelForNoVariation(originalActiveRegion,false); + if (!trimmingResult.isVariationPresent()) { + return referenceModelForNoVariation(originalActiveRegion, false); + } logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result"); final AssemblyResultSet assemblyResult = trimmingResult.needsTrimming() ? 
untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult; + // it is conceivable that the region is active yet has no events upon assembling only the well-mapped reads + if( ! assemblyResult.isVariationPresent() ) { + return referenceModelForNoVariation(originalActiveRegion, false); + } + final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping"); @@ -624,7 +599,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads()); } - // filter out reads from genotyping which fail mapping quality based criteria //TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method //TODO - on the originalActiveRegion? @@ -639,17 +613,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final Map> perSampleFilteredReadList = splitReadsBySample(filteredReads); logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads"); - // abort early if something is out of the acceptable range - // TODO is this ever true at this point??? perhaps GGA. Need to check. - if( ! assemblyResult.isVariationPresent() ) - return referenceModelForNoVariation(originalActiveRegion, false); - - // TODO is this ever true at this point??? perhaps GGA. Need to check. - if( regionForGenotyping.size() == 0 ) { - // no reads remain after filtering so nothing else to do! 
- return referenceModelForNoVariation(originalActiveRegion, false); - } - // evaluate each sample's reads against all haplotypes final List haplotypes = assemblyResult.getHaplotypeList(); @@ -658,6 +621,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample"); } + //TODO: this should be written as + //TODO final Map ARreads_origNormalMQ = regionForGenotyping.getReads().stream() + //TODO .collect(Collectors.toMap(GATKSAMRecord::getReadName, GATKSAMRecord::getMappingQuality)); + //TODO: but for some reason sometimes streamifying Mutect2 code causes a MalformedWalkerArgumentsException + //TODO: this will probably not occur after the port to GATK4 final HashMap ARreads_origNormalMQ = new HashMap<>(); for (final GATKSAMRecord read : regionForGenotyping.getReads()) { ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality()); @@ -671,14 +639,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } logger.debug("Computing read likelihoods with " + regionForGenotyping.getReads().size() + " reads against " + haplotypes.size() + " haplotypes across region " + assemblyResult.getRegionForGenotyping().toString()); + final ReadLikelihoods readLikelihoods = likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads); - - // Calculate the likelihoods: CPU intensive part. - final ReadLikelihoods readLikelihoods = - likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads); - - // Realign reads to their best haplotype. - // KCIBUL: this is new stuff -- review it! 
final Map readRealignments = realignReadsToTheirBestHaplotype(readLikelihoods, assemblyResult.getReferenceHaplotype(), assemblyResult.getPaddedReferenceLoc()); if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) { @@ -693,22 +655,14 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods"); } - // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there - // was a bad interaction between that selection and the marginalization that happens over each event when computing - // GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the - // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included - // in the genotyping, but we lose information if we select down to a few haplotypes. [EB] - - final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = ((SomaticGenotypingEngine)genotypingEngine).callMutations( - haplotypes, + final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.callMutations( readLikelihoods, ARreads_origNormalMQ, perSampleFilteredReadList, assemblyResult.getFullReferenceWithPadding(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping.getLocation(), - metaDataTracker, - givenAlleles); + metaDataTracker); if ( MTAC.bamWriter != null ) { final Set calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes()); @@ -727,66 +681,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } - - - final List annotatedCalls = new ArrayList<>(); - final int eventCount = calledHaplotypes.getCalls().size(); - Integer minEventDistance = null; - Integer maxEventDistance = null; - Integer lastPosition = null; - for (final 
VariantContext vc : calledHaplotypes.getCalls()) { - if (lastPosition == null) { - lastPosition = vc.getStart(); - } else { - final int dist = Math.abs(vc.getStart() - lastPosition); - if (maxEventDistance == null || dist > maxEventDistance) { - maxEventDistance = dist; - } - if (minEventDistance == null || dist < minEventDistance) { - minEventDistance = dist; - } - } - } - final Map eventDistanceAttributes = new HashMap<>(); - eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount); - eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance); - eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance); - - - // can we do this with the Annotation classes instead? - for (final VariantContext originalVC : calledHaplotypes.getCalls()) { - final VariantContextBuilder vcb = new VariantContextBuilder(originalVC); - - final Map attributes = new HashMap<>(originalVC.getAttributes()); - attributes.putAll(eventDistanceAttributes); - vcb.attributes(attributes); - - final Set filters = new HashSet<>(originalVC.getFilters()); - - final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1); - if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) { - filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME); - } - - // if we are in artifact detection mode, apply the thresholds for the LOD scores - if (!MTAC.ARTIFACT_DETECTION_MODE) { - filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes)); - } - - vcb.filters(filters.isEmpty() ? 
VariantContext.PASSES_FILTERS : filters); - - if (printTCGAsampleHeader) { - final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make(); - final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make(); - vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype)); - } - - annotatedCalls.add(vcb.make()); - } - - // TODO: find a better place for this debug message - // logger.info("We had " + TumorPowerCalculator.numCacheHits + " hits in starnd artifact power calculation"); - return annotatedCalls; + return annotateVCs(calledHaplotypes, metaDataTracker); } private Set calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map eventDistanceAttributes) { @@ -828,15 +723,10 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i if ( (normalAltCounts > MTAC.MAX_ALT_ALLELES_IN_NORMAL_COUNT || normalF > MTAC.MAX_ALT_ALLELE_IN_NORMAL_FRACTION ) && normalAltQualityScoreSum > MTAC.MAX_ALT_ALLELES_IN_NORMAL_QSCORE_SUM) { filters.add(GATKVCFConstants.ALT_ALLELE_IN_NORMAL_FILTER_NAME); - } else { - - // NOTE: does normal alt counts presume the normal had all these events in CIS? 
- if ( eventCount > 1 && normalAltCounts >= 1) { - filters.add(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME); - } else if (eventCount >= 3) { - filters.add(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME); - } - + } else if ( eventCount > 1 && normalAltCounts >= 1) { + filters.add(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME); + } else if (eventCount >= 3) { + filters.add(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME); } // STR contractions, that is the deletion of one repeat unit of a short repeat (>1bp repeat unit) @@ -861,7 +751,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i filters.add(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME); } - // clustered read position filter if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){ final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY); @@ -878,7 +767,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return filters; } - private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; /** * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt @@ -916,6 +804,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return (normalSampleName != null); } + //TODO: streamify in GATK4 protected int getCountOfNonRefEvents(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) { int i=0; for( final PileupElement p : pileup ) { @@ -960,37 +849,21 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i protected Set filterNonPassingReads( final ActiveRegion activeRegion) { final Set readsToRemove = new LinkedHashSet<>(); for( final GATKSAMRecord rec : activeRegion.getReads() ) { - + //TODO: Takuto points out that this is questionable. Let's think hard about it. // KCIBUL: only perform read quality filtering on tumor reads... 
if (isReadFromNormal(rec)) { - if( rec.getReadLength() < MIN_READ_LENGTH ) { readsToRemove.add(rec); } - - } else { - - - if( rec.getReadLength() < MIN_READ_LENGTH || - rec.getMappingQuality() < MQthreshold || - BadMateFilter.hasBadMate(rec) || - + } else if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < MQthreshold || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { - readsToRemove.add(rec); - } + readsToRemove.add(rec); } } activeRegion.removeAll(readsToRemove); return readsToRemove; } - private static GATKSAMRecord findReadByName(final Collection reads, final String name) { - for(final GATKSAMRecord read : reads) { - if (name.equals(read.getReadName())) return read; - } - return null; - } - /** * Instantiates the appropriate likelihood calculation engine. * @@ -1025,13 +898,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // enable non primary and extended reads in the active region @Override public EnumSet desiredReadStates() { -// if ( includeUnmappedReads ) -// throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional"); -// else - return EnumSet.of( - ActiveRegionReadState.PRIMARY, - ActiveRegionReadState.NONPRIMARY, - ActiveRegionReadState.EXTENDED); + return EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED); } //--------------------------------------------------------------------------------------------------------------- @@ -1063,7 +930,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i logger.info("Ran local assembly on " + result + " active regions"); } - // The following are not used but are required by the AnnotatorCompatible interface public RodBinding getSnpEffRodBinding() { return null; } public List> getResourceRodBindings() { return Collections.emptyList(); } @@ -1087,15 +953,12 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final Haplotype 
referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc); // Create ReadErrorCorrector object if requested - will be used within assembly engine. - ReadErrorCorrector readErrorCorrector = null; - if (errorCorrectReads) - readErrorCorrector = new ReadErrorCorrector(RTAC.kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, RTAC.minObservationsForKmerToBeSolid, MTAC.DEBUG, fullReferenceWithPadding); - + final ReadErrorCorrector readErrorCorrector = errorCorrectReads ? new ReadErrorCorrector(RTAC.kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, + RTAC.minObservationsForKmerToBeSolid, MTAC.DEBUG, fullReferenceWithPadding) : null; try { final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, giveAlleles,readErrorCorrector ); assemblyResultSet.debugDump(logger); return assemblyResultSet; - } catch ( final Exception e ) { // Capture any exception that might be thrown, and write out the assembly failure BAM if requested if ( captureAssemblyFailureBAM ) { @@ -1151,10 +1014,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.DOWNSAMPLED, activeRegion.getReads(), downsampledReads); } - // handle overlapping read pairs from the same fragment - // KC: commented out as we handle overlapping read pairs in a different way... 
- //cleanOverlappingReadPairs(downsampledReads, normalSampleNames); - activeRegion.clearReads(); activeRegion.addAll(downsampledReads); activeRegion.setFinalized(true); @@ -1189,11 +1048,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final FragmentCollection fragmentCollection = FragmentUtils.create(perSampleReadList); for ( final List overlappingPair : fragmentCollection.getOverlappingPairs() ) - - // in MuTect -- right now we compare the FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair); - - } } @@ -1236,20 +1091,61 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i private boolean isReadFromNormal(final GATKSAMRecord rec) { return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample()); - } public String getTumorSampleName(){ return tumorSampleName; } - // KCIBUL: new stuff -- read up on this!! - /** - * As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior. 
- */ - @Advanced - @Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false) - protected boolean doNotRunPhysicalPhasing = false; + final List annotateVCs(final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes, final RefMetaDataTracker metaDataTracker) { + final int eventCount = calledHaplotypes.getCalls().size(); + final Map eventDistanceAttributes = new HashMap<>(); //TODO: should be Map -- see TODO below + eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount); + if (eventCount > 1) { + final int lastPosition = calledHaplotypes.getCalls().get(0).getStart(); + final int[] eventDistances = new int[calledHaplotypes.getCalls().size() - 1]; + for (int n = 0; n < eventDistances.length; n++) { + eventDistances[n] = Math.abs(calledHaplotypes.getCalls().get(n + 1).getStart() - lastPosition); + } + eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, MathUtils.arrayMin(eventDistances)); + eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, MathUtils.arrayMax(eventDistances)); + } else { //TODO: putting null is a hack -- we should remove this and update the integration test md5s + eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, null); + eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, null); + } + + final List annotatedCalls = new ArrayList<>(); + // can we do this with the Annotation classes instead? 
+ for (final VariantContext originalVC : calledHaplotypes.getCalls()) { + final VariantContextBuilder vcb = new VariantContextBuilder(originalVC); + + final Map attributes = new HashMap<>(originalVC.getAttributes()); + attributes.putAll(eventDistanceAttributes); + vcb.attributes(attributes); + + final Set filters = new HashSet<>(originalVC.getFilters()); + + final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1); + if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) { + filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME); + } + + // if we are in artifact detection mode, apply the thresholds for the LOD scores + if (!MTAC.ARTIFACT_DETECTION_MODE) { + filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes)); + } + + vcb.filters(filters.isEmpty() ? VariantContext.PASSES_FILTERS : filters); + + if (printTCGAsampleHeader) { + final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make(); + final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make(); + vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype)); + } + annotatedCalls.add(vcb.make()); + } + return annotatedCalls; + } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java index 20940c74f..a0b259e1e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java @@ -51,10 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2; -import com.google.java.contract.Ensures; -import htsjdk.samtools.util.StringUtil; import 
htsjdk.variant.variantcontext.*; -import org.apache.commons.collections.ListUtils; import org.apache.commons.lang.mutable.MutableDouble; import org.apache.commons.lang.mutable.MutableInt; import org.apache.log4j.Logger; @@ -63,7 +60,6 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvid import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; @@ -87,6 +83,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { private final String matchedNormalSampleName; private final String DEBUG_READ_NAME; + //Mutect2 does not run in GGA mode + private static final List NO_GIVEN_ALLELES = Collections.EMPTY_LIST; + // {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. 
Thus we make a dummy private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() { public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; } @@ -109,7 +108,8 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { this.DEBUG_READ_NAME = DEBUG_READ_NAME; // coverage related initialization - final double errorProbability = Math.pow(10, -MTAC.POWER_CONSTANT_QSCORE / 10); + //TODO: in GATK4, use a QualityUtils method + final double errorProbability = Math.pow(10, -MTAC.POWER_CONSTANT_QSCORE/10); strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f); } @@ -118,48 +118,40 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling * * The list of samples we're working with is obtained from the readLikelihoods - - * @param haplotypes Haplotypes to assign likelihoods to * @param readLikelihoods Map from reads->(haplotypes,likelihoods) * @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods. * @param ref Reference bytes at active region * @param refLoc Corresponding active region genome location * @param activeRegionWindow Active window - * @param activeAllelesToGenotype Alleles to genotype * * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes * */ - // TODO - can this be refactored? this is hard to follow! 
public CalledHaplotypes callMutations ( - final List haplotypes, final ReadLikelihoods readLikelihoods, final Map originalNormalReadQualities, final Map> perSampleFilteredReadList, final byte[] ref, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow, - final RefMetaDataTracker tracker, - final List activeAllelesToGenotype) { - // sanity check input arguments - if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); + final RefMetaDataTracker tracker) { + //TODO: in GATK4 use Utils.nonNull if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods); if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); - if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); + final List haplotypes = readLikelihoods.alleles(); + + // Somatic Tumor/Normal Sample Handling if (!readLikelihoods.samples().contains(tumorSampleName)) { - throw new IllegalArgumentException("readLikelihoods does not contain the tumor sample "+ tumorSampleName); + throw new IllegalArgumentException("readLikelihoods does not contain the tumor sample " + tumorSampleName); } - - // if we don't have the normal sample, we are in tumor only mode - // TODO: check in MuTect2.java for code we can skip when in tumor only mode final boolean hasNormal = matchedNormalSampleName != null; // update the haplotypes so we're ready to call, 
getting the ordered list of positions on the reference // that carry events among the haplotypes - final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, activeAllelesToGenotype); + final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, NO_GIVEN_ALLELES); // Walk along each position in the key set and create each event to be outputted final Set calledHaplotypes = new HashSet<>(); @@ -170,7 +162,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { continue; } - final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, NO_GIVEN_ALLELES); if( eventsAtThisLoc.isEmpty() ) { continue; } @@ -203,14 +195,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { * If we just want a map of Alleles to Haplotypes, we should be able to do so directly; no need for intermediate maps, which just complicates the code. **/ - final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); // converting ReadLikelihoods to ReadLikeliHoods ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); - // LDG: do we want to do this before or after pulling out overlapping reads? - // TODO: do we want this at all? How does downsampling help? + //LDG: do we want to do this before or after pulling out overlapping reads? if (MTAC.isSampleContaminationPresent()) { readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination()); } @@ -232,24 +222,18 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef()); } - // TODO: another good breakpoint e.g. 
compute normal LOD/set thresholds // TODO: anything related to normal should be encapsulated in Optional - // Normal LOD must exceed this threshold for the variant to make it in the vcf - // TODO: variable name too log - double normalLodThresholdForVCF = -Double.MIN_VALUE; - // A variant candidate whose normal LOD is below this threshold will be filtered as 'germline_risk' // This is a more stringent threshold than normalLodThresholdForVCF - double normalLodFilterThreshold = -Double.MIN_VALUE; - + double normalLodFilterThreshold = -Double.MAX_VALUE; PerReadAlleleLikelihoodMap normalPRALM = null; final PerAlleleCollection normalLods = PerAlleleCollection.createPerAltAlleleCollection(); + // if normal bam is available, compute normal LOD // TODO: this if statement should be a standalone method for computing normal LOD // TODO: then we can do something like normalLodThreshold = hasNormal ? thisMethod() : Optional.empty() - // if normal bam is available, compute normal LOD if (hasNormal) { normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName)); filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true); @@ -258,10 +242,8 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc); final Collection cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc); final Collection dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc); - final boolean germlineAtRisk = !dbsnpVC.isEmpty() && cosmicVC.isEmpty(); - normalLodThresholdForVCF = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; normalLodFilterThreshold = germlineAtRisk ? 
MTAC.NORMAL_DBSNP_LOD_THRESHOLD : MTAC.NORMAL_LOD_THRESHOLD; // compute normal LOD = LL(X|REF)/LL(X|ALT) where REF is the diploid HET with AF = 0.5 @@ -284,7 +266,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { for (final Allele altAllele : mergedVC.getAlternateAlleles()) { final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD; - final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= normalLodThresholdForVCF : true; + final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= MTAC.INITIAL_NORMAL_LOD_THRESHOLD : true; if (passesTumorLodThreshold && passesNormalLodThreshold) { numPassingAlts++; allelesThatPassThreshold.add(altAllele); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java index b7ca42c4f..1c99946db 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java @@ -70,8 +70,6 @@ public class MuTect2IntegrationTest extends WalkerTest { final static String DREAM3_TP_INTERVALS_FILE = privateTestDir + "m2_dream3.tp.intervals"; final static String DREAM3_FP_INTERVALS_FILE = privateTestDir + "m2_dream3.fp.intervals"; - - final String commandLine = "-T MuTect2 --no_cmdline_in_header -dt NONE --disableDithering -alwaysloadVectorHMM -pairHMM LOGLESS_CACHING -ip 50 -R %s --dbsnp %s --cosmic %s --normal_panel %s -I:tumor %s -I:normal %s -L %s"; @@ -160,9 +158,6 @@ public class MuTect2IntegrationTest extends WalkerTest { M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", 
"c25e48edd704bbb436cd6456d9f47d8b"); } - /** - * Test that tumor-only mode does not create an empty vcf - */ @Test public void testTumorOnly(){ m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "2af2253b1f09ea8fd354e1bf2c4612f0"); @@ -177,6 +172,4 @@ public class MuTect2IntegrationTest extends WalkerTest { public void testClusteredReadPositionFilter() { M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "b44c23af7de84f96d2371db25d29aba2"); } - - }