More small refactorings of Mutect2 code

This commit is contained in:
David Benjamin 2016-08-25 11:54:02 -04:00
parent 814ad21006
commit 601c26a592
3 changed files with 116 additions and 245 deletions

View File

@ -71,6 +71,7 @@ import org.broadinstitute.gatk.tools.walkers.haplotypecaller.*;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState;
@ -189,20 +190,18 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
protected SampleList samplesList; protected SampleList samplesList;
protected boolean printTCGAsampleHeader = false; protected boolean printTCGAsampleHeader = false;
// fasta reference reader to supplement the edges of the reference sequence
protected CachingIndexedFastaSequenceFile referenceReader; protected CachingIndexedFastaSequenceFile referenceReader;
// the assembly engine
protected LocalAssemblyEngine assemblyEngine = null; protected LocalAssemblyEngine assemblyEngine = null;
// the likelihoods engine
protected ReadLikelihoodCalculationEngine likelihoodCalculationEngine = null; protected ReadLikelihoodCalculationEngine likelihoodCalculationEngine = null;
protected SomaticGenotypingEngine genotypingEngine = null;
// the genotyping engine private HaplotypeBAMWriter haplotypeBAMWriter;
protected HaplotypeCallerGenotypingEngine genotypingEngine = null;
private byte MIN_TAIL_QUALITY; private byte MIN_TAIL_QUALITY;
private double log10GlobalReadMismappingRate; private double log10GlobalReadMismappingRate;
private static final int REFERENCE_PADDING = 500;
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
@ArgumentCollection @ArgumentCollection
protected M2ArgumentCollection MTAC = new M2ArgumentCollection(); protected M2ArgumentCollection MTAC = new M2ArgumentCollection();
@ -221,10 +220,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
@Argument(fullName = "MQ_filtering_level", shortName = "MQthreshold", required = false, doc="Set an alternate MQ threshold for debugging") @Argument(fullName = "MQ_filtering_level", shortName = "MQthreshold", required = false, doc="Set an alternate MQ threshold for debugging")
final public int MQthreshold = 20; final public int MQthreshold = 20;
public RodBinding<VariantContext> getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; }
private HaplotypeBAMWriter haplotypeBAMWriter;
/** /**
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated * If a call overlaps with a record from the provided comp track, the INFO field will be annotated
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
@ -291,12 +286,14 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
protected boolean justDetermineActiveRegions = false; protected boolean justDetermineActiveRegions = false;
// reference base padding size /**
private static final int REFERENCE_PADDING = 500; * As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior.
*/
@Advanced
@Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false)
protected boolean doNotRunPhysicalPhasing = false;
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; public RodBinding<VariantContext> getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; }
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
@Override @Override
public void initialize() { public void initialize() {
@ -388,7 +385,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC, genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC,
tumorSampleName, normalSampleName, DEBUG_READ_NAME); tumorSampleName, normalSampleName, DEBUG_READ_NAME);
genotypingEngine.setCrossHaplotypeEventMerger(variantMerger); genotypingEngine.setCrossHaplotypeEventMerger(variantMerger);
genotypingEngine.setAnnotationEngine(annotationEngine); genotypingEngine.setAnnotationEngine(annotationEngine);
@ -438,7 +434,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
headerInfo.addAll(getM2HeaderLines()); headerInfo.addAll(getM2HeaderLines());
headerInfo.addAll(getSampleHeaderLines()); headerInfo.addAll(getSampleHeaderLines());
final List<String> outputSampleNames = getOutputSampleNames(); // if printTCGAsampleHeader, we already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize()
final List<String> outputSampleNames = printTCGAsampleHeader ? Arrays.asList("TUMOR", "NORMAL") : SampleListUtils.asList(samplesList);
vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames)); vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames));
@ -507,19 +504,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
return sampleLines; return sampleLines;
} }
private List<String> getOutputSampleNames(){
if (printTCGAsampleHeader) {
//Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize()
final List<String> sampleNamePlaceholders = new ArrayList<>(2);
sampleNamePlaceholders.add("TUMOR");
sampleNamePlaceholders.add("NORMAL");
return sampleNamePlaceholders;
}
else {
return SampleListUtils.asList(samplesList);
}
}
@Override @Override
public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
if( context == null || context.getBasePileup().isEmpty() ) if( context == null || context.getBasePileup().isEmpty() )
@ -564,7 +548,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
prob = 1; prob = 1;
logger.debug("At " + ref.getLocus().toString() + " tlod: " + tumorLod + " and no-normal calling"); logger.debug("At " + ref.getLocus().toString() + " tlod: " + tumorLod + " and no-normal calling");
} }
} }
return new ActivityProfileState( ref.getLocus(), prob, ActivityProfileState.Type.NONE, null); return new ActivityProfileState( ref.getLocus(), prob, ActivityProfileState.Type.NONE, null);
@ -573,16 +556,13 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
private final static List<VariantContext> NO_CALLS = Collections.emptyList(); private final static List<VariantContext> NO_CALLS = Collections.emptyList();
@Override @Override
public List<VariantContext> map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) { public List<VariantContext> map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) {
if ( justDetermineActiveRegions ) if ( justDetermineActiveRegions ) {
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
return NO_CALLS; return NO_CALLS;
} else if( !originalActiveRegion.isActive() ) {
if( !originalActiveRegion.isActive() )
// Not active so nothing to do!
return referenceModelForNoVariation(originalActiveRegion, true); return referenceModelForNoVariation(originalActiveRegion, true);
} else if( originalActiveRegion.size() == 0 ) {
// No reads here so nothing to do! return referenceModelForNoVariation(originalActiveRegion, true);
if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } }
logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region"); logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region");
// create the assembly using just high quality reads (Q20 or higher). We want to use lower // create the assembly using just high quality reads (Q20 or higher). We want to use lower
@ -593,30 +573,25 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
assemblyActiveRegion.add(rec); assemblyActiveRegion.add(rec);
} }
} }
logReadInfo(DEBUG_READ_NAME, assemblyActiveRegion.getReads(), "Present in assembly active region"); logReadInfo(DEBUG_READ_NAME, assemblyActiveRegion.getReads(), "Present in assembly active region");
// run the local assembler, getting back a collection of information on how we should proceed // run the local assembler, getting back a collection of information on how we should proceed
final List<VariantContext> givenAlleles = new ArrayList<>(); final AssemblyResultSet untrimmedAssemblyResult = assembleReads(assemblyActiveRegion, Collections.EMPTY_LIST);
final AssemblyResultSet untrimmedAssemblyResult = assembleReads(assemblyActiveRegion, givenAlleles);
final TreeSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(); final TreeSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents();
// TODO - line below might be unnecessary : it might be that assemblyResult will always have those alleles anyway
// TODO - so check and remove if that is the case:
allVariationEvents.addAll(givenAlleles);
final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents); final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents);
if (!trimmingResult.isVariationPresent()) {
return referenceModelForNoVariation(originalActiveRegion, false);
// Stop the trimming madness!!! }
if (!trimmingResult.isVariationPresent())
return referenceModelForNoVariation(originalActiveRegion,false);
logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result"); logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result");
final AssemblyResultSet assemblyResult = final AssemblyResultSet assemblyResult =
trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult; trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult;
// it is conceivable that the region is active yet has no events upon assembling only the well-mapped reads
if( ! assemblyResult.isVariationPresent() ) {
return referenceModelForNoVariation(originalActiveRegion, false);
}
final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping"); logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping");
@ -624,7 +599,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads()); haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads());
} }
// filter out reads from genotyping which fail mapping quality based criteria // filter out reads from genotyping which fail mapping quality based criteria
//TODO - why don't we do this before any assembly is done? Why not just once at the beginning of this method //TODO - why don't we do this before any assembly is done? Why not just once at the beginning of this method
//TODO - on the originalActiveRegion? //TODO - on the originalActiveRegion?
@ -639,17 +613,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample(filteredReads); final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample(filteredReads);
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads"); logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads");
// abort early if something is out of the acceptable range
// TODO is this ever true at this point??? perhaps GGA. Need to check.
if( ! assemblyResult.isVariationPresent() )
return referenceModelForNoVariation(originalActiveRegion, false);
// TODO is this ever true at this point??? perhaps GGA. Need to check.
if( regionForGenotyping.size() == 0 ) {
// no reads remain after filtering so nothing else to do!
return referenceModelForNoVariation(originalActiveRegion, false);
}
// evaluate each sample's reads against all haplotypes // evaluate each sample's reads against all haplotypes
final List<Haplotype> haplotypes = assemblyResult.getHaplotypeList(); final List<Haplotype> haplotypes = assemblyResult.getHaplotypeList();
@ -658,6 +621,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample"); logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample");
} }
//TODO: this should be written as
//TODO final Map<String, Integer> ARreads_origNormalMQ = regionForGenotyping.getReads().stream()
//TODO .collect(Collectors.toMap(GATKSAMRecord::getReadName, GATKSAMRecord::getMappingQuality));
//TODO: but for some reason sometimes streamifying Mutect2 code causes a MalformedWalkerArgumentsException
//TODO: this will probably not occur after the port to GATK4
final HashMap<String, Integer> ARreads_origNormalMQ = new HashMap<>(); final HashMap<String, Integer> ARreads_origNormalMQ = new HashMap<>();
for (final GATKSAMRecord read : regionForGenotyping.getReads()) { for (final GATKSAMRecord read : regionForGenotyping.getReads()) {
ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality()); ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality());
@ -671,14 +639,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
logger.debug("Computing read likelihoods with " + regionForGenotyping.getReads().size() + " reads against " + haplotypes.size() + " haplotypes across region " + assemblyResult.getRegionForGenotyping().toString()); logger.debug("Computing read likelihoods with " + regionForGenotyping.getReads().size() + " reads against " + haplotypes.size() + " haplotypes across region " + assemblyResult.getRegionForGenotyping().toString());
final ReadLikelihoods<Haplotype> readLikelihoods = likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads);
// Calculate the likelihoods: CPU intensive part.
final ReadLikelihoods<Haplotype> readLikelihoods =
likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads);
// Realign reads to their best haplotype.
// KCIBUL: this is new stuff -- review it!
final Map<GATKSAMRecord,GATKSAMRecord> readRealignments = realignReadsToTheirBestHaplotype(readLikelihoods, assemblyResult.getReferenceHaplotype(), assemblyResult.getPaddedReferenceLoc()); final Map<GATKSAMRecord,GATKSAMRecord> readRealignments = realignReadsToTheirBestHaplotype(readLikelihoods, assemblyResult.getReferenceHaplotype(), assemblyResult.getPaddedReferenceLoc());
if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) { if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) {
@ -693,22 +655,14 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods"); logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods");
} }
// Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.callMutations(
// was a bad interaction between that selection and the marginalization that happens over each event when computing
// GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the
// haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included
// in the genotyping, but we lose information if we select down to a few haplotypes. [EB]
final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = ((SomaticGenotypingEngine)genotypingEngine).callMutations(
haplotypes,
readLikelihoods, readLikelihoods,
ARreads_origNormalMQ, ARreads_origNormalMQ,
perSampleFilteredReadList, perSampleFilteredReadList,
assemblyResult.getFullReferenceWithPadding(), assemblyResult.getFullReferenceWithPadding(),
assemblyResult.getPaddedReferenceLoc(), assemblyResult.getPaddedReferenceLoc(),
regionForGenotyping.getLocation(), regionForGenotyping.getLocation(),
metaDataTracker, metaDataTracker);
givenAlleles);
if ( MTAC.bamWriter != null ) { if ( MTAC.bamWriter != null ) {
final Set<Haplotype> calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes()); final Set<Haplotype> calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes());
@ -727,66 +681,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
return annotateVCs(calledHaplotypes, metaDataTracker);
final List<VariantContext> annotatedCalls = new ArrayList<>();
final int eventCount = calledHaplotypes.getCalls().size();
Integer minEventDistance = null;
Integer maxEventDistance = null;
Integer lastPosition = null;
for (final VariantContext vc : calledHaplotypes.getCalls()) {
if (lastPosition == null) {
lastPosition = vc.getStart();
} else {
final int dist = Math.abs(vc.getStart() - lastPosition);
if (maxEventDistance == null || dist > maxEventDistance) {
maxEventDistance = dist;
}
if (minEventDistance == null || dist < minEventDistance) {
minEventDistance = dist;
}
}
}
final Map<String, Object> eventDistanceAttributes = new HashMap<>();
eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount);
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance);
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance);
// can we do this with the Annotation classes instead?
for (final VariantContext originalVC : calledHaplotypes.getCalls()) {
final VariantContextBuilder vcb = new VariantContextBuilder(originalVC);
final Map<String, Object> attributes = new HashMap<>(originalVC.getAttributes());
attributes.putAll(eventDistanceAttributes);
vcb.attributes(attributes);
final Set<String> filters = new HashSet<>(originalVC.getFilters());
final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1);
if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) {
filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME);
}
// if we are in artifact detection mode, apply the thresholds for the LOD scores
if (!MTAC.ARTIFACT_DETECTION_MODE) {
filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes));
}
vcb.filters(filters.isEmpty() ? VariantContext.PASSES_FILTERS : filters);
if (printTCGAsampleHeader) {
final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make();
final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make();
vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype));
}
annotatedCalls.add(vcb.make());
}
// TODO: find a better place for this debug message
// logger.info("We had " + TumorPowerCalculator.numCacheHits + " hits in strand artifact power calculation");
return annotatedCalls;
} }
private Set<String> calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map<String, Object> eventDistanceAttributes) { private Set<String> calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map<String, Object> eventDistanceAttributes) {
@ -828,15 +723,10 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
if ( (normalAltCounts > MTAC.MAX_ALT_ALLELES_IN_NORMAL_COUNT || normalF > MTAC.MAX_ALT_ALLELE_IN_NORMAL_FRACTION ) && normalAltQualityScoreSum > MTAC.MAX_ALT_ALLELES_IN_NORMAL_QSCORE_SUM) { if ( (normalAltCounts > MTAC.MAX_ALT_ALLELES_IN_NORMAL_COUNT || normalF > MTAC.MAX_ALT_ALLELE_IN_NORMAL_FRACTION ) && normalAltQualityScoreSum > MTAC.MAX_ALT_ALLELES_IN_NORMAL_QSCORE_SUM) {
filters.add(GATKVCFConstants.ALT_ALLELE_IN_NORMAL_FILTER_NAME); filters.add(GATKVCFConstants.ALT_ALLELE_IN_NORMAL_FILTER_NAME);
} else { } else if ( eventCount > 1 && normalAltCounts >= 1) {
filters.add(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME);
// NOTE: does normal alt counts presume the normal had all these events in CIS? } else if (eventCount >= 3) {
if ( eventCount > 1 && normalAltCounts >= 1) { filters.add(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME);
filters.add(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME);
} else if (eventCount >= 3) {
filters.add(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME);
}
} }
// STR contractions, that is the deletion of one repeat unit of a short repeat (>1bp repeat unit) // STR contractions, that is the deletion of one repeat unit of a short repeat (>1bp repeat unit)
@ -861,7 +751,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
filters.add(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME); filters.add(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME);
} }
// clustered read position filter // clustered read position filter
if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){ if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){
final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY); final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY);
@ -878,7 +767,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
return filters; return filters;
} }
private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; private final static byte REF_MODEL_DELETION_QUAL = (byte) 30;
/** /**
* Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
@ -916,6 +804,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
return (normalSampleName != null); return (normalSampleName != null);
} }
//TODO: streamify in GATK4
protected int getCountOfNonRefEvents(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) { protected int getCountOfNonRefEvents(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) {
int i=0; int i=0;
for( final PileupElement p : pileup ) { for( final PileupElement p : pileup ) {
@ -960,37 +849,21 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
protected Set<GATKSAMRecord> filterNonPassingReads( final ActiveRegion activeRegion) { protected Set<GATKSAMRecord> filterNonPassingReads( final ActiveRegion activeRegion) {
final Set<GATKSAMRecord> readsToRemove = new LinkedHashSet<>(); final Set<GATKSAMRecord> readsToRemove = new LinkedHashSet<>();
for( final GATKSAMRecord rec : activeRegion.getReads() ) { for( final GATKSAMRecord rec : activeRegion.getReads() ) {
//TODO: Takuto points out that this is questionable. Let's think hard about it.
// KCIBUL: only perform read quality filtering on tumor reads... // KCIBUL: only perform read quality filtering on tumor reads...
if (isReadFromNormal(rec)) { if (isReadFromNormal(rec)) {
if( rec.getReadLength() < MIN_READ_LENGTH ) { if( rec.getReadLength() < MIN_READ_LENGTH ) {
readsToRemove.add(rec); readsToRemove.add(rec);
} }
} else if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < MQthreshold || BadMateFilter.hasBadMate(rec) ||
} else {
if( rec.getReadLength() < MIN_READ_LENGTH ||
rec.getMappingQuality() < MQthreshold ||
BadMateFilter.hasBadMate(rec) ||
(keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
readsToRemove.add(rec); readsToRemove.add(rec);
}
} }
} }
activeRegion.removeAll(readsToRemove); activeRegion.removeAll(readsToRemove);
return readsToRemove; return readsToRemove;
} }
private static GATKSAMRecord findReadByName(final Collection<GATKSAMRecord> reads, final String name) {
for(final GATKSAMRecord read : reads) {
if (name.equals(read.getReadName())) return read;
}
return null;
}
/** /**
* Instantiates the appropriate likelihood calculation engine. * Instantiates the appropriate likelihood calculation engine.
* *
@ -1025,13 +898,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
// enable non primary and extended reads in the active region // enable non primary and extended reads in the active region
@Override @Override
public EnumSet<ActiveRegionReadState> desiredReadStates() { public EnumSet<ActiveRegionReadState> desiredReadStates() {
// if ( includeUnmappedReads ) return EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED);
// throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional");
// else
return EnumSet.of(
ActiveRegionReadState.PRIMARY,
ActiveRegionReadState.NONPRIMARY,
ActiveRegionReadState.EXTENDED);
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -1063,7 +930,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
logger.info("Ran local assembly on " + result + " active regions"); logger.info("Ran local assembly on " + result + " active regions");
} }
// The following are not used but are required by the AnnotatorCompatible interface // The following are not used but are required by the AnnotatorCompatible interface
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; } public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); } public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
@ -1087,15 +953,12 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc); final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc);
// Create ReadErrorCorrector object if requested - will be used within assembly engine. // Create ReadErrorCorrector object if requested - will be used within assembly engine.
ReadErrorCorrector readErrorCorrector = null; final ReadErrorCorrector readErrorCorrector = errorCorrectReads ? new ReadErrorCorrector(RTAC.kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION,
if (errorCorrectReads) RTAC.minObservationsForKmerToBeSolid, MTAC.DEBUG, fullReferenceWithPadding) : null;
readErrorCorrector = new ReadErrorCorrector(RTAC.kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, RTAC.minObservationsForKmerToBeSolid, MTAC.DEBUG, fullReferenceWithPadding);
try { try {
final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, giveAlleles,readErrorCorrector ); final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, giveAlleles,readErrorCorrector );
assemblyResultSet.debugDump(logger); assemblyResultSet.debugDump(logger);
return assemblyResultSet; return assemblyResultSet;
} catch ( final Exception e ) { } catch ( final Exception e ) {
// Capture any exception that might be thrown, and write out the assembly failure BAM if requested // Capture any exception that might be thrown, and write out the assembly failure BAM if requested
if ( captureAssemblyFailureBAM ) { if ( captureAssemblyFailureBAM ) {
@ -1151,10 +1014,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.DOWNSAMPLED, activeRegion.getReads(), downsampledReads); haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.DOWNSAMPLED, activeRegion.getReads(), downsampledReads);
} }
// handle overlapping read pairs from the same fragment
// KC: commented out as we handle overlapping read pairs in a different way...
//cleanOverlappingReadPairs(downsampledReads, normalSampleNames);
activeRegion.clearReads(); activeRegion.clearReads();
activeRegion.addAll(downsampledReads); activeRegion.addAll(downsampledReads);
activeRegion.setFinalized(true); activeRegion.setFinalized(true);
@ -1189,11 +1048,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create(perSampleReadList); final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create(perSampleReadList);
for ( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() ) for ( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() )
// in MuTect -- right now we compare the
FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair); FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair);
} }
} }
@ -1236,20 +1091,61 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
private boolean isReadFromNormal(final GATKSAMRecord rec) { private boolean isReadFromNormal(final GATKSAMRecord rec) {
return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample()); return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample());
} }
public String getTumorSampleName(){ public String getTumorSampleName(){
return tumorSampleName; return tumorSampleName;
} }
// KCIBUL: new stuff -- read up on this!! final List<VariantContext> annotateVCs(final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes, final RefMetaDataTracker metaDataTracker) {
/** final int eventCount = calledHaplotypes.getCalls().size();
* As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior. final Map<String, Object> eventDistanceAttributes = new HashMap<>(); //TODO: should be Map<String, Integer> -- see TODO below
*/ eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount);
@Advanced if (eventCount > 1) {
@Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false) final int lastPosition = calledHaplotypes.getCalls().get(0).getStart();
protected boolean doNotRunPhysicalPhasing = false; final int[] eventDistances = new int[calledHaplotypes.getCalls().size() - 1];
for (int n = 0; n < eventDistances.length; n++) {
eventDistances[n] = Math.abs(calledHaplotypes.getCalls().get(n + 1).getStart() - lastPosition);
}
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, MathUtils.arrayMin(eventDistances));
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, MathUtils.arrayMax(eventDistances));
} else { //TODO: putting null is a hack -- we should remove this and update the integration test md5s
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, null);
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, null);
}
final List<VariantContext> annotatedCalls = new ArrayList<>();
// can we do this with the Annotation classes instead?
for (final VariantContext originalVC : calledHaplotypes.getCalls()) {
final VariantContextBuilder vcb = new VariantContextBuilder(originalVC);
final Map<String, Object> attributes = new HashMap<>(originalVC.getAttributes());
attributes.putAll(eventDistanceAttributes);
vcb.attributes(attributes);
final Set<String> filters = new HashSet<>(originalVC.getFilters());
final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1);
if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) {
filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME);
}
// if we are in artifact detection mode, apply the thresholds for the LOD scores
if (!MTAC.ARTIFACT_DETECTION_MODE) {
filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes));
}
vcb.filters(filters.isEmpty() ? VariantContext.PASSES_FILTERS : filters);
if (printTCGAsampleHeader) {
final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make();
final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make();
vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype));
}
annotatedCalls.add(vcb.make());
}
return annotatedCalls;
}
} }

View File

@ -51,10 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.cancer.m2; package org.broadinstitute.gatk.tools.walkers.cancer.m2;
import com.google.java.contract.Ensures;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.*;
import org.apache.commons.collections.ListUtils;
import org.apache.commons.lang.mutable.MutableDouble; import org.apache.commons.lang.mutable.MutableDouble;
import org.apache.commons.lang.mutable.MutableInt; import org.apache.commons.lang.mutable.MutableInt;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@ -63,7 +60,6 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvid
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.commandline.RodBinding;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
@ -87,6 +83,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
private final String matchedNormalSampleName; private final String matchedNormalSampleName;
private final String DEBUG_READ_NAME; private final String DEBUG_READ_NAME;
//Mutect2 does not run in GGA mode
private static final List<VariantContext> NO_GIVEN_ALLELES = Collections.EMPTY_LIST;
// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy // {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() { private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; } public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; }
@ -109,7 +108,8 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
this.DEBUG_READ_NAME = DEBUG_READ_NAME; this.DEBUG_READ_NAME = DEBUG_READ_NAME;
// coverage related initialization // coverage related initialization
final double errorProbability = Math.pow(10, -MTAC.POWER_CONSTANT_QSCORE / 10); //TODO: in GATK4, use a QualityUtils method
final double errorProbability = Math.pow(10, -MTAC.POWER_CONSTANT_QSCORE/10);
strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f); strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f);
} }
@ -118,48 +118,40 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
* genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling
* *
* The list of samples we're working with is obtained from the readLikelihoods * The list of samples we're working with is obtained from the readLikelihoods
* @param haplotypes Haplotypes to assign likelihoods to
* @param readLikelihoods Map from reads->(haplotypes,likelihoods) * @param readLikelihoods Map from reads->(haplotypes,likelihoods)
* @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods. * @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods.
* @param ref Reference bytes at active region * @param ref Reference bytes at active region
* @param refLoc Corresponding active region genome location * @param refLoc Corresponding active region genome location
* @param activeRegionWindow Active window * @param activeRegionWindow Active window
* @param activeAllelesToGenotype Alleles to genotype
* *
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
* *
*/ */
// TODO - can this be refactored? this is hard to follow!
public CalledHaplotypes callMutations ( public CalledHaplotypes callMutations (
final List<Haplotype> haplotypes,
final ReadLikelihoods<Haplotype> readLikelihoods, final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, Integer> originalNormalReadQualities, final Map<String, Integer> originalNormalReadQualities,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList, final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref, final byte[] ref,
final GenomeLoc refLoc, final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow, final GenomeLoc activeRegionWindow,
final RefMetaDataTracker tracker, final RefMetaDataTracker tracker) {
final List<VariantContext> activeAllelesToGenotype) { //TODO: in GATK4 use Utils.nonNull
// sanity check input arguments
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods); if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
final List<Haplotype> haplotypes = readLikelihoods.alleles();
// Somatic Tumor/Normal Sample Handling
if (!readLikelihoods.samples().contains(tumorSampleName)) { if (!readLikelihoods.samples().contains(tumorSampleName)) {
throw new IllegalArgumentException("readLikelihoods does not contain the tumor sample "+ tumorSampleName); throw new IllegalArgumentException("readLikelihoods does not contain the tumor sample " + tumorSampleName);
} }
// if we don't have the normal sample, we are in tumor only mode
// TODO: check in MuTect2.java for code we can skip when in tumor only mode
final boolean hasNormal = matchedNormalSampleName != null; final boolean hasNormal = matchedNormalSampleName != null;
// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
// that carry events among the haplotypes // that carry events among the haplotypes
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, activeAllelesToGenotype); final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, NO_GIVEN_ALLELES);
// Walk along each position in the key set and create each event to be outputted // Walk along each position in the key set and create each event to be outputted
final Set<Haplotype> calledHaplotypes = new HashSet<>(); final Set<Haplotype> calledHaplotypes = new HashSet<>();
@ -170,7 +162,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
continue; continue;
} }
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, NO_GIVEN_ALLELES);
if( eventsAtThisLoc.isEmpty() ) { continue; } if( eventsAtThisLoc.isEmpty() ) { continue; }
@ -203,14 +195,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
* If we just want a map of Alleles to Haplotypes, we should be able to do so directly; no need for intermediate maps, which just complicates the code. * If we just want a map of Alleles to Haplotypes, we should be able to do so directly; no need for intermediate maps, which just complicates the code.
**/ **/
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper); final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
// converting ReadLikelihoods<Haplotype> to ReadLikeliHoods<Allele> // converting ReadLikelihoods<Haplotype> to ReadLikeliHoods<Allele>
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
// LDG: do we want to do this before or after pulling out overlapping reads? //LDG: do we want to do this before or after pulling out overlapping reads?
// TODO: do we want this at all? How does downsampling help?
if (MTAC.isSampleContaminationPresent()) { if (MTAC.isSampleContaminationPresent()) {
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination()); readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
} }
@ -232,24 +222,18 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef()); tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef());
} }
// TODO: another good breakpoint e.g. compute normal LOD/set thresholds // TODO: another good breakpoint e.g. compute normal LOD/set thresholds
// TODO: anything related to normal should be encapsulated in Optional // TODO: anything related to normal should be encapsulated in Optional
// Normal LOD must exceed this threshold for the variant to make it in the vcf
// TODO: variable name too log
double normalLodThresholdForVCF = -Double.MIN_VALUE;
// A variant candidate whose normal LOD is below this threshold will be filtered as 'germline_risk' // A variant candidate whose normal LOD is below this threshold will be filtered as 'germline_risk'
// This is a more stringent threshold than normalLodThresholdForVCF // This is a more stringent threshold than normalLodThresholdForVCF
double normalLodFilterThreshold = -Double.MIN_VALUE; double normalLodFilterThreshold = -Double.MAX_VALUE;
PerReadAlleleLikelihoodMap normalPRALM = null; PerReadAlleleLikelihoodMap normalPRALM = null;
final PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection(); final PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection();
// if normal bam is available, compute normal LOD
// TODO: this if statement should be a standalone method for computing normal LOD // TODO: this if statement should be a standalone method for computing normal LOD
// TODO: then we can do something like normalLodThreshold = hasNormal ? thisMethod() : Optional.empty() // TODO: then we can do something like normalLodThreshold = hasNormal ? thisMethod() : Optional.empty()
// if normal bam is available, compute normal LOD
if (hasNormal) { if (hasNormal) {
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName)); normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true); filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
@ -258,10 +242,8 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc); final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
final Collection<VariantContext> cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc); final Collection<VariantContext> cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc);
final Collection<VariantContext> dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc); final Collection<VariantContext> dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc);
final boolean germlineAtRisk = !dbsnpVC.isEmpty() && cosmicVC.isEmpty(); final boolean germlineAtRisk = !dbsnpVC.isEmpty() && cosmicVC.isEmpty();
normalLodThresholdForVCF = MTAC.INITIAL_NORMAL_LOD_THRESHOLD;
normalLodFilterThreshold = germlineAtRisk ? MTAC.NORMAL_DBSNP_LOD_THRESHOLD : MTAC.NORMAL_LOD_THRESHOLD; normalLodFilterThreshold = germlineAtRisk ? MTAC.NORMAL_DBSNP_LOD_THRESHOLD : MTAC.NORMAL_LOD_THRESHOLD;
// compute normal LOD = LL(X|REF)/LL(X|ALT) where REF is the diploid HET with AF = 0.5 // compute normal LOD = LL(X|REF)/LL(X|ALT) where REF is the diploid HET with AF = 0.5
@ -284,7 +266,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
for (final Allele altAllele : mergedVC.getAlternateAlleles()) { for (final Allele altAllele : mergedVC.getAlternateAlleles()) {
final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD; final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD;
final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= normalLodThresholdForVCF : true; final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= MTAC.INITIAL_NORMAL_LOD_THRESHOLD : true;
if (passesTumorLodThreshold && passesNormalLodThreshold) { if (passesTumorLodThreshold && passesNormalLodThreshold) {
numPassingAlts++; numPassingAlts++;
allelesThatPassThreshold.add(altAllele); allelesThatPassThreshold.add(altAllele);

View File

@ -70,8 +70,6 @@ public class MuTect2IntegrationTest extends WalkerTest {
final static String DREAM3_TP_INTERVALS_FILE = privateTestDir + "m2_dream3.tp.intervals"; final static String DREAM3_TP_INTERVALS_FILE = privateTestDir + "m2_dream3.tp.intervals";
final static String DREAM3_FP_INTERVALS_FILE = privateTestDir + "m2_dream3.fp.intervals"; final static String DREAM3_FP_INTERVALS_FILE = privateTestDir + "m2_dream3.fp.intervals";
final String commandLine = final String commandLine =
"-T MuTect2 --no_cmdline_in_header -dt NONE --disableDithering -alwaysloadVectorHMM -pairHMM LOGLESS_CACHING -ip 50 -R %s --dbsnp %s --cosmic %s --normal_panel %s -I:tumor %s -I:normal %s -L %s"; "-T MuTect2 --no_cmdline_in_header -dt NONE --disableDithering -alwaysloadVectorHMM -pairHMM LOGLESS_CACHING -ip 50 -R %s --dbsnp %s --cosmic %s --normal_panel %s -I:tumor %s -I:normal %s -L %s";
@ -160,9 +158,6 @@ public class MuTect2IntegrationTest extends WalkerTest {
M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", "c25e48edd704bbb436cd6456d9f47d8b"); M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", "c25e48edd704bbb436cd6456d9f47d8b");
} }
/**
* Test that tumor-only mode does not create an empty vcf
*/
@Test @Test
public void testTumorOnly(){ public void testTumorOnly(){
m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "2af2253b1f09ea8fd354e1bf2c4612f0"); m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "2af2253b1f09ea8fd354e1bf2c4612f0");
@ -177,6 +172,4 @@ public class MuTect2IntegrationTest extends WalkerTest {
public void testClusteredReadPositionFilter() { public void testClusteredReadPositionFilter() {
M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "b44c23af7de84f96d2371db25d29aba2"); M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "b44c23af7de84f96d2371db25d29aba2");
} }
} }