Merge pull request #1463 from broadinstitute/db_m2
More Mutect2 refactoring
This commit is contained in:
commit
d582ee6d3c
|
|
@ -71,6 +71,7 @@ import org.broadinstitute.gatk.tools.walkers.haplotypecaller.*;
|
||||||
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
|
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
|
||||||
import org.broadinstitute.gatk.utils.GenomeLoc;
|
import org.broadinstitute.gatk.utils.GenomeLoc;
|
||||||
import org.broadinstitute.gatk.utils.GenomeLocParser;
|
import org.broadinstitute.gatk.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.gatk.utils.MathUtils;
|
||||||
import org.broadinstitute.gatk.utils.QualityUtils;
|
import org.broadinstitute.gatk.utils.QualityUtils;
|
||||||
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
|
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
|
||||||
import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState;
|
import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState;
|
||||||
|
|
@ -189,20 +190,18 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
protected SampleList samplesList;
|
protected SampleList samplesList;
|
||||||
protected boolean printTCGAsampleHeader = false;
|
protected boolean printTCGAsampleHeader = false;
|
||||||
|
|
||||||
// fasta reference reader to supplement the edges of the reference sequence
|
|
||||||
protected CachingIndexedFastaSequenceFile referenceReader;
|
protected CachingIndexedFastaSequenceFile referenceReader;
|
||||||
|
|
||||||
// the assembly engine
|
|
||||||
protected LocalAssemblyEngine assemblyEngine = null;
|
protected LocalAssemblyEngine assemblyEngine = null;
|
||||||
|
|
||||||
// the likelihoods engine
|
|
||||||
protected ReadLikelihoodCalculationEngine likelihoodCalculationEngine = null;
|
protected ReadLikelihoodCalculationEngine likelihoodCalculationEngine = null;
|
||||||
|
protected SomaticGenotypingEngine genotypingEngine = null;
|
||||||
// the genotyping engine
|
private HaplotypeBAMWriter haplotypeBAMWriter;
|
||||||
protected HaplotypeCallerGenotypingEngine genotypingEngine = null;
|
|
||||||
|
|
||||||
private byte MIN_TAIL_QUALITY;
|
private byte MIN_TAIL_QUALITY;
|
||||||
private double log10GlobalReadMismappingRate;
|
private double log10GlobalReadMismappingRate;
|
||||||
|
private static final int REFERENCE_PADDING = 500;
|
||||||
|
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
|
||||||
|
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
|
||||||
|
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
|
||||||
|
|
||||||
@ArgumentCollection
|
@ArgumentCollection
|
||||||
protected M2ArgumentCollection MTAC = new M2ArgumentCollection();
|
protected M2ArgumentCollection MTAC = new M2ArgumentCollection();
|
||||||
|
|
@ -221,10 +220,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
@Argument(fullName = "MQ_filtering_level", shortName = "MQthreshold", required = false, doc="Set an alternate MQ threshold for debugging")
|
@Argument(fullName = "MQ_filtering_level", shortName = "MQthreshold", required = false, doc="Set an alternate MQ threshold for debugging")
|
||||||
final public int MQthreshold = 20;
|
final public int MQthreshold = 20;
|
||||||
|
|
||||||
|
|
||||||
public RodBinding<VariantContext> getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; }
|
|
||||||
|
|
||||||
private HaplotypeBAMWriter haplotypeBAMWriter;
|
|
||||||
/**
|
/**
|
||||||
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated
|
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated
|
||||||
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
|
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
|
||||||
|
|
@ -291,12 +286,14 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
|
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
|
||||||
protected boolean justDetermineActiveRegions = false;
|
protected boolean justDetermineActiveRegions = false;
|
||||||
|
|
||||||
// reference base padding size
|
/**
|
||||||
private static final int REFERENCE_PADDING = 500;
|
* As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior.
|
||||||
|
*/
|
||||||
|
@Advanced
|
||||||
|
@Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false)
|
||||||
|
protected boolean doNotRunPhysicalPhasing = false;
|
||||||
|
|
||||||
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
|
public RodBinding<VariantContext> getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; }
|
||||||
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
|
|
||||||
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
@ -388,7 +385,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
|
|
||||||
genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC,
|
genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC,
|
||||||
tumorSampleName, normalSampleName, DEBUG_READ_NAME);
|
tumorSampleName, normalSampleName, DEBUG_READ_NAME);
|
||||||
|
|
||||||
genotypingEngine.setCrossHaplotypeEventMerger(variantMerger);
|
genotypingEngine.setCrossHaplotypeEventMerger(variantMerger);
|
||||||
genotypingEngine.setAnnotationEngine(annotationEngine);
|
genotypingEngine.setAnnotationEngine(annotationEngine);
|
||||||
|
|
||||||
|
|
@ -438,7 +434,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
headerInfo.addAll(getM2HeaderLines());
|
headerInfo.addAll(getM2HeaderLines());
|
||||||
headerInfo.addAll(getSampleHeaderLines());
|
headerInfo.addAll(getSampleHeaderLines());
|
||||||
|
|
||||||
final List<String> outputSampleNames = getOutputSampleNames();
|
// if printTCGAsampleHeader, we already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize()
|
||||||
|
final List<String> outputSampleNames = printTCGAsampleHeader ? Arrays.asList("TUMOR", "NORMAL") : SampleListUtils.asList(samplesList);
|
||||||
|
|
||||||
vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames));
|
vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames));
|
||||||
|
|
||||||
|
|
@ -507,19 +504,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
return sampleLines;
|
return sampleLines;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<String> getOutputSampleNames(){
|
|
||||||
if (printTCGAsampleHeader) {
|
|
||||||
//Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize()
|
|
||||||
final List<String> sampleNamePlaceholders = new ArrayList<>(2);
|
|
||||||
sampleNamePlaceholders.add("TUMOR");
|
|
||||||
sampleNamePlaceholders.add("NORMAL");
|
|
||||||
return sampleNamePlaceholders;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return SampleListUtils.asList(samplesList);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||||
if( context == null || context.getBasePileup().isEmpty() )
|
if( context == null || context.getBasePileup().isEmpty() )
|
||||||
|
|
@ -564,7 +548,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
prob = 1;
|
prob = 1;
|
||||||
logger.debug("At " + ref.getLocus().toString() + " tlod: " + tumorLod + " and no-normal calling");
|
logger.debug("At " + ref.getLocus().toString() + " tlod: " + tumorLod + " and no-normal calling");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return new ActivityProfileState( ref.getLocus(), prob, ActivityProfileState.Type.NONE, null);
|
return new ActivityProfileState( ref.getLocus(), prob, ActivityProfileState.Type.NONE, null);
|
||||||
|
|
@ -573,16 +556,13 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
private final static List<VariantContext> NO_CALLS = Collections.emptyList();
|
private final static List<VariantContext> NO_CALLS = Collections.emptyList();
|
||||||
@Override
|
@Override
|
||||||
public List<VariantContext> map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) {
|
public List<VariantContext> map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) {
|
||||||
if ( justDetermineActiveRegions )
|
if ( justDetermineActiveRegions ) {
|
||||||
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
|
|
||||||
return NO_CALLS;
|
return NO_CALLS;
|
||||||
|
} else if( !originalActiveRegion.isActive() ) {
|
||||||
if( !originalActiveRegion.isActive() )
|
|
||||||
// Not active so nothing to do!
|
|
||||||
return referenceModelForNoVariation(originalActiveRegion, true);
|
return referenceModelForNoVariation(originalActiveRegion, true);
|
||||||
|
} else if( originalActiveRegion.size() == 0 ) {
|
||||||
// No reads here so nothing to do!
|
return referenceModelForNoVariation(originalActiveRegion, true);
|
||||||
if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); }
|
}
|
||||||
logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region");
|
logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region");
|
||||||
|
|
||||||
// create the assembly using just high quality reads (Q20 or higher). We want to use lower
|
// create the assembly using just high quality reads (Q20 or higher). We want to use lower
|
||||||
|
|
@ -593,30 +573,25 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
assemblyActiveRegion.add(rec);
|
assemblyActiveRegion.add(rec);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logReadInfo(DEBUG_READ_NAME, assemblyActiveRegion.getReads(), "Present in assembly active region");
|
logReadInfo(DEBUG_READ_NAME, assemblyActiveRegion.getReads(), "Present in assembly active region");
|
||||||
|
|
||||||
// run the local assembler, getting back a collection of information on how we should proceed
|
// run the local assembler, getting back a collection of information on how we should proceed
|
||||||
final List<VariantContext> givenAlleles = new ArrayList<>();
|
final AssemblyResultSet untrimmedAssemblyResult = assembleReads(assemblyActiveRegion, Collections.EMPTY_LIST);
|
||||||
final AssemblyResultSet untrimmedAssemblyResult = assembleReads(assemblyActiveRegion, givenAlleles);
|
|
||||||
|
|
||||||
|
|
||||||
final TreeSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents();
|
final TreeSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents();
|
||||||
// TODO - line bellow might be unecessary : it might be that assemblyResult will always have those alleles anyway
|
|
||||||
// TODO - so check and remove if that is the case:
|
|
||||||
allVariationEvents.addAll(givenAlleles);
|
|
||||||
|
|
||||||
final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents);
|
final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents);
|
||||||
|
if (!trimmingResult.isVariationPresent()) {
|
||||||
|
return referenceModelForNoVariation(originalActiveRegion, false);
|
||||||
// Stop the trimming madness!!!
|
}
|
||||||
if (!trimmingResult.isVariationPresent())
|
|
||||||
return referenceModelForNoVariation(originalActiveRegion,false);
|
|
||||||
logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result");
|
logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result");
|
||||||
|
|
||||||
final AssemblyResultSet assemblyResult =
|
final AssemblyResultSet assemblyResult =
|
||||||
trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult;
|
trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult;
|
||||||
|
|
||||||
|
// it is conceivable that the region is active yet has no events upon assembling only the well-mapped reads
|
||||||
|
if( ! assemblyResult.isVariationPresent() ) {
|
||||||
|
return referenceModelForNoVariation(originalActiveRegion, false);
|
||||||
|
}
|
||||||
|
|
||||||
final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
|
final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
|
||||||
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping");
|
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping");
|
||||||
|
|
||||||
|
|
@ -624,7 +599,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads());
|
haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// filter out reads from genotyping which fail mapping quality based criteria
|
// filter out reads from genotyping which fail mapping quality based criteria
|
||||||
//TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method
|
//TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method
|
||||||
//TODO - on the originalActiveRegion?
|
//TODO - on the originalActiveRegion?
|
||||||
|
|
@ -639,17 +613,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample(filteredReads);
|
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample(filteredReads);
|
||||||
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads");
|
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads");
|
||||||
|
|
||||||
// abort early if something is out of the acceptable range
|
|
||||||
// TODO is this ever true at this point??? perhaps GGA. Need to check.
|
|
||||||
if( ! assemblyResult.isVariationPresent() )
|
|
||||||
return referenceModelForNoVariation(originalActiveRegion, false);
|
|
||||||
|
|
||||||
// TODO is this ever true at this point??? perhaps GGA. Need to check.
|
|
||||||
if( regionForGenotyping.size() == 0 ) {
|
|
||||||
// no reads remain after filtering so nothing else to do!
|
|
||||||
return referenceModelForNoVariation(originalActiveRegion, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
// evaluate each sample's reads against all haplotypes
|
// evaluate each sample's reads against all haplotypes
|
||||||
|
|
||||||
final List<Haplotype> haplotypes = assemblyResult.getHaplotypeList();
|
final List<Haplotype> haplotypes = assemblyResult.getHaplotypeList();
|
||||||
|
|
@ -658,6 +621,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample");
|
logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//TODO: this should be written as
|
||||||
|
//TODO final Map<String, Integer> ARreads_origNormalMQ = regionForGenotyping.getReads().stream()
|
||||||
|
//TODO .collect(Collectors.toMap(GATKSAMRecord::getReadName, GATKSAMRecord::getMappingQuality));
|
||||||
|
//TODO: but for some reason sometimes streamifying Mutect2 code causes a MalformedWalkerArgumentsException
|
||||||
|
//TODO: this will probably not occur after the port to GATK4
|
||||||
final HashMap<String, Integer> ARreads_origNormalMQ = new HashMap<>();
|
final HashMap<String, Integer> ARreads_origNormalMQ = new HashMap<>();
|
||||||
for (final GATKSAMRecord read : regionForGenotyping.getReads()) {
|
for (final GATKSAMRecord read : regionForGenotyping.getReads()) {
|
||||||
ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality());
|
ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality());
|
||||||
|
|
@ -671,14 +639,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.debug("Computing read likelihoods with " + regionForGenotyping.getReads().size() + " reads against " + haplotypes.size() + " haplotypes across region " + assemblyResult.getRegionForGenotyping().toString());
|
logger.debug("Computing read likelihoods with " + regionForGenotyping.getReads().size() + " reads against " + haplotypes.size() + " haplotypes across region " + assemblyResult.getRegionForGenotyping().toString());
|
||||||
|
final ReadLikelihoods<Haplotype> readLikelihoods = likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads);
|
||||||
|
|
||||||
|
|
||||||
// Calculate the likelihoods: CPU intensive part.
|
|
||||||
final ReadLikelihoods<Haplotype> readLikelihoods =
|
|
||||||
likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads);
|
|
||||||
|
|
||||||
// Realign reads to their best haplotype.
|
|
||||||
// KCIBUL: this is new stuff -- review it!
|
|
||||||
final Map<GATKSAMRecord,GATKSAMRecord> readRealignments = realignReadsToTheirBestHaplotype(readLikelihoods, assemblyResult.getReferenceHaplotype(), assemblyResult.getPaddedReferenceLoc());
|
final Map<GATKSAMRecord,GATKSAMRecord> readRealignments = realignReadsToTheirBestHaplotype(readLikelihoods, assemblyResult.getReferenceHaplotype(), assemblyResult.getPaddedReferenceLoc());
|
||||||
|
|
||||||
if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) {
|
if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) {
|
||||||
|
|
@ -693,22 +655,14 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods");
|
logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there
|
final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.callMutations(
|
||||||
// was a bad interaction between that selection and the marginalization that happens over each event when computing
|
|
||||||
// GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the
|
|
||||||
// haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included
|
|
||||||
// in the genotyping, but we lose information if we select down to a few haplotypes. [EB]
|
|
||||||
|
|
||||||
final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = ((SomaticGenotypingEngine)genotypingEngine).callMutations(
|
|
||||||
haplotypes,
|
|
||||||
readLikelihoods,
|
readLikelihoods,
|
||||||
ARreads_origNormalMQ,
|
ARreads_origNormalMQ,
|
||||||
perSampleFilteredReadList,
|
perSampleFilteredReadList,
|
||||||
assemblyResult.getFullReferenceWithPadding(),
|
assemblyResult.getFullReferenceWithPadding(),
|
||||||
assemblyResult.getPaddedReferenceLoc(),
|
assemblyResult.getPaddedReferenceLoc(),
|
||||||
regionForGenotyping.getLocation(),
|
regionForGenotyping.getLocation(),
|
||||||
metaDataTracker,
|
metaDataTracker);
|
||||||
givenAlleles);
|
|
||||||
|
|
||||||
if ( MTAC.bamWriter != null ) {
|
if ( MTAC.bamWriter != null ) {
|
||||||
final Set<Haplotype> calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes());
|
final Set<Haplotype> calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes());
|
||||||
|
|
@ -727,66 +681,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
|
if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
|
||||||
|
return annotateVCs(calledHaplotypes, metaDataTracker);
|
||||||
|
|
||||||
final List<VariantContext> annotatedCalls = new ArrayList<>();
|
|
||||||
final int eventCount = calledHaplotypes.getCalls().size();
|
|
||||||
Integer minEventDistance = null;
|
|
||||||
Integer maxEventDistance = null;
|
|
||||||
Integer lastPosition = null;
|
|
||||||
for (final VariantContext vc : calledHaplotypes.getCalls()) {
|
|
||||||
if (lastPosition == null) {
|
|
||||||
lastPosition = vc.getStart();
|
|
||||||
} else {
|
|
||||||
final int dist = Math.abs(vc.getStart() - lastPosition);
|
|
||||||
if (maxEventDistance == null || dist > maxEventDistance) {
|
|
||||||
maxEventDistance = dist;
|
|
||||||
}
|
|
||||||
if (minEventDistance == null || dist < minEventDistance) {
|
|
||||||
minEventDistance = dist;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
final Map<String, Object> eventDistanceAttributes = new HashMap<>();
|
|
||||||
eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount);
|
|
||||||
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance);
|
|
||||||
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance);
|
|
||||||
|
|
||||||
|
|
||||||
// can we do this with the Annotation classes instead?
|
|
||||||
for (final VariantContext originalVC : calledHaplotypes.getCalls()) {
|
|
||||||
final VariantContextBuilder vcb = new VariantContextBuilder(originalVC);
|
|
||||||
|
|
||||||
final Map<String, Object> attributes = new HashMap<>(originalVC.getAttributes());
|
|
||||||
attributes.putAll(eventDistanceAttributes);
|
|
||||||
vcb.attributes(attributes);
|
|
||||||
|
|
||||||
final Set<String> filters = new HashSet<>(originalVC.getFilters());
|
|
||||||
|
|
||||||
final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1);
|
|
||||||
if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) {
|
|
||||||
filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME);
|
|
||||||
}
|
|
||||||
|
|
||||||
// if we are in artifact detection mode, apply the thresholds for the LOD scores
|
|
||||||
if (!MTAC.ARTIFACT_DETECTION_MODE) {
|
|
||||||
filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes));
|
|
||||||
}
|
|
||||||
|
|
||||||
vcb.filters(filters.isEmpty() ? VariantContext.PASSES_FILTERS : filters);
|
|
||||||
|
|
||||||
if (printTCGAsampleHeader) {
|
|
||||||
final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make();
|
|
||||||
final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make();
|
|
||||||
vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype));
|
|
||||||
}
|
|
||||||
|
|
||||||
annotatedCalls.add(vcb.make());
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: find a better place for this debug message
|
|
||||||
// logger.info("We had " + TumorPowerCalculator.numCacheHits + " hits in starnd artifact power calculation");
|
|
||||||
return annotatedCalls;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Set<String> calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map<String, Object> eventDistanceAttributes) {
|
private Set<String> calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map<String, Object> eventDistanceAttributes) {
|
||||||
|
|
@ -828,15 +723,10 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
|
|
||||||
if ( (normalAltCounts > MTAC.MAX_ALT_ALLELES_IN_NORMAL_COUNT || normalF > MTAC.MAX_ALT_ALLELE_IN_NORMAL_FRACTION ) && normalAltQualityScoreSum > MTAC.MAX_ALT_ALLELES_IN_NORMAL_QSCORE_SUM) {
|
if ( (normalAltCounts > MTAC.MAX_ALT_ALLELES_IN_NORMAL_COUNT || normalF > MTAC.MAX_ALT_ALLELE_IN_NORMAL_FRACTION ) && normalAltQualityScoreSum > MTAC.MAX_ALT_ALLELES_IN_NORMAL_QSCORE_SUM) {
|
||||||
filters.add(GATKVCFConstants.ALT_ALLELE_IN_NORMAL_FILTER_NAME);
|
filters.add(GATKVCFConstants.ALT_ALLELE_IN_NORMAL_FILTER_NAME);
|
||||||
} else {
|
} else if ( eventCount > 1 && normalAltCounts >= 1) {
|
||||||
|
filters.add(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME);
|
||||||
// NOTE: does normal alt counts presume the normal had all these events in CIS?
|
} else if (eventCount >= 3) {
|
||||||
if ( eventCount > 1 && normalAltCounts >= 1) {
|
filters.add(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME);
|
||||||
filters.add(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME);
|
|
||||||
} else if (eventCount >= 3) {
|
|
||||||
filters.add(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// STR contractions, that is the deletion of one repeat unit of a short repeat (>1bp repeat unit)
|
// STR contractions, that is the deletion of one repeat unit of a short repeat (>1bp repeat unit)
|
||||||
|
|
@ -861,7 +751,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
filters.add(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME);
|
filters.add(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// clustered read position filter
|
// clustered read position filter
|
||||||
if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){
|
if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){
|
||||||
final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY);
|
final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY);
|
||||||
|
|
@ -878,7 +767,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
return filters;
|
return filters;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private final static byte REF_MODEL_DELETION_QUAL = (byte) 30;
|
private final static byte REF_MODEL_DELETION_QUAL = (byte) 30;
|
||||||
/**
|
/**
|
||||||
* Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
|
* Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
|
||||||
|
|
@ -916,6 +804,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
return (normalSampleName != null);
|
return (normalSampleName != null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//TODO: streamify in GATK4
|
||||||
protected int getCountOfNonRefEvents(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) {
|
protected int getCountOfNonRefEvents(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) {
|
||||||
int i=0;
|
int i=0;
|
||||||
for( final PileupElement p : pileup ) {
|
for( final PileupElement p : pileup ) {
|
||||||
|
|
@ -960,37 +849,21 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
protected Set<GATKSAMRecord> filterNonPassingReads( final ActiveRegion activeRegion) {
|
protected Set<GATKSAMRecord> filterNonPassingReads( final ActiveRegion activeRegion) {
|
||||||
final Set<GATKSAMRecord> readsToRemove = new LinkedHashSet<>();
|
final Set<GATKSAMRecord> readsToRemove = new LinkedHashSet<>();
|
||||||
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
|
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
|
||||||
|
//TODO: Takuto points out that this is questionable. Let's think hard abut it.
|
||||||
// KCIBUL: only perform read quality filtering on tumor reads...
|
// KCIBUL: only perform read quality filtering on tumor reads...
|
||||||
if (isReadFromNormal(rec)) {
|
if (isReadFromNormal(rec)) {
|
||||||
|
|
||||||
if( rec.getReadLength() < MIN_READ_LENGTH ) {
|
if( rec.getReadLength() < MIN_READ_LENGTH ) {
|
||||||
readsToRemove.add(rec);
|
readsToRemove.add(rec);
|
||||||
}
|
}
|
||||||
|
} else if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < MQthreshold || BadMateFilter.hasBadMate(rec) ||
|
||||||
} else {
|
|
||||||
|
|
||||||
|
|
||||||
if( rec.getReadLength() < MIN_READ_LENGTH ||
|
|
||||||
rec.getMappingQuality() < MQthreshold ||
|
|
||||||
BadMateFilter.hasBadMate(rec) ||
|
|
||||||
|
|
||||||
(keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
(keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
||||||
readsToRemove.add(rec);
|
readsToRemove.add(rec);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
activeRegion.removeAll(readsToRemove);
|
activeRegion.removeAll(readsToRemove);
|
||||||
return readsToRemove;
|
return readsToRemove;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static GATKSAMRecord findReadByName(final Collection<GATKSAMRecord> reads, final String name) {
|
|
||||||
for(final GATKSAMRecord read : reads) {
|
|
||||||
if (name.equals(read.getReadName())) return read;
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates the appropriate likelihood calculation engine.
|
* Instantiates the appropriate likelihood calculation engine.
|
||||||
*
|
*
|
||||||
|
|
@ -1025,13 +898,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
// enable non primary and extended reads in the active region
|
// enable non primary and extended reads in the active region
|
||||||
@Override
|
@Override
|
||||||
public EnumSet<ActiveRegionReadState> desiredReadStates() {
|
public EnumSet<ActiveRegionReadState> desiredReadStates() {
|
||||||
// if ( includeUnmappedReads )
|
return EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED);
|
||||||
// throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional");
|
|
||||||
// else
|
|
||||||
return EnumSet.of(
|
|
||||||
ActiveRegionReadState.PRIMARY,
|
|
||||||
ActiveRegionReadState.NONPRIMARY,
|
|
||||||
ActiveRegionReadState.EXTENDED);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -1063,7 +930,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
logger.info("Ran local assembly on " + result + " active regions");
|
logger.info("Ran local assembly on " + result + " active regions");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// The following are not used but are required by the AnnotatorCompatible interface
|
// The following are not used but are required by the AnnotatorCompatible interface
|
||||||
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
|
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
|
||||||
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
|
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
|
||||||
|
|
@ -1087,15 +953,12 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc);
|
final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc);
|
||||||
|
|
||||||
// Create ReadErrorCorrector object if requested - will be used within assembly engine.
|
// Create ReadErrorCorrector object if requested - will be used within assembly engine.
|
||||||
ReadErrorCorrector readErrorCorrector = null;
|
final ReadErrorCorrector readErrorCorrector = errorCorrectReads ? new ReadErrorCorrector(RTAC.kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION,
|
||||||
if (errorCorrectReads)
|
RTAC.minObservationsForKmerToBeSolid, MTAC.DEBUG, fullReferenceWithPadding) : null;
|
||||||
readErrorCorrector = new ReadErrorCorrector(RTAC.kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, RTAC.minObservationsForKmerToBeSolid, MTAC.DEBUG, fullReferenceWithPadding);
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, giveAlleles,readErrorCorrector );
|
final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, giveAlleles,readErrorCorrector );
|
||||||
assemblyResultSet.debugDump(logger);
|
assemblyResultSet.debugDump(logger);
|
||||||
return assemblyResultSet;
|
return assemblyResultSet;
|
||||||
|
|
||||||
} catch ( final Exception e ) {
|
} catch ( final Exception e ) {
|
||||||
// Capture any exception that might be thrown, and write out the assembly failure BAM if requested
|
// Capture any exception that might be thrown, and write out the assembly failure BAM if requested
|
||||||
if ( captureAssemblyFailureBAM ) {
|
if ( captureAssemblyFailureBAM ) {
|
||||||
|
|
@ -1151,10 +1014,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.DOWNSAMPLED, activeRegion.getReads(), downsampledReads);
|
haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.DOWNSAMPLED, activeRegion.getReads(), downsampledReads);
|
||||||
}
|
}
|
||||||
|
|
||||||
// handle overlapping read pairs from the same fragment
|
|
||||||
// KC: commented out as we handle overlapping read pairs in a different way...
|
|
||||||
//cleanOverlappingReadPairs(downsampledReads, normalSampleNames);
|
|
||||||
|
|
||||||
activeRegion.clearReads();
|
activeRegion.clearReads();
|
||||||
activeRegion.addAll(downsampledReads);
|
activeRegion.addAll(downsampledReads);
|
||||||
activeRegion.setFinalized(true);
|
activeRegion.setFinalized(true);
|
||||||
|
|
@ -1189,11 +1048,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
|
|
||||||
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create(perSampleReadList);
|
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create(perSampleReadList);
|
||||||
for ( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() )
|
for ( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() )
|
||||||
|
|
||||||
// in MuTect -- right now we compare the
|
|
||||||
FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair);
|
FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1236,20 +1091,61 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
|
|
||||||
private boolean isReadFromNormal(final GATKSAMRecord rec) {
|
private boolean isReadFromNormal(final GATKSAMRecord rec) {
|
||||||
return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample());
|
return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTumorSampleName(){
|
public String getTumorSampleName(){
|
||||||
return tumorSampleName;
|
return tumorSampleName;
|
||||||
}
|
}
|
||||||
|
|
||||||
// KCIBUL: new stuff -- read up on this!!
|
final List<VariantContext> annotateVCs(final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes, final RefMetaDataTracker metaDataTracker) {
|
||||||
/**
|
final int eventCount = calledHaplotypes.getCalls().size();
|
||||||
* As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior.
|
final Map<String, Object> eventDistanceAttributes = new HashMap<>(); //TODO: should be Map<String, Integer> -- see TODO below
|
||||||
*/
|
eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount);
|
||||||
@Advanced
|
if (eventCount > 1) {
|
||||||
@Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false)
|
final int lastPosition = calledHaplotypes.getCalls().get(0).getStart();
|
||||||
protected boolean doNotRunPhysicalPhasing = false;
|
final int[] eventDistances = new int[calledHaplotypes.getCalls().size() - 1];
|
||||||
|
for (int n = 0; n < eventDistances.length; n++) {
|
||||||
|
eventDistances[n] = Math.abs(calledHaplotypes.getCalls().get(n + 1).getStart() - lastPosition);
|
||||||
|
}
|
||||||
|
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, MathUtils.arrayMin(eventDistances));
|
||||||
|
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, MathUtils.arrayMax(eventDistances));
|
||||||
|
} else { //TODO: putting null is a hack -- we should remove this and update the integration test md5s
|
||||||
|
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, null);
|
||||||
|
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<VariantContext> annotatedCalls = new ArrayList<>();
|
||||||
|
// can we do this with the Annotation classes instead?
|
||||||
|
for (final VariantContext originalVC : calledHaplotypes.getCalls()) {
|
||||||
|
final VariantContextBuilder vcb = new VariantContextBuilder(originalVC);
|
||||||
|
|
||||||
|
final Map<String, Object> attributes = new HashMap<>(originalVC.getAttributes());
|
||||||
|
attributes.putAll(eventDistanceAttributes);
|
||||||
|
vcb.attributes(attributes);
|
||||||
|
|
||||||
|
final Set<String> filters = new HashSet<>(originalVC.getFilters());
|
||||||
|
|
||||||
|
final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1);
|
||||||
|
if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) {
|
||||||
|
filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME);
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we are in artifact detection mode, apply the thresholds for the LOD scores
|
||||||
|
if (!MTAC.ARTIFACT_DETECTION_MODE) {
|
||||||
|
filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes));
|
||||||
|
}
|
||||||
|
|
||||||
|
vcb.filters(filters.isEmpty() ? VariantContext.PASSES_FILTERS : filters);
|
||||||
|
|
||||||
|
if (printTCGAsampleHeader) {
|
||||||
|
final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make();
|
||||||
|
final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make();
|
||||||
|
vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype));
|
||||||
|
}
|
||||||
|
annotatedCalls.add(vcb.make());
|
||||||
|
}
|
||||||
|
return annotatedCalls;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -51,10 +51,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
|
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
|
||||||
|
|
||||||
import com.google.java.contract.Ensures;
|
|
||||||
import htsjdk.samtools.util.StringUtil;
|
|
||||||
import htsjdk.variant.variantcontext.*;
|
import htsjdk.variant.variantcontext.*;
|
||||||
import org.apache.commons.collections.ListUtils;
|
|
||||||
import org.apache.commons.lang.mutable.MutableDouble;
|
import org.apache.commons.lang.mutable.MutableDouble;
|
||||||
import org.apache.commons.lang.mutable.MutableInt;
|
import org.apache.commons.lang.mutable.MutableInt;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
@ -63,7 +60,6 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvid
|
||||||
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine;
|
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine;
|
||||||
import org.broadinstitute.gatk.utils.GenomeLoc;
|
import org.broadinstitute.gatk.utils.GenomeLoc;
|
||||||
import org.broadinstitute.gatk.utils.GenomeLocParser;
|
import org.broadinstitute.gatk.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.gatk.utils.commandline.RodBinding;
|
|
||||||
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
|
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
|
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
|
||||||
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
|
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||||
|
|
@ -87,6 +83,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
private final String matchedNormalSampleName;
|
private final String matchedNormalSampleName;
|
||||||
private final String DEBUG_READ_NAME;
|
private final String DEBUG_READ_NAME;
|
||||||
|
|
||||||
|
//Mutect2 does not run in GGA mode
|
||||||
|
private static final List<VariantContext> NO_GIVEN_ALLELES = Collections.EMPTY_LIST;
|
||||||
|
|
||||||
// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
|
// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
|
||||||
private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
|
private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
|
||||||
public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; }
|
public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; }
|
||||||
|
|
@ -109,7 +108,8 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
this.DEBUG_READ_NAME = DEBUG_READ_NAME;
|
this.DEBUG_READ_NAME = DEBUG_READ_NAME;
|
||||||
|
|
||||||
// coverage related initialization
|
// coverage related initialization
|
||||||
final double errorProbability = Math.pow(10, -MTAC.POWER_CONSTANT_QSCORE / 10);
|
//TODO: in GATK4, use a QualityUtils method
|
||||||
|
final double errorProbability = Math.pow(10, -MTAC.POWER_CONSTANT_QSCORE/10);
|
||||||
strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f);
|
strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -118,48 +118,40 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
* genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling
|
* genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling
|
||||||
*
|
*
|
||||||
* The list of samples we're working with is obtained from the readLikelihoods
|
* The list of samples we're working with is obtained from the readLikelihoods
|
||||||
|
|
||||||
* @param haplotypes Haplotypes to assign likelihoods to
|
|
||||||
* @param readLikelihoods Map from reads->(haplotypes,likelihoods)
|
* @param readLikelihoods Map from reads->(haplotypes,likelihoods)
|
||||||
* @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods.
|
* @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods.
|
||||||
* @param ref Reference bytes at active region
|
* @param ref Reference bytes at active region
|
||||||
* @param refLoc Corresponding active region genome location
|
* @param refLoc Corresponding active region genome location
|
||||||
* @param activeRegionWindow Active window
|
* @param activeRegionWindow Active window
|
||||||
* @param activeAllelesToGenotype Alleles to genotype
|
|
||||||
*
|
*
|
||||||
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
|
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
// TODO - can this be refactored? this is hard to follow!
|
|
||||||
public CalledHaplotypes callMutations (
|
public CalledHaplotypes callMutations (
|
||||||
final List<Haplotype> haplotypes,
|
|
||||||
final ReadLikelihoods<Haplotype> readLikelihoods,
|
final ReadLikelihoods<Haplotype> readLikelihoods,
|
||||||
final Map<String, Integer> originalNormalReadQualities,
|
final Map<String, Integer> originalNormalReadQualities,
|
||||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
||||||
final byte[] ref,
|
final byte[] ref,
|
||||||
final GenomeLoc refLoc,
|
final GenomeLoc refLoc,
|
||||||
final GenomeLoc activeRegionWindow,
|
final GenomeLoc activeRegionWindow,
|
||||||
final RefMetaDataTracker tracker,
|
final RefMetaDataTracker tracker) {
|
||||||
final List<VariantContext> activeAllelesToGenotype) {
|
//TODO: in GATK4 use Utils.nonNull
|
||||||
// sanity check input arguments
|
|
||||||
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
|
|
||||||
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
|
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
|
||||||
if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
|
if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
|
||||||
if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
|
if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
|
||||||
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
|
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
|
||||||
if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
|
|
||||||
|
|
||||||
|
final List<Haplotype> haplotypes = readLikelihoods.alleles();
|
||||||
|
|
||||||
|
// Somatic Tumor/Normal Sample Handling
|
||||||
if (!readLikelihoods.samples().contains(tumorSampleName)) {
|
if (!readLikelihoods.samples().contains(tumorSampleName)) {
|
||||||
throw new IllegalArgumentException("readLikelihoods does not contain the tumor sample "+ tumorSampleName);
|
throw new IllegalArgumentException("readLikelihoods does not contain the tumor sample " + tumorSampleName);
|
||||||
}
|
}
|
||||||
|
|
||||||
// if we don't have the normal sample, we are in tumor only mode
|
|
||||||
// TODO: check in MuTect2.java for code we can skip when in tumor only mode
|
|
||||||
final boolean hasNormal = matchedNormalSampleName != null;
|
final boolean hasNormal = matchedNormalSampleName != null;
|
||||||
|
|
||||||
// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
|
// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
|
||||||
// that carry events among the haplotypes
|
// that carry events among the haplotypes
|
||||||
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, activeAllelesToGenotype);
|
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, NO_GIVEN_ALLELES);
|
||||||
|
|
||||||
// Walk along each position in the key set and create each event to be outputted
|
// Walk along each position in the key set and create each event to be outputted
|
||||||
final Set<Haplotype> calledHaplotypes = new HashSet<>();
|
final Set<Haplotype> calledHaplotypes = new HashSet<>();
|
||||||
|
|
@ -170,7 +162,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
|
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, NO_GIVEN_ALLELES);
|
||||||
|
|
||||||
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
||||||
|
|
||||||
|
|
@ -203,14 +195,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
* If we just want a map of Alleles to Haplotypes, we should be able to do so directly; no need for intermediate maps, which just complicates the code.
|
* If we just want a map of Alleles to Haplotypes, we should be able to do so directly; no need for intermediate maps, which just complicates the code.
|
||||||
**/
|
**/
|
||||||
|
|
||||||
|
|
||||||
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
|
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
|
||||||
|
|
||||||
// converting ReadLikelihoods<Haplotype> to ReadLikeliHoods<Allele>
|
// converting ReadLikelihoods<Haplotype> to ReadLikeliHoods<Allele>
|
||||||
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
|
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
|
||||||
|
|
||||||
// LDG: do we want to do this before or after pulling out overlapping reads?
|
//LDG: do we want to do this before or after pulling out overlapping reads?
|
||||||
// TODO: do we want this at all? How does downsampling help?
|
|
||||||
if (MTAC.isSampleContaminationPresent()) {
|
if (MTAC.isSampleContaminationPresent()) {
|
||||||
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
|
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
|
||||||
}
|
}
|
||||||
|
|
@ -232,24 +222,18 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef());
|
tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// TODO: another good breakpoint e.g. compute normal LOD/set thresholds
|
// TODO: another good breakpoint e.g. compute normal LOD/set thresholds
|
||||||
// TODO: anything related to normal should be encapsulated in Optional
|
// TODO: anything related to normal should be encapsulated in Optional
|
||||||
|
|
||||||
// Normal LOD must exceed this threshold for the variant to make it in the vcf
|
|
||||||
// TODO: variable name too log
|
|
||||||
double normalLodThresholdForVCF = -Double.MIN_VALUE;
|
|
||||||
|
|
||||||
// A variant candidate whose normal LOD is below this threshold will be filtered as 'germline_risk'
|
// A variant candidate whose normal LOD is below this threshold will be filtered as 'germline_risk'
|
||||||
// This is a more stringent threshold than normalLodThresholdForVCF
|
// This is a more stringent threshold than normalLodThresholdForVCF
|
||||||
double normalLodFilterThreshold = -Double.MIN_VALUE;
|
double normalLodFilterThreshold = -Double.MAX_VALUE;
|
||||||
|
|
||||||
PerReadAlleleLikelihoodMap normalPRALM = null;
|
PerReadAlleleLikelihoodMap normalPRALM = null;
|
||||||
final PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection();
|
final PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection();
|
||||||
|
|
||||||
|
// if normal bam is available, compute normal LOD
|
||||||
// TODO: this if statement should be a standalone method for computing normal LOD
|
// TODO: this if statement should be a standalone method for computing normal LOD
|
||||||
// TODO: then we can do something like normalLodThreshold = hasNormal ? thisMethod() : Optional.empty()
|
// TODO: then we can do something like normalLodThreshold = hasNormal ? thisMethod() : Optional.empty()
|
||||||
// if normal bam is available, compute normal LOD
|
|
||||||
if (hasNormal) {
|
if (hasNormal) {
|
||||||
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
|
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
|
||||||
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
|
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
|
||||||
|
|
@ -258,10 +242,8 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
|
final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
|
||||||
final Collection<VariantContext> cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc);
|
final Collection<VariantContext> cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc);
|
||||||
final Collection<VariantContext> dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc);
|
final Collection<VariantContext> dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc);
|
||||||
|
|
||||||
final boolean germlineAtRisk = !dbsnpVC.isEmpty() && cosmicVC.isEmpty();
|
final boolean germlineAtRisk = !dbsnpVC.isEmpty() && cosmicVC.isEmpty();
|
||||||
|
|
||||||
normalLodThresholdForVCF = MTAC.INITIAL_NORMAL_LOD_THRESHOLD;
|
|
||||||
normalLodFilterThreshold = germlineAtRisk ? MTAC.NORMAL_DBSNP_LOD_THRESHOLD : MTAC.NORMAL_LOD_THRESHOLD;
|
normalLodFilterThreshold = germlineAtRisk ? MTAC.NORMAL_DBSNP_LOD_THRESHOLD : MTAC.NORMAL_LOD_THRESHOLD;
|
||||||
|
|
||||||
// compute normal LOD = LL(X|REF)/LL(X|ALT) where REF is the diploid HET with AF = 0.5
|
// compute normal LOD = LL(X|REF)/LL(X|ALT) where REF is the diploid HET with AF = 0.5
|
||||||
|
|
@ -284,7 +266,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
|
|
||||||
for (final Allele altAllele : mergedVC.getAlternateAlleles()) {
|
for (final Allele altAllele : mergedVC.getAlternateAlleles()) {
|
||||||
final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD;
|
final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD;
|
||||||
final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= normalLodThresholdForVCF : true;
|
final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= MTAC.INITIAL_NORMAL_LOD_THRESHOLD : true;
|
||||||
if (passesTumorLodThreshold && passesNormalLodThreshold) {
|
if (passesTumorLodThreshold && passesNormalLodThreshold) {
|
||||||
numPassingAlts++;
|
numPassingAlts++;
|
||||||
allelesThatPassThreshold.add(altAllele);
|
allelesThatPassThreshold.add(altAllele);
|
||||||
|
|
|
||||||
|
|
@ -70,8 +70,6 @@ public class MuTect2IntegrationTest extends WalkerTest {
|
||||||
final static String DREAM3_TP_INTERVALS_FILE = privateTestDir + "m2_dream3.tp.intervals";
|
final static String DREAM3_TP_INTERVALS_FILE = privateTestDir + "m2_dream3.tp.intervals";
|
||||||
final static String DREAM3_FP_INTERVALS_FILE = privateTestDir + "m2_dream3.fp.intervals";
|
final static String DREAM3_FP_INTERVALS_FILE = privateTestDir + "m2_dream3.fp.intervals";
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
final String commandLine =
|
final String commandLine =
|
||||||
"-T MuTect2 --no_cmdline_in_header -dt NONE --disableDithering -alwaysloadVectorHMM -pairHMM LOGLESS_CACHING -ip 50 -R %s --dbsnp %s --cosmic %s --normal_panel %s -I:tumor %s -I:normal %s -L %s";
|
"-T MuTect2 --no_cmdline_in_header -dt NONE --disableDithering -alwaysloadVectorHMM -pairHMM LOGLESS_CACHING -ip 50 -R %s --dbsnp %s --cosmic %s --normal_panel %s -I:tumor %s -I:normal %s -L %s";
|
||||||
|
|
||||||
|
|
@ -160,9 +158,6 @@ public class MuTect2IntegrationTest extends WalkerTest {
|
||||||
M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", "c25e48edd704bbb436cd6456d9f47d8b");
|
M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", "c25e48edd704bbb436cd6456d9f47d8b");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Test that tumor-only mode does not create an empty vcf
|
|
||||||
*/
|
|
||||||
@Test
|
@Test
|
||||||
public void testTumorOnly(){
|
public void testTumorOnly(){
|
||||||
m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "2af2253b1f09ea8fd354e1bf2c4612f0");
|
m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "2af2253b1f09ea8fd354e1bf2c4612f0");
|
||||||
|
|
@ -177,6 +172,4 @@ public class MuTect2IntegrationTest extends WalkerTest {
|
||||||
public void testClusteredReadPositionFilter() {
|
public void testClusteredReadPositionFilter() {
|
||||||
M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "b44c23af7de84f96d2371db25d29aba2");
|
M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "b44c23af7de84f96d2371db25d29aba2");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue