diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java index 8ad89a848..5cc07df4e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java @@ -51,12 +51,41 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection; -import org.broadinstitute.gatk.utils.commandline.Advanced; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.Hidden; +import org.broadinstitute.gatk.utils.commandline.*; + +import java.util.Collections; +import java.util.List; public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection { + + /***************************************/ + // Reference Metadata inputs + /***************************************/ + /** + * MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant + * in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal + * sample to prove the variant is not present in germline. + */ + @Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false) + public List> cosmicRod = Collections.emptyList(); + + /** + * A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants. 
+ */ + @Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false) + public List> normalPanelRod = Collections.emptyList(); + + + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + * dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + @Advanced @Argument(fullName="m2debug", shortName="m2debug", doc="Print out very verbose M2 debug information", required = false) public boolean M2_DEBUG = false; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java index b3c7d950f..4f509f796 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java @@ -201,7 +201,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // the genotyping engine protected HaplotypeCallerGenotypingEngine genotypingEngine = null; - private byte MIN_TAIL_QUALITY; private double log10GlobalReadMismappingRate; @@ -214,9 +213,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i @ArgumentCollection protected LikelihoodEngineArgumentCollection LEAC = new LikelihoodEngineArgumentCollection(); - @Argument(fullName = "debug_read_name", required = false, doc="trace this read name through the calling process") - public String DEBUG_READ_NAME = null; + protected String DEBUG_READ_NAME = null; @Hidden @Advanced @@ -224,24 +222,81 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final public int MQthreshold = 20; - 
/***************************************/ - // Reference Metadata inputs - /***************************************/ - /** - * MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant - * in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal - * sample to prove the variant is not present in germline. - */ - @Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false) - public List> cosmicRod = Collections.emptyList(); - - /** - * A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants. - */ - @Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false) - public List> normalPanelRod = Collections.emptyList(); + public RodBinding getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; } private HaplotypeBAMWriter haplotypeBAMWriter; + /** + * If a call overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). + * Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). + */ + @Advanced + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List> comps = Collections.emptyList(); + public List> getCompRodBindings() { return comps; } + + /** + * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. 
+ */ + @Advanced + @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"})); + + /** + * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, + * so annotations will be excluded even if they are explicitly included with the other options. + */ + @Advanced + @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) + protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions"})); + + /** + * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. + */ + @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) + protected String[] annotationClassesToUse = { }; + + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + /** + * Active region trimmer reference. 
+ */ + @ArgumentCollection + protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer(); + + @Hidden + @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) + protected String keepRG = null; + + @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) + public byte MIN_BASE_QUALTY_SCORE = 10; + + public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.HOSTILE; + + @Hidden + @Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectReads = false; + + @Hidden + @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) + protected boolean captureAssemblyFailureBAM = false; + + @Advanced + @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) + protected boolean dontUseSoftClippedBases = false; + + @Hidden + @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. 
Useful for benchmarking and scalability testing", required=false) + protected boolean justDetermineActiveRegions = false; + + // reference base padding size + private static final int REFERENCE_PADDING = 500; + + private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument + private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument @Override public void initialize() { @@ -250,15 +305,15 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i samplesList = new IndexedSampleList(new ArrayList<>(ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))); // MUTECT: check that we have at least one tumor bam - for(SAMReaderID id : getToolkit().getReadsDataSource().getReaderIDs()) { + for(final SAMReaderID id : getToolkit().getReadsDataSource().getReaderIDs()) { if (id.getTags().getPositionalTags().size() == 0) { throw new RuntimeException("BAMs must be tagged as either 'tumor' or 'normal'"); } // only supports single-sample BAMs (ie first read group is representative) - String bamSampleName = getToolkit().getReadsDataSource().getHeader(id).getReadGroups().get(0).getSample(); + final String bamSampleName = getToolkit().getReadsDataSource().getHeader(id).getReadGroups().get(0).getSample(); - for(String tag : id.getTags().getPositionalTags()) { + for(final String tag : id.getTags().getPositionalTags()) { if (BAM_TAG_TUMOR.equalsIgnoreCase(tag)) { tumorSAMReaderIDs.add(id); if (tumorSampleName == null) { @@ -330,9 +385,9 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final MergeVariantsAcrossHaplotypes variantMerger = new MergeVariantsAcrossHaplotypes(); final GenomeAnalysisEngine toolkit = getToolkit(); - final GenomeLocParser genomeLocParser = toolkit.getGenomeLocParser(); - genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, genomeLocParser, 
FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(), MTAC, logger), !doNotRunPhysicalPhasing, MTAC); + genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC, + tumorSampleName, normalSampleName, DEBUG_READ_NAME); genotypingEngine.setCrossHaplotypeEventMerger(variantMerger); genotypingEngine.setAnnotationEngine(annotationEngine); @@ -353,7 +408,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i trimmer.snpPadding = 50; samplesList = toolkit.getReadSampleList(); - Set sampleSet = SampleListUtils.asSet(samplesList); + final Set sampleSet = SampleListUtils.asSet(samplesList); if( MTAC.CONTAMINATION_FRACTION_FILE != null ) MTAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(MTAC.CONTAMINATION_FRACTION_FILE, MTAC.CONTAMINATION_FRACTION, sampleSet, logger)); @@ -367,7 +422,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - Set headerInfo = new HashSet<>(); + final Set headerInfo = new HashSet<>(); // all annotation fields from VariantAnnotatorEngine headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); @@ -383,7 +438,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i headerInfo.addAll(getM2HeaderLines()); headerInfo.addAll(getSampleHeaderLines()); - List outputSampleNames = getOutputSampleNames(); + final List outputSampleNames = getOutputSampleNames(); vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames)); @@ -391,7 +446,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } private Set getM2HeaderLines(){ - Set headerInfo = new HashSet<>(); + final Set headerInfo = new HashSet<>(); headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NORMAL_LOD_KEY)); 
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TUMOR_LOD_KEY)); headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY)); @@ -430,21 +485,21 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } private Set getSampleHeaderLines(){ - Set sampleLines = new HashSet<>(); + final Set sampleLines = new HashSet<>(); if (printTCGAsampleHeader) { //NOTE: This will only list the first bam file for each tumor/normal sample if there is more than one - Map normalSampleHeaderAttributes = new HashMap<>(); + final Map normalSampleHeaderAttributes = new HashMap<>(); normalSampleHeaderAttributes.put("ID", "NORMAL"); normalSampleHeaderAttributes.put("SampleName", normalSampleName); if (normalSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF) normalSampleHeaderAttributes.put("File", normalSAMReaderIDs.iterator().next().getSamFilePath()); - VCFSimpleHeaderLine normalSampleHeader = new VCFSimpleHeaderLine("SAMPLE", normalSampleHeaderAttributes); - Map tumorSampleHeaderAttributes = new HashMap<>(); + final VCFSimpleHeaderLine normalSampleHeader = new VCFSimpleHeaderLine("SAMPLE", normalSampleHeaderAttributes); + final Map tumorSampleHeaderAttributes = new HashMap<>(); tumorSampleHeaderAttributes.put("ID", "TUMOR"); tumorSampleHeaderAttributes.put("SampleName", tumorSampleName); if (tumorSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF) tumorSampleHeaderAttributes.put("File", tumorSAMReaderIDs.iterator().next().getSamFilePath()); - VCFSimpleHeaderLine tumorSampleHeader = new VCFSimpleHeaderLine("SAMPLE", tumorSampleHeaderAttributes); + final VCFSimpleHeaderLine tumorSampleHeader = new VCFSimpleHeaderLine("SAMPLE", tumorSampleHeaderAttributes); sampleLines.add(normalSampleHeader); sampleLines.add(tumorSampleHeader); @@ -455,7 +510,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i private List getOutputSampleNames(){ if 
(printTCGAsampleHeader) { //Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize() - List sampleNamePlaceholders = new ArrayList<>(2); + final List sampleNamePlaceholders = new ArrayList<>(2); sampleNamePlaceholders.add("TUMOR"); sampleNamePlaceholders.add("NORMAL"); return sampleNamePlaceholders; @@ -466,14 +521,14 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } @Override - public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { if( context == null || context.getBasePileup().isEmpty() ) // if we don't have any data, just abort early return new ActivityProfileState(ref.getLocus(), 0.0); final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context); - AlignmentContext tumorContext = splitContexts.get(tumorSampleName); - AlignmentContext normalContext = splitContexts.get(normalSampleName); + final AlignmentContext tumorContext = splitContexts.get(tumorSampleName); + final AlignmentContext normalContext = splitContexts.get(normalSampleName); // if there are no tumor reads... there is no activity! 
if (tumorContext == null) { @@ -481,7 +536,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } // KCIBUL -- this method was inlined and modified from ReferenceConfidenceModel - ReadBackedPileup tumorPileup = tumorContext.getBasePileup().getMappingFilteredPileup(MQthreshold); + final ReadBackedPileup tumorPileup = tumorContext.getBasePileup().getMappingFilteredPileup(MQthreshold); final double[] tumorGLs = calcGenotypeLikelihoodsOfRefVsAny(tumorPileup, ref.getBase(), MIN_BASE_QUALTY_SCORE); final double tumorLod = tumorGLs[1] - tumorGLs[0]; @@ -495,7 +550,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // in any case, we have to handle the case where there is no normal (and thus no normal context) which is // different than having a normal but having no reads (where we should not enter the active region) if (normalSampleName != null && normalContext != null) { - int nonRefInNormal = getCountOfNonRefEvents(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE); + final int nonRefInNormal = getCountOfNonRefEvents(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE); final double[] normalGLs = calcGenotypeLikelihoodsOfRefVsAny(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, 0.5f); final double normalLod = normalGLs[0] - normalGLs[1]; @@ -528,13 +583,12 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // No reads here so nothing to do! if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } - logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region"); // create the assembly using just high quality reads (Q20 or higher). 
We want to use lower // quality reads in the PairHMM (and especially in the normal) later, so we can't use a ReadFilter - ActiveRegion assemblyActiveRegion = new ActiveRegion(originalActiveRegion.getLocation(), originalActiveRegion.getSupportingStates(),originalActiveRegion.isActive(), getToolkit().getGenomeLocParser(), originalActiveRegion.getExtension()); - for (GATKSAMRecord rec : originalActiveRegion.getReads()) { + final ActiveRegion assemblyActiveRegion = new ActiveRegion(originalActiveRegion.getLocation(), originalActiveRegion.getSupportingStates(),originalActiveRegion.isActive(), getToolkit().getGenomeLocParser(), originalActiveRegion.getExtension()); + for (final GATKSAMRecord rec : originalActiveRegion.getReads()) { if (rec.getMappingQuality() >= MQthreshold ) { assemblyActiveRegion.add(rec); } @@ -558,18 +612,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // Stop the trimming madness!!! if (!trimmingResult.isVariationPresent()) return referenceModelForNoVariation(originalActiveRegion,false); - logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result"); final AssemblyResultSet assemblyResult = trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult; -// final AssemblyResultSet assemblyResult = untrimmedAssemblyResult; - - // after talking to Ryan -- they grab the reads out of the assembly (and trim then) to pass into the PairHMM - // because at one point they were trying error correcting of the reads based on the haplotypes.. 
but that is not - // working out, so it's safe for us just to take the reads -// final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping"); @@ -577,10 +624,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads()); } -// -// final ActiveRegion regionForGenotyping = trimmingResult.getCallableRegion(); - -// final ActiveRegion regionForGenotyping = originalActiveRegion; // filter out reads from genotyping which fail mapping quality based criteria //TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method @@ -594,7 +637,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } final Map> perSampleFilteredReadList = splitReadsBySample(filteredReads); - logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads"); // abort early if something is out of the acceptable range @@ -612,17 +654,17 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final List haplotypes = assemblyResult.getHaplotypeList(); final Map> reads = splitReadsBySample( regionForGenotyping.getReads() ); - for (List rec : reads.values()) { + for (final List rec : reads.values()) { logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample"); } final HashMap ARreads_origNormalMQ = new HashMap<>(); - for (GATKSAMRecord read : regionForGenotyping.getReads()) { + for (final GATKSAMRecord read : regionForGenotyping.getReads()) { ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality()); } // modify MAPQ scores in normal to be high so that we don't do any base quality score capping - for(GATKSAMRecord rec : regionForGenotyping.getReads()) { + for(final GATKSAMRecord rec : 
regionForGenotyping.getReads()) { if (isReadFromNormal(rec)) { rec.setMappingQuality(60); } @@ -647,8 +689,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } readLikelihoods.changeReads(readRealignments); - - for (GATKSAMRecord rec : readRealignments.keySet()) { + for (final GATKSAMRecord rec : readRealignments.keySet()) { logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods"); } @@ -666,15 +707,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i assemblyResult.getFullReferenceWithPadding(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping.getLocation(), - getToolkit().getGenomeLocParser(), metaDataTracker, - givenAlleles, false , - tumorSampleName, - normalSampleName, - dbsnp.dbsnp, - cosmicRod, - DEBUG_READ_NAME - ); + givenAlleles); if ( MTAC.bamWriter != null ) { final Set calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes()); @@ -695,16 +729,16 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } - List annotatedCalls = new ArrayList<>(); - int eventCount = calledHaplotypes.getCalls().size(); + final List annotatedCalls = new ArrayList<>(); + final int eventCount = calledHaplotypes.getCalls().size(); Integer minEventDistance = null; Integer maxEventDistance = null; Integer lastPosition = null; - for (VariantContext vc : calledHaplotypes.getCalls()) { + for (final VariantContext vc : calledHaplotypes.getCalls()) { if (lastPosition == null) { lastPosition = vc.getStart(); } else { - int dist = Math.abs(vc.getStart() - lastPosition); + final int dist = Math.abs(vc.getStart() - lastPosition); if (maxEventDistance == null || dist > maxEventDistance) { maxEventDistance = dist; } @@ -713,23 +747,23 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } } } - Map eventDistanceAttributes = new HashMap<>(); + final Map eventDistanceAttributes = new 
HashMap<>(); eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount); eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance); eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance); // can we do this with the Annotation classes instead? - for (VariantContext originalVC : calledHaplotypes.getCalls()) { - VariantContextBuilder vcb = new VariantContextBuilder(originalVC); + for (final VariantContext originalVC : calledHaplotypes.getCalls()) { + final VariantContextBuilder vcb = new VariantContextBuilder(originalVC); - Map attributes = new HashMap<>(originalVC.getAttributes()); + final Map attributes = new HashMap<>(originalVC.getAttributes()); attributes.putAll(eventDistanceAttributes); vcb.attributes(attributes); - Set filters = new HashSet<>(originalVC.getFilters()); + final Set filters = new HashSet<>(originalVC.getFilters()); - double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1); + final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1); if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) { filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME); } @@ -739,22 +773,12 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes)); } - if (filters.size() > 0) { - vcb.filters(filters); - } else { - vcb.passFilters(); - } + vcb.filters(filters.isEmpty() ? 
VariantContext.PASSES_FILTERS : filters); if (printTCGAsampleHeader) { - GenotypesContext genotypesWithBamSampleNames = originalVC.getGenotypes(); - List renamedGenotypes = new ArrayList<>(); - GenotypeBuilder GTbuilder = new GenotypeBuilder(genotypesWithBamSampleNames.get(tumorSampleName)); - GTbuilder.name("TUMOR"); - renamedGenotypes.add(GTbuilder.make()); - GTbuilder = new GenotypeBuilder(genotypesWithBamSampleNames.get(normalSampleName)); - GTbuilder.name("NORMAL"); - renamedGenotypes.add(GTbuilder.make()); - vcb.genotypes(renamedGenotypes); + final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make(); + final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make(); + vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype)); } annotatedCalls.add(vcb.make()); @@ -765,15 +789,15 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return annotatedCalls; } - private Set calculateFilters(RefMetaDataTracker metaDataTracker, VariantContext vc, Map eventDistanceAttributes) { - Set filters = new HashSet<>(); + private Set calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map eventDistanceAttributes) { + final Set filters = new HashSet<>(); - Integer eventCount = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); - Integer maxEventDistance = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY); + final Integer eventCount = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); + final Integer maxEventDistance = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY); - Collection panelOfNormalsVC = metaDataTracker.getValues(normalPanelRod, + final Collection panelOfNormalsVC = metaDataTracker.getValues(MTAC.normalPanelRod, getToolkit().getGenomeLocParser().createGenomeLoc(vc.getChr(), 
vc.getStart())); - VariantContext ponVc = panelOfNormalsVC.isEmpty()?null:panelOfNormalsVC.iterator().next(); + final VariantContext ponVc = panelOfNormalsVC.isEmpty()?null:panelOfNormalsVC.iterator().next(); if (ponVc != null) { filters.add(GATKVCFConstants.PON_FILTER_NAME); @@ -788,13 +812,13 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i double normalF = 0; int normalAltQualityScoreSum = 0; if (hasNormal()) { - Genotype normalGenotype = vc.getGenotype(normalSampleName); + final Genotype normalGenotype = vc.getGenotype(normalSampleName); // NOTE: how do we get the non-ref depth here? normalAltCounts = normalGenotype.getAD()[1]; normalF = (Double) normalGenotype.getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY); - Object qss = normalGenotype.getExtendedAttribute(GATKVCFConstants.QUALITY_SCORE_SUM_KEY); + final Object qss = normalGenotype.getExtendedAttribute(GATKVCFConstants.QUALITY_SCORE_SUM_KEY); if (qss != null) { normalAltQualityScoreSum = (Integer) ((Object[]) qss)[1]; } else { @@ -819,11 +843,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // such as ACTACTACT -> ACTACT, are overwhelmingly false positives so we // hard filter them out by default if (vc.isIndel()) { - ArrayList rpa = (ArrayList) vc.getAttribute(GATKVCFConstants.REPEATS_PER_ALLELE_KEY); - String ru = vc.getAttributeAsString(GATKVCFConstants.REPEAT_UNIT_KEY, ""); + final ArrayList rpa = (ArrayList) vc.getAttribute(GATKVCFConstants.REPEATS_PER_ALLELE_KEY); + final String ru = vc.getAttributeAsString(GATKVCFConstants.REPEAT_UNIT_KEY, ""); if (rpa != null && rpa.size() > 1 && ru.length() > 1) { - int refCount = (Integer) rpa.get(0); - int altCount = (Integer) rpa.get(1); + final int refCount = (Integer) rpa.get(0); + final int altCount = (Integer) rpa.get(1); if (refCount - altCount == 1) { filters.add(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME); @@ -840,10 +864,10 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // clustered read position 
filter if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){ - Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY); - Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY); - Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY); - Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY); + final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY); + final Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY); + final Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY); + final Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY); //If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) || (tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorRevPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD)) @@ -866,14 +890,15 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i */ protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final double f) { final double[] genotypeLikelihoods = new double[2]; - int AA = 0, AB=1; + final int AA = 0; + final int AB=1; for( final PileupElement p : pileup ) { final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual()); if( p.isDeletion() || qual > minBaseQual ) { // TODO: why not use base qualities here? 
//double pobs = QualityUtils.qualToErrorProbLog10(qual); - double pobs = 1.0d - pow(10, (30 / -10.0)); + final double pobs = 1.0d - pow(10, (30 / -10.0)); if( isNonRef(refBase, p)) { genotypeLikelihoods[AB] += Math.log10(f*pobs + (1-f)*pobs/3.0d); genotypeLikelihoods[AA] += Math.log10((1-pobs)/3); @@ -905,7 +930,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) { - double f = calculateF(pileup, refBase, minBaseQual); + final double f = calculateF(pileup, refBase, minBaseQual); return calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, minBaseQual, f); } @@ -923,11 +948,10 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } } } - double f = (double) altCount / ((double) refCount + (double) altCount); - return f; + return (double) altCount / (refCount + altCount); } - private boolean isNonRef(byte refBase, PileupElement p) { + private boolean isNonRef(final byte refBase, final PileupElement p) { return p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip(); } @@ -960,8 +984,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return readsToRemove; } - private static GATKSAMRecord findReadByName(Collection reads, String name) { - for(GATKSAMRecord read : reads) { + private static GATKSAMRecord findReadByName(final Collection reads, final String name) { + for(final GATKSAMRecord read : reads) { if (name.equals(read.getReadName())) return read; } return null; @@ -1022,7 +1046,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } @Override - public Integer reduce(List callsInRegion, Integer numCalledRegions) { + public Integer reduce(final List callsInRegion, final Integer numCalledRegions) { for( final VariantContext call : callsInRegion ) { vcfWriter.add( call ); } @@ -1030,7 
+1054,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } @Override - public void onTraversalDone(Integer result) { + public void onTraversalDone(final Integer result) { // if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it // referenceConfidenceModel.close(); //TODO remove the need to call close here for debugging, the likelihood output stream should be managed @@ -1045,114 +1069,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i public List> getResourceRodBindings() { return Collections.emptyList(); } public boolean alwaysAppendDbsnpId() { return false; } - /** - * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. - * dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline. - */ - @ArgumentCollection - protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } - - /** - * If a call overlaps with a record from the provided comp track, the INFO field will be annotated - * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). - * Records that are filtered in the comp track will be ignored. - * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). - */ - @Advanced - @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) - public List> comps = Collections.emptyList(); - public List> getCompRodBindings() { return comps; } - - - - /** - * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. 
- */ - @Advanced - @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) -// protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"})); -// protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBy ruSample", "TandemRepeatAnnotator", -// "RMSMappingQuality","MappingQualityRankSumTest","FisherStrand","StrandOddsRatio","ReadPosRankSumTest","QualByDepth", "Coverage"})); - protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"})); - - /** - * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, - * so annotations will be excluded even if they are explicitly included with the other options. - */ - @Advanced - @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions"})); - - /** - * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. - */ - @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) - //protected String[] annotationGroupsToUse = { StandardAnnotation.class.getSimpleName() }; - protected String[] annotationClassesToUse = { }; - - /** - * A raw, unfiltered, highly sensitive callset in VCF format. - */ - @Output(doc="File to which variants should be written") - protected VariantContextWriter vcfWriter = null; - - /** - * Active region trimmer reference. 
- */ - @ArgumentCollection - protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer(); - - @Hidden - @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) - protected String keepRG = null; - - - - - /** - * The minimum confidence needed for a given base for it to be used in variant calling. - */ - @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) - public byte MIN_BASE_QUALTY_SCORE = 10; - - - -// PAIR-HMM-Related Goodness - -// public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE; -// public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.AGGRESSIVE; - public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.HOSTILE; - - // Parameters to control read error correction - @Hidden - @Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) - protected boolean errorCorrectReads = false; - - @Hidden - @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) - protected boolean captureAssemblyFailureBAM = false; - - @Advanced - @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) - protected boolean dontUseSoftClippedBases = false; - - @Hidden - @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) - protected boolean justDetermineActiveRegions = false; - - - - - // reference base padding size - private static final int REFERENCE_PADDING = 500; - - private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; - private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument - private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument - - - /** * High-level function that runs the assembler on the active region reads, * returning a data structure with the resulting information needed @@ -1224,14 +1140,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) { clippedRead = ReadClipper.hardClipToRegion(clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop()); if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { - //logger.info("Keeping read " + 
clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); readsToUse.add(clippedRead); } } } - // TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code. - final List downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart); if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) { @@ -1268,9 +1181,9 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i * * @param reads the list of reads to consider */ - private void cleanOverlappingReadPairs(final List reads, Set normalSampleNames) { - Map> data = splitReadsBySample(reads); - for ( String sampleName : data.keySet() ) { + private void cleanOverlappingReadPairs(final List reads, final Set normalSampleNames) { + final Map> data = splitReadsBySample(reads); + for ( final String sampleName : data.keySet() ) { final boolean isTumor = !normalSampleNames.contains(sampleName); final List perSampleReadList = data.get(sampleName); @@ -1284,16 +1197,15 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } } - public static void logReadInfo(String readName, Collection records, String message) { + public static void logReadInfo(final String readName, final Collection records, final String message) { if (readName != null) { - for (GATKSAMRecord rec : records) { + for (final GATKSAMRecord rec : records) { logReadInfo(readName, rec, message); } - } } - public static void logReadInfo(String readName, GATKSAMRecord rec, String message) { + public static void logReadInfo(final String readName, final GATKSAMRecord rec, final String message) { if (readName != null && rec != null && readName.equals(rec.getReadName())) { logger.info("Found " + rec.toString() + " - " + message); } @@ -1322,7 +1234,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return result; } - private boolean 
isReadFromNormal(GATKSAMRecord rec) { + private boolean isReadFromNormal(final GATKSAMRecord rec) { return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample()); } @@ -1338,7 +1250,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i @Advanced @Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false) protected boolean doNotRunPhysicalPhasing = false; - } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java index dbacfbe41..dfbcf9cee 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java @@ -54,9 +54,11 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2; import com.google.java.contract.Ensures; import htsjdk.samtools.util.StringUtil; import htsjdk.variant.variantcontext.*; +import org.apache.commons.collections.ListUtils; import org.apache.commons.lang.mutable.MutableDouble; import org.apache.commons.lang.mutable.MutableInt; import org.apache.log4j.Logger; +import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -78,16 +80,33 @@ import java.util.*; public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { - protected final M2ArgumentCollection MTAC; + private final M2ArgumentCollection MTAC; private final TumorPowerCalculator strandArtifactPowerCalculator; - final boolean 
REF_AND_ALT = false; - final boolean ALT_ONLY = true; + + private final String tumorSampleName; + private final String matchedNormalSampleName; + private final String DEBUG_READ_NAME; + + // {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy + private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() { + public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; } + }; private final static Logger logger = Logger.getLogger(SomaticGenotypingEngine.class); - public SomaticGenotypingEngine(final M2ArgumentCollection configuration, final SampleList samples, final GenomeLocParser genomeLocParser, final AFCalculatorProvider afCalculatorProvider, final boolean doPhysicalPhasing, final M2ArgumentCollection MTAC) { - super(configuration, samples, genomeLocParser, afCalculatorProvider, doPhysicalPhasing); + public SomaticGenotypingEngine(final M2ArgumentCollection configuration, + final SampleList samples, + final GenomeLocParser genomeLocParser, + final boolean doPhysicalPhasing, + final M2ArgumentCollection MTAC, + final String tumorSampleName, + final String matchedNormalSampleName, + final String DEBUG_READ_NAME) { + super(configuration, samples, genomeLocParser, DUMMY_AF_CALCULATOR_PROVIDER, doPhysicalPhasing); this.MTAC = MTAC; + this.tumorSampleName = tumorSampleName; + this.matchedNormalSampleName = matchedNormalSampleName; + this.DEBUG_READ_NAME = DEBUG_READ_NAME; // coverage related initialization final double errorProbability = Math.pow(10, -(MTAC.POWER_CONSTANT_QSCORE/10)); strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f); @@ -105,37 +124,22 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { * @param ref Reference bytes at active region * @param refLoc Corresponding active region genome location * @param activeRegionWindow 
Active window - * @param genomeLocParser GenomeLocParser * @param activeAllelesToGenotype Alleles to genotype - * @param emitReferenceConfidence whether we should add a <NON_REF> alternative allele to the result variation contexts. * * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes * */ -// @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - @Ensures("result != null") // TODO - can this be refactored? this is hard to follow! public CalledHaplotypes callMutations ( - final List haplotypes, - //final Map haplotypeReadMap, - final ReadLikelihoods readLikelihoods, - final Map originalNormalReadQualities, - final Map> perSampleFilteredReadList, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final RefMetaDataTracker tracker, - final List activeAllelesToGenotype, - final boolean emitReferenceConfidence, - final String tumorSampleName, - final String matchedNormalSampleName, - final RodBinding dbsnpRod, - final List> cosmicRod, - final String DEBUG_READ_NAME - - ) { - + final List haplotypes, + final ReadLikelihoods readLikelihoods, + final Map originalNormalReadQualities, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final RefMetaDataTracker tracker, + final List activeAllelesToGenotype) { // sanity check input arguments if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods); @@ -143,8 +147,6 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { if (refLoc == null || refLoc.size() != ref.length) throw new 
IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); - if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); - // Somatic Tumor/Normal Sample Handling verifySamplePresence(tumorSampleName, readLikelihoods.samples()); @@ -159,293 +161,217 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final List returnCalls = new ArrayList<>(); for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region - final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); - - if( eventsAtThisLoc.isEmpty() ) { continue; } - - // Create the event mapping object which maps the original haplotype events to the events present at just this locus - final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); - - // Sanity check the priority list for mistakes - final List priorityList = makePriorityList(eventsAtThisLoc); - - // Merge the event to find a common reference representation - - VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); - - if( mergedVC == null ) { continue; } - - if (emitReferenceConfidence) - mergedVC = addNonRefSymbolicAllele(mergedVC); - - final Map mergeMap = new LinkedHashMap<>(); - mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele - 
for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) { - mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function - } - - final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); - - if( configuration.DEBUG && logger != null ) { - if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - } - - ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); - - //LDG: do we want to do this before or after pulling out overlapping reads? - if (MTAC.isSampleContaminationPresent()) - readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination()); - - //if (!mergedVC.isBiallelic()) { - // logger.info("[UNSUPPORTED] Detected non-Biallelic VC" + mergedVC.toString()); - // continue; - //} - - // TODO: once tests are passing, refactor to use the new data structure (not the deprecated one) - // handle overlapping fragments - // TODO: CONFIRM WITH GSA IF IT IS OK TO REMOVE READS FROM THE PRALM (should be... they do it in filterPoorlyModeledReads!) - PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName)); - filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false); - MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads"); - // extend to multiple samples - - // compute tumor LOD for each alternate allele - // TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. 
getHetGenotypeLogLikelihoods will not check that for you - final PerAlleleCollection altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false); - final PerAlleleCollection tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions); - - if( configuration.DEBUG && logger != null ) { - StringBuilder outputSB = new StringBuilder("Calculated allelic fraction at " + loc + " = ["); - for (Allele allele : altAlleleFractions.getAltAlleles()){ - outputSB.append( allele + ": " + altAlleleFractions.getAlt(allele) + ", "); - } - outputSB.append("]"); - logger.info(outputSB.toString()); - } - - final PerAlleleCollection tumorLods = PerAlleleCollection.createPerAltAlleleCollection(); - for (Allele altAllele : mergedVC.getAlternateAlleles()){ - tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef()); - } - - if (configuration.DEBUG && logger != null) { - StringBuilder outputSB = new StringBuilder("Tumor LOD at " + loc + " = ["); - for (Allele altAllele : tumorLods.getAltAlleles()) { - outputSB.append( altAllele + ": " + tumorLods.getAlt(altAllele) + ", "); - } - outputSB.append("]"); - logger.info(outputSB.toString()); - } - - double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; - double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; - PerReadAlleleLikelihoodMap normalPRALM = null; - PerAlleleCollection normalLods = PerAlleleCollection.createPerAltAlleleCollection(); - - // if normal bam is available, compute normal LOD - if (hasNormal) { - normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName)); - filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true); - MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads"); - - GenomeLoc eventGenomeLoc = 
genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc); - Collection cosmicVC = tracker.getValues(cosmicRod, eventGenomeLoc); - Collection dbsnpVC = tracker.getValues(dbsnpRod, eventGenomeLoc); - // remove the effect of cosmic from dbSNP - final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty()); - - INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal - NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD; - - - // compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5 - // note normal LOD is REF over ALT, the reciprocal of the tumor LOD - final PerAlleleCollection diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection(); - for (final Allele allele : mergedVC.getAlternateAlleles()){ - diploidHetAlleleFractions.setAlt(allele, 0.5); - } - - final PerAlleleCollection normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions); - - for (final Allele altAllele : mergedVC.getAlternateAlleles()){ - normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele)); - } - - } - - int numPassingAlts = 0; - Set allelesThatPassThreshold = new HashSet<>(); - Allele alleleWithHighestTumorLOD = null; - - // TODO: use lambda - for (Allele altAllele : mergedVC.getAlternateAlleles()) { - final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD; - final boolean passesNormalLodThreshold = hasNormal ? 
normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true; - if (passesTumorLodThreshold && passesNormalLodThreshold) { - numPassingAlts++; - allelesThatPassThreshold.add(altAllele); - if (alleleWithHighestTumorLOD == null - || tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){ - alleleWithHighestTumorLOD = altAllele; - } - } - } - - final boolean emitVariant = numPassingAlts > 0; - - VariantContext call = null; - if (emitVariant) { - VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC); - // FIXME: can simply get first alternate since above we only deal with Bi-allelic sites... - int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size(); - callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); - callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD)); - - if (hasNormal) { - callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD)); - if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) { - callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME); - } - } - - // M1-style strand artifact filter - // TODO: move code to MuTect2::calculateFilters() - // skip if VC has multiple alleles - it will get filtered later anyway - if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) { - final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap(); - final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap(); - splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM); - - MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split"); - MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split"); - MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in 
reverse PRALM after PRALM is split"); - - // TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[] - // TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd. - final PerAlleleCollection alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true); - final PerAlleleCollection tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward); - - final PerAlleleCollection alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true); - final PerAlleleCollection tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse); - - if( configuration.DEBUG && logger != null ) { - StringBuilder forwardMessage = new StringBuilder("Calculated forward allelic fraction at " + loc + " = ["); - StringBuilder reverseMessage = new StringBuilder("Calculated reverse allelic fraction at " + loc + " = ["); - - for (Allele altAllele : altAlleleFractions.getAltAlleles()){ - forwardMessage.append( altAllele + ": " + alleleFractionsForward.getAlt(altAllele) + ", "); - reverseMessage.append( altAllele + ": " + alleleFractionsReverse.getAlt(altAllele) + ", "); - } - - forwardMessage.append("]"); - reverseMessage.append("]"); - - logger.info(forwardMessage.toString()); - logger.info(reverseMessage.toString()); - } - - double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef(); - double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef(); - - double tumorSBpower_fwd = 0.0; - double tumorSBpower_rev = 0.0; - try { - // Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction - tumorSBpower_fwd = 
strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); - tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); - } - catch (Throwable t) { - System.err.println("Error processing " + activeRegionWindow.getContig() + ":" + loc); - t.printStackTrace(System.err); - throw new RuntimeException(t); - } - - callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd); - callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev); - callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd); - callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev); - // TODO: add vcf INFO fields. see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); - - if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) || - (tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD)) - callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME); - } - - // TODO: this probably belongs in M2::calculateFilters() - if (numPassingAlts > 1) { - callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME); - } - - // build genotypes TODO: this part needs review and refactor - List tumorAlleles = new ArrayList<>(); - tumorAlleles.add(mergedVC.getReference()); - tumorAlleles.add(alleleWithHighestTumorLOD); - Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles) - .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD)) - .make(); // TODO: add ADs? - List genotypes = new ArrayList<>(); - genotypes.add(tumorGenotype); - - // We assume that the genotype in the normal is 0/0 - // TODO: is normal always homozygous reference? 
- List homRefAllelesforNormalGenotype = new ArrayList<>(); - homRefAllelesforNormalGenotype.addAll(Collections.nCopies(2, mergedVC.getReference())); - - // if we are calling with a normal, build the genotype for the sample to appear in vcf - int REF = 0, ALT = 1; - if (hasNormal) { - PerAlleleCollection normalCounts = getRefAltCount(mergedVC, normalPRALM, false); - final int normalRefAlleleDepth = normalCounts.getRef(); - final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD); - final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth }; - final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth); - - final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype) - .AD(normalAlleleDepths) - .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction) - .make(); - genotypes.add(normalGenotype); - } - - //only use alleles found in the tumor ( - call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make(); - - } - - // how should we be making use of _perSampleFilteredReadList_? 
- if( call != null ) { - readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList, - genomeLocParser, emitReferenceConfidence, alleleMapper, readAlleleLikelihoods, call); - - ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref); - VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false); - - if( call.getAlleles().size() != mergedVC.getAlleles().size() ) - annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); - - // maintain the set of all called haplotypes - for ( final Allele calledAllele : call.getAlleles() ) { - final List haplotypeList = alleleMapper.get(calledAllele); - if (haplotypeList == null) continue; - calledHaplotypes.addAll(haplotypeList); - } - - returnCalls.add( annotatedCall ); - } - + if( loc < activeRegionWindow.getStart() || loc > activeRegionWindow.getStop() ) { + continue; } + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); + + if( eventsAtThisLoc.isEmpty() ) { continue; } + + // Create the event mapping object which maps the original haplotype events to the events present at just this locus + final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); + + final List priorityList = makePriorityList(eventsAtThisLoc); + + VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + + if( mergedVC == null ) { continue; } + + final Map mergeMap = new LinkedHashMap<>(); + mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele + for(int iii = 0; iii < 
eventsAtThisLoc.size(); iii++) { + mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function + } + + final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); + + ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); + + //LDG: do we want to do this before or after pulling out overlapping reads? + if (MTAC.isSampleContaminationPresent()) + readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination()); + + final PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName)); + filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false); + MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads"); + // extend to multiple samples + + // compute tumor LOD for each alternate allele + // TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. 
getHetGenotypeLogLikelihoods will not check that for you + final PerAlleleCollection altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false); + final PerAlleleCollection tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions); + + final PerAlleleCollection tumorLods = PerAlleleCollection.createPerAltAlleleCollection(); + for (final Allele altAllele : mergedVC.getAlternateAlleles()){ + tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef()); + } + + double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; + double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; + PerReadAlleleLikelihoodMap normalPRALM = null; + final PerAlleleCollection normalLods = PerAlleleCollection.createPerAltAlleleCollection(); + + // if normal bam is available, compute normal LOD + if (hasNormal) { + normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName)); + filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true); + MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads"); + + final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc); + final Collection cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc); + final Collection dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc); + // remove the effect of cosmic from dbSNP + final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty()); + + INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal + NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD; + + // compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5 + // note normal LOD is REF over ALT, the reciprocal of 
the tumor LOD + final PerAlleleCollection diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection(); + for (final Allele allele : mergedVC.getAlternateAlleles()){ + diploidHetAlleleFractions.setAlt(allele, 0.5); + } + + final PerAlleleCollection normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions); + + for (final Allele altAllele : mergedVC.getAlternateAlleles()){ + normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele)); + } + } + + int numPassingAlts = 0; + final Set allelesThatPassThreshold = new HashSet<>(); + Allele alleleWithHighestTumorLOD = null; + + // TODO: use lambda + for (final Allele altAllele : mergedVC.getAlternateAlleles()) { + final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD; + final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true; + if (passesTumorLodThreshold && passesNormalLodThreshold) { + numPassingAlts++; + allelesThatPassThreshold.add(altAllele); + if (alleleWithHighestTumorLOD == null + || tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){ + alleleWithHighestTumorLOD = altAllele; + } + } + } + if (numPassingAlts == 0) { + continue; + } + + + final VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC); + // FIXME: can simply get first alternate since above we only deal with Bi-allelic sites... 
+ final int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size(); + callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); + callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD)); + + if (hasNormal) { + callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD)); + if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) { + callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME); + } + } + + // M1-style strand artifact filter + // TODO: move code to MuTect2::calculateFilters() + // skip if VC has multiple alleles - it will get filtered later anyway + if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) { + final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap(); + final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap(); + splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM); + + MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split"); + MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split"); + MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split"); + + // TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[] + // TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd. 
+ final PerAlleleCollection alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true); + final PerAlleleCollection tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward); + + final PerAlleleCollection alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true); + final PerAlleleCollection tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse); + + final double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef(); + final double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef(); + + // Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction + final double tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); + final double tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); + + + callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd); + callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev); + callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd); + callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev); + // TODO: add vcf INFO fields. 
see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); + + if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) || + (tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD)) + callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME); + } + + // TODO: this probably belongs in M2::calculateFilters() + if (numPassingAlts > 1) { + callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME); + } + + // build genotypes TODO: this part needs review and refactor + final List tumorAlleles = Arrays.asList(mergedVC.getReference(), alleleWithHighestTumorLOD); + final Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles) + .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD)) + .make(); // TODO: add ADs? + final List genotypes = new ArrayList<>(); + genotypes.add(tumorGenotype); + + // We assume that the genotype in the normal is 0/0 + // TODO: is normal always homozygous reference? 
+ final List homRefAllelesforNormalGenotype = Collections.nCopies(2, mergedVC.getReference()); + + // if we are calling with a normal, build the genotype for the sample to appear in vcf + if (hasNormal) { + final PerAlleleCollection normalCounts = getRefAltCount(mergedVC, normalPRALM, false); + final int normalRefAlleleDepth = normalCounts.getRef(); + final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD); + final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth }; + final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth); + + final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype) + .AD(normalAlleleDepths) + .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction) + .make(); + genotypes.add(normalGenotype); + } + + // only use alleles found in the tumor + final VariantContext call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make(); + + // how should we be making use of _perSampleFilteredReadList_? 
+ readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList, + genomeLocParser, false, alleleMapper, readAlleleLikelihoods, call); + + final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref); + VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false); + + if( call.getAlleles().size() != mergedVC.getAlleles().size() ) + annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); + + // maintain the set of all called haplotypes + call.getAlleles().stream().map(alleleMapper::get).filter(Objects::nonNull).forEach(calledHaplotypes::addAll); + returnCalls.add( annotatedCall ); } // TODO: understand effect of enabling this for somatic calling... - final List phasedCalls = doPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls; - return new CalledHaplotypes(phasedCalls, calledHaplotypes); - //return new CalledHaplotypes(returnCalls, calledHaplotypes); + final List outputCalls = doPhysicalPhasing ? 
phaseCalls(returnCalls, calledHaplotypes) : returnCalls; + return new CalledHaplotypes(outputCalls, calledHaplotypes); } - private void verifySamplePresence(String sampleName, List samples) { + private void verifySamplePresence(final String sampleName, final List samples) { if (!samples.contains(sampleName)) { throw new IllegalArgumentException("Unable to find sample name "+sampleName+"in sample list of " + StringUtil.join(",", samples)); } @@ -459,7 +385,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { * @param alleleFractions allele fraction(s) for alternate allele(s) * * @return genotype likelihoods for homRef and het for each alternate allele - */ + */ private PerAlleleCollection getHetGenotypeLogLikelihoods(final VariantContext mergedVC, final PerReadAlleleLikelihoodMap tumorPRALM, final Map originalNormalMQs, @@ -470,13 +396,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } final PerAlleleCollection genotypeLogLikelihoods = PerAlleleCollection.createPerRefAndAltAlleleCollection(); - for (final Allele allele : mergedVC.getAlleles()){ - genotypeLogLikelihoods.set(allele, new MutableDouble(0.0)); - } + mergedVC.getAlleles().forEach(a -> genotypeLogLikelihoods.set(a, new MutableDouble(0))); final Allele refAllele = mergedVC.getReference(); for(Map.Entry> readAlleleLikelihoodMap : tumorPRALM.getLikelihoodReadMap().entrySet()) { - Map alleleLikelihoodMap = readAlleleLikelihoodMap.getValue(); + final Map alleleLikelihoodMap = readAlleleLikelihoodMap.getValue(); if (originalNormalMQs.get(readAlleleLikelihoodMap.getKey().getReadName()) == 0) { continue; } @@ -484,9 +408,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final double readRefLogLikelihood = alleleLikelihoodMap.get(refAllele); genotypeLogLikelihoods.getRef().add(readRefLogLikelihood); - for (Allele altAllele : alleleFractions.getAltAlleles()) { - double readAltLogLikelihood = 
alleleLikelihoodMap.get(altAllele); - double adjustedReadAltLL = Math.log10( + for (final Allele altAllele : alleleFractions.getAltAlleles()) { + final double readAltLogLikelihood = alleleLikelihoodMap.get(altAllele); + final double adjustedReadAltLL = Math.log10( Math.pow(10, readRefLogLikelihood) * (1 - alleleFractions.getAlt(altAllele)) + Math.pow(10, readAltLogLikelihood) * alleleFractions.getAlt(altAllele) ); @@ -515,9 +439,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final PerAlleleCollection alleleCounts = getRefAltCount(vc, pralm, oneStrandOnly); final PerAlleleCollection alleleFractions = PerAlleleCollection.createPerAltAlleleCollection(); - int refCount = alleleCounts.getRef(); + final int refCount = alleleCounts.getRef(); for ( final Allele altAllele : vc.getAlternateAlleles() ) { - int altCount = alleleCounts.getAlt(altAllele); + final int altCount = alleleCounts.getAlt(altAllele); double alleleFraction = (double) altCount / (refCount + altCount); // weird case, but I've seen it happen in one strand cases if (refCount == 0 && altCount == refCount ) { @@ -552,16 +476,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final PerAlleleCollection alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection(); - - // initialize the allele counts to 0 - for (final Allele allele : vcAlleles) { - alleleCounts.set(allele, new MutableInt(0)); - } + vcAlleles.stream().forEach(a -> alleleCounts.set(a, new MutableInt(0))); for (final Map.Entry> readAlleleLikelihoodMap : pralm.getLikelihoodReadMap().entrySet()) { final GATKSAMRecord read = readAlleleLikelihoodMap.getKey(); final Map alleleLikelihoodMap = readAlleleLikelihoodMap.getValue(); - MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles); + final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles); if 
(read.getMappingQuality() > 0 && mostLikelyAllele.isInformative()) { alleleCounts.get(mostLikelyAllele.getMostLikelyAllele()).increment(); @@ -581,18 +501,16 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } } - private void filterPRALMForOverlappingReads(PerReadAlleleLikelihoodMap pralm, Allele ref, int location, boolean retainMismatches) { - - Map> m = pralm.getLikelihoodReadMap(); - + private void filterPRALMForOverlappingReads(final PerReadAlleleLikelihoodMap pralm, final Allele ref, final int location, final boolean retainMismatches) { + final Map> m = pralm.getLikelihoodReadMap(); // iterate through the reads, if the name has been seen before we have overlapping (potentially) fragments, so handle them - Map nameToRead = new HashMap<>(); - Set readsToKeep = new HashSet<>(); + final Map nameToRead = new HashMap<>(); + final Set readsToKeep = new HashSet<>(); - for(GATKSAMRecord rec : m.keySet()) { + for(final GATKSAMRecord rec : m.keySet()) { // if we haven't seen it... just record the name and add it to the list of reads to keep - GATKSAMRecord existing = nameToRead.get(rec.getReadName()); + final GATKSAMRecord existing = nameToRead.get(rec.getReadName()); if (existing == null) { nameToRead.put(rec.getReadName(), rec); readsToKeep.add(rec); @@ -604,11 +522,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { // TODO: CHECK IF THE READS BOTH OVERLAP THE POSITION!!!! 
if ( ReadUtils.isInsideRead(existing, location) && ReadUtils.isInsideRead(rec, location) ) { - MostLikelyAllele existingMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing)); - Allele existingAllele = existingMLA.getMostLikelyAllele(); + final MostLikelyAllele existingMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing)); + final Allele existingAllele = existingMLA.getMostLikelyAllele(); - MostLikelyAllele recMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec)); - Allele recAllele = recMLA.getMostLikelyAllele(); + final MostLikelyAllele recMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec)); + final Allele recAllele = recMLA.getMostLikelyAllele(); // if the reads disagree at this position... if (!existingAllele.equals(recAllele)) { @@ -654,7 +572,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap originalPRALM, final PerReadAlleleLikelihoodMap forwardPRALM, final PerReadAlleleLikelihoodMap reversePRALM) { - Map> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap(); + final Map> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap(); for (final GATKSAMRecord read : origReadAlleleLikelihoodMap.keySet()) { if (read.isStrandless()) continue; @@ -669,17 +587,4 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } } } - - - // Move to utility class so we can use one shared with HaplotypeCallerGenotypingEngine - private VariantContext addNonRefSymbolicAllele(final VariantContext mergedVC) { - final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC); - final List originalList = mergedVC.getAlleles(); - final List alleleList = new ArrayList<>(originalList.size() + 1); - alleleList.addAll(mergedVC.getAlleles()); - alleleList.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); 
- vcb.alleles(alleleList); - return vcb.make(); - } - } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java index 5dd37dc7c..8c09160ff 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java @@ -55,6 +55,7 @@ import org.apache.commons.math.MathException; import org.apache.commons.math.distribution.BinomialDistribution; import org.apache.commons.math.distribution.BinomialDistributionImpl; import org.apache.commons.math3.util.Pair; +import org.broadinstitute.gatk.utils.exceptions.GATKException; import java.util.Arrays; import java.util.HashMap; @@ -70,7 +71,6 @@ public class TumorPowerCalculator { private final double tumorLODThreshold; private final double contamination; private final boolean enableSmoothing; - public static int numCacheHits = 0; private final HashMap cache = new HashMap(); @@ -134,16 +134,18 @@ public class TumorPowerCalculator { * @throws MathException * */ - public double cachedPowerCalculation(final int numReads, final double alleleFraction) throws MathException { + public double cachedPowerCalculation(final int numReads, final double alleleFraction) { PowerCacheKey key = new PowerCacheKey(numReads, alleleFraction); // we first look up if power for given number of read and allele fraction has already been computed and stored in the cache. // if not we compute it and store it in teh cache. 
Double power = cache.get(key); if (power == null) { - power = calculatePower(numReads, alleleFraction); + try { + power = calculatePower(numReads, alleleFraction); + } catch (final Exception ex) { + throw new GATKException("Power calculation failed", ex); + } cache.put(key, power); - } else { - numCacheHits++; } return power; }